Spaces:

evgueni-p
/

fbmc-chronos2

Sleeping

App Files Files Community

Evgueni Poloukarov commited on 26 days ago

Commit

330e408

2 Parent(s): e1a549f b2dda0c

chore: merge with HF Space template - keep our README and requirements

Browse files

Files changed (12) hide show

.claude/settings.local.json +9 -1
.gitattributes +34 -0
Dockerfile +34 -20
README_HF_SPACE.md +115 -0
extend_dataset.py +224 -0
login.html +68 -0
on_startup.sh +4 -2
packages.txt +1 -0
process_october_features.py +388 -0
requirements_hf_space.txt +26 -0
start_server.sh +12 -13
upload_to_hf.py +158 -0

.claude/settings.local.json CHANGED Viewed

@@ -43,7 +43,15 @@
       "Bash(/c/Users/evgue/.local/bin/uv.exe pip install:*)",
       "WebFetch(domain:eepublicdownloads.blob.core.windows.net)",
       "Bash(curl:*)",
-      "WebFetch(domain:www.eex-transparency.com)"
     ],
     "deny": [],
     "ask": [],

       "Bash(/c/Users/evgue/.local/bin/uv.exe pip install:*)",
       "WebFetch(domain:eepublicdownloads.blob.core.windows.net)",
       "Bash(curl:*)",
+      "WebFetch(domain:www.eex-transparency.com)",
+      "Bash(cat:*)",
+      "Bash(scp:*)",
+      "Bash(git commit:*)",
+      "WebFetch(domain:www.claude.com)",
+      "Bash(xargs ls:*)",
+      "Bash(pgrep:*)",
+      "Bash(test:*)",
+      "WebFetch(domain:jupyter-docker-stacks.readthedocs.io)"
     ],
     "deny": [],
     "ask": [],

.gitattributes ADDED Viewed

	@@ -0,0 +1,34 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

Dockerfile CHANGED Viewed

@@ -1,7 +1,7 @@
 FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04
 ENV DEBIAN_FRONTEND=noninteractive \
-    TZ=Europe/Paris
 # Remove any third-party apt sources to avoid issues with expiring keys.
 # Install some basic utilities
@@ -24,68 +24,82 @@ RUN rm -f /etc/apt/sources.list.d/*.list && \
     build-essential \
     libsndfile-dev \
     software-properties-common \
-    && rm -rf /var/lib/apt/lists/*
 RUN add-apt-repository ppa:flexiondotorg/nvtop && \
     apt-get upgrade -y && \
     apt-get install -y --no-install-recommends nvtop
-RUN curl -sL https://deb.nodesource.com/setup_21.x | bash - && \
     apt-get install -y nodejs && \
     npm install -g configurable-http-proxy
 WORKDIR /app
 RUN adduser --disabled-password --gecos '' --shell /bin/bash user \
-    && chown -R user:user /app
 RUN echo "user ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-user
 USER user
 ENV HOME=/home/user
 RUN mkdir $HOME/.cache $HOME/.config \
-    && chmod -R 777 $HOME
 ENV CONDA_AUTO_UPDATE_CONDA=false \
     PATH=$HOME/miniconda/bin:$PATH
 RUN curl -sLo ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh \
-    && chmod +x ~/miniconda.sh \
-    && ~/miniconda.sh -b -p ~/miniconda \
-    && rm ~/miniconda.sh \
-    && conda clean -ya
 WORKDIR $HOME/app
 USER root
 RUN --mount=target=/root/packages.txt,source=packages.txt \
     apt-get update && \
     xargs -r -a /root/packages.txt apt-get install -y --no-install-recommends \
     && rm -rf /var/lib/apt/lists/*
 RUN --mount=target=/root/on_startup.sh,source=on_startup.sh,readwrite \
-    bash /root/on_startup.sh
 RUN mkdir /data && chown user:user /data
 USER user
 RUN --mount=target=requirements.txt,source=requirements.txt \
     pip install --no-cache-dir --upgrade -r requirements.txt
 COPY --chown=user . $HOME/app
 RUN chmod +x start_server.sh
 ENV PYTHONUNBUFFERED=1 \
-    GRADIO_ALLOW_FLAGGING=never \
-    GRADIO_NUM_PORTS=1 \
-    GRADIO_SERVER_NAME=0.0.0.0 \
-    GRADIO_THEME=huggingface \
-    SYSTEM=spaces \
-    SHELL=/bin/bash
 CMD ["./start_server.sh"]

 FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04
 ENV DEBIAN_FRONTEND=noninteractive \
+	TZ=Europe/Paris
 # Remove any third-party apt sources to avoid issues with expiring keys.
 # Install some basic utilities
     build-essential \
     libsndfile-dev \
     software-properties-common \
+ && rm -rf /var/lib/apt/lists/*
 RUN add-apt-repository ppa:flexiondotorg/nvtop && \
     apt-get upgrade -y && \
     apt-get install -y --no-install-recommends nvtop
+RUN curl -sL https://deb.nodesource.com/setup_21.x  | bash - && \
     apt-get install -y nodejs && \
     npm install -g configurable-http-proxy
+# Create a working directory
 WORKDIR /app
+# Create a non-root user and switch to it
 RUN adduser --disabled-password --gecos '' --shell /bin/bash user \
+ && chown -R user:user /app
 RUN echo "user ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-user
 USER user
+# All users can use /home/user as their home directory
 ENV HOME=/home/user
 RUN mkdir $HOME/.cache $HOME/.config \
+ && chmod -R 777 $HOME
+# Set up the Conda environment
 ENV CONDA_AUTO_UPDATE_CONDA=false \
     PATH=$HOME/miniconda/bin:$PATH
 RUN curl -sLo ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh \
+ && chmod +x ~/miniconda.sh \
+ && ~/miniconda.sh -b -p ~/miniconda \
+ && rm ~/miniconda.sh \
+ && conda clean -ya
 WORKDIR $HOME/app
+#######################################
+# Start root user section
+#######################################
 USER root
+# User Debian packages
+## Security warning : Potential user code executed as root (build time)
 RUN --mount=target=/root/packages.txt,source=packages.txt \
     apt-get update && \
     xargs -r -a /root/packages.txt apt-get install -y --no-install-recommends \
     && rm -rf /var/lib/apt/lists/*
 RUN --mount=target=/root/on_startup.sh,source=on_startup.sh,readwrite \
+	bash /root/on_startup.sh
 RUN mkdir /data && chown user:user /data
+#######################################
+# End root user section
+#######################################
 USER user
+# Python packages
 RUN --mount=target=requirements.txt,source=requirements.txt \
     pip install --no-cache-dir --upgrade -r requirements.txt
+# Copy the current directory contents into the container at $HOME/app setting the owner to the user
 COPY --chown=user . $HOME/app
 RUN chmod +x start_server.sh
+COPY --chown=user login.html /home/user/miniconda/lib/python3.9/site-packages/jupyter_server/templates/login.html
 ENV PYTHONUNBUFFERED=1 \
+	GRADIO_ALLOW_FLAGGING=never \
+	GRADIO_NUM_PORTS=1 \
+	GRADIO_SERVER_NAME=0.0.0.0 \
+	GRADIO_THEME=huggingface \
+	SYSTEM=spaces \
+	SHELL=/bin/bash
 CMD ["./start_server.sh"]

README_HF_SPACE.md ADDED Viewed

	@@ -0,0 +1,115 @@

+---
+title: FBMC Chronos-2 Zero-Shot Forecasting
+emoji: ⚡
+colorFrom: blue
+colorTo: green
+sdk: jupyterlab
+sdk_version: "4.0.0"
+app_file: inference_smoke_test.ipynb
+pinned: false
+license: mit
+hardware: a10g-small
+---
+# FBMC Flow-Based Market Coupling Forecasting
+Zero-shot electricity cross-border flow forecasting for 38 European FBMC borders using Amazon Chronos 2.
+## 🚀 Quick Start
+This HuggingFace Space provides interactive Jupyter notebooks for running zero-shot forecasts on GPU.
+### Available Notebooks
+1. **`inference_smoke_test.ipynb`** - Quick validation (1 border × 7 days, ~1 min)
+2. **`inference_full_14day.ipynb`** - Production forecast (38 borders × 14 days, ~5 min)
+3. **`evaluation.ipynb`** - Performance analysis vs actuals
+### How to Use
+1. Open any notebook in JupyterLab
+2. Run all cells (Cell → Run All)
+3. View results and visualizations inline
+## 📊 Dataset
+**Source**: [evgueni-p/fbmc-features-24month](https://huggingface.co/datasets/evgueni-p/fbmc-features-24month)
+- **Rows**: 17,880 hourly observations
+- **Date Range**: Oct 1, 2023 - Oct 14, 2025
+- **Features**: 2,553 engineered features
+  - Weather: 375 features (52 grid points)
+  - ENTSO-E: ~1,863 features (generation, demand, prices, outages)
+  - JAO: 276 features (CNEC binding, RAM, utilization, LTA, net positions)
+  - Temporal: 39 features (hour, day, month, etc.)
+- **Targets**: 38 FBMC cross-border flows (MW)
+## 🔬 Model
+**Amazon Chronos 2 Large** (710M parameters)
+- Pre-trained foundation model for time series
+- Zero-shot inference (no fine-tuning)
+- Multivariate forecasting with future covariates
+- Dynamic time-aware data extraction (prevents leakage)
+## ⚡ Hardware
+**GPU**: NVIDIA A10G (24GB VRAM)
+- Model inference: ~5 minutes for complete 14-day forecast
+- Recommended for production workloads
+## 📈 Performance Target
+**D+1 MAE Goal**: <150 MW per border
+This is a zero-shot baseline. Fine-tuning (Phase 2) expected to improve accuracy by 20-40%.
+## 🔐 Requirements
+Set `HF_TOKEN` in Space secrets to access the private dataset.
+## 🛠️ Technical Details
+### Feature Availability Windows
+The system implements time-aware forecasting to prevent data leakage:
+- **Full-horizon D+14** (603 features): Weather, CNEC outages, LTA
+- **Partial D+1** (12 features): Load forecasts (masked D+2-D+14)
+- **Historical only** (1,899 features): Prices, generation, demand
+### Dynamic Forecast System
+Uses `DynamicForecast` module to extract context and future covariates based on run date:
+- Context window: 512 hours (historical data)
+- Forecast horizon: 336 hours (14 days)
+- Automatic masking for partial availability
+## 📚 Documentation
+- [Project Repository](https://github.com/evgspacdmy/fbmc_chronos2)
+- [Activity Log](https://github.com/evgspacdmy/fbmc_chronos2/blob/main/doc/activity.md)
+- [Feature Engineering Details](https://github.com/evgspacdmy/fbmc_chronos2/tree/main/src/feature_engineering)
+## 🔄 Phase 2 Roadmap
+Future improvements (not included in zero-shot MVP):
+- Fine-tuning on FBMC data
+- Ensemble methods
+- Probabilistic forecasting
+- Real-time data pipeline
+- Production API
+## 👤 Author
+**Evgueni Poloukarov**
+## 📄 License
+MIT License - See LICENSE file for details
+---
+**Last Updated**: 2025-11-14
+**Version**: 1.0.0 (Zero-Shot MVP)

extend_dataset.py ADDED Viewed

	@@ -0,0 +1,224 @@

+"""Extend 24-month dataset with October 2025 features.
+Merges October feature files and appends to existing 24-month unified dataset.
+Creates extended dataset: 17,544 + 336 = 17,880 rows (Oct 2023 - Oct 14, 2025)
+Author: Claude
+Date: 2025-11-14
+"""
+from pathlib import Path
+import polars as pl
+import sys
+def merge_october_features() -> pl.DataFrame:
+    """Merge October feature files into single dataframe."""
+    print("\n" + "=" * 80)
+    print("MERGING OCTOBER FEATURES")
+    print("=" * 80)
+    processed_dir = Path("data/processed")
+    # Load October feature files
+    weather_file = processed_dir / "features_weather_october.parquet"
+    entsoe_file = processed_dir / "features_entsoe_october.parquet"
+    jao_file = processed_dir / "features_jao_october.parquet"
+    print("\nLoading October features...")
+    weather_df = pl.read_parquet(weather_file)
+    # Cast timestamp to nanosecond precision for consistency
+    weather_df = weather_df.with_columns([
+        pl.col('timestamp').dt.cast_time_unit('ns').alias('timestamp')
+    ])
+    print(f"  Weather: {weather_df.shape}")
+    entsoe_df = pl.read_parquet(entsoe_file)
+    # Ensure timestamp is nanosecond precision
+    entsoe_df = entsoe_df.with_columns([
+        pl.col('timestamp').dt.cast_time_unit('ns').alias('timestamp')
+    ])
+    print(f"  ENTSO-E: {entsoe_df.shape}")
+    # Check if JAO features exist
+    if jao_file.exists():
+        jao_df = pl.read_parquet(jao_file)
+        print(f"  JAO: {jao_df.shape}")
+    else:
+        jao_df = None
+        print(f"  JAO: Not available (will use zeros)")
+    # Merge features
+    print("\nMerging features...")
+    unified = weather_df.join(entsoe_df, on='timestamp', how='left', coalesce=True)
+    print(f"  Weather + ENTSO-E: {unified.shape}")
+    if jao_df is not None:
+        unified = unified.join(jao_df, on='timestamp', how='left', coalesce=True)
+        print(f"  + JAO: {unified.shape}")
+    print(f"\n[OK] October unified features: {unified.shape}")
+    return unified
+def extend_dataset(october_features: pl.DataFrame) -> pl.DataFrame:
+    """Append October features to 24-month dataset."""
+    print("\n" + "=" * 80)
+    print("EXTENDING 24-MONTH DATASET")
+    print("=" * 80)
+    processed_dir = Path("data/processed")
+    base_file = processed_dir / "features_unified_24month.parquet"
+    print("\nLoading 24-month dataset...")
+    base_df = pl.read_parquet(base_file)
+    print(f"  Shape: {base_df.shape}")
+    print(f"  Date range: {base_df['timestamp'].min()} to {base_df['timestamp'].max()}")
+    # Match October timestamp precision to base dataset
+    base_timestamp_dtype = base_df['timestamp'].dtype
+    october_features = october_features.with_columns([
+        pl.col('timestamp').cast(base_timestamp_dtype).alias('timestamp')
+    ])
+    print(f"  Matched timestamp precision: {base_timestamp_dtype}")
+    # Get column lists
+    base_cols = set(base_df.columns)
+    october_cols = set(october_features.columns)
+    # Find missing columns in October (JAO features likely missing)
+    missing_in_october = base_cols - october_cols
+    if missing_in_october:
+        print(f"\n  Adding {len(missing_in_october)} missing columns to October (fill with nulls)")
+        for col in missing_in_october:
+            if col != 'timestamp':
+                october_features = october_features.with_columns([
+                    pl.lit(None).cast(base_df[col].dtype).alias(col)
+                ])
+    # Ensure ALL column dtypes match exactly (not just missing ones)
+    print("\n  Matching column dtypes...")
+    dtype_fixes = []
+    for col in base_df.columns:
+        if col in october_features.columns:
+            base_dtype = base_df[col].dtype
+            october_dtype = october_features[col].dtype
+            if base_dtype != october_dtype:
+                dtype_fixes.append(col)
+                october_features = october_features.with_columns([
+                    pl.col(col).cast(base_dtype).alias(col)
+                ])
+    if dtype_fixes:
+        print(f"  Fixed {len(dtype_fixes)} dtype mismatches")
+    # Ensure column order matches
+    october_features = october_features.select(base_df.columns)
+    print("\nAppending October features...")
+    extended_df = pl.concat([base_df, october_features], how='vertical')
+    print(f"  Extended shape: {extended_df.shape}")
+    print(f"  Date range: {extended_df['timestamp'].min()} to {extended_df['timestamp'].max()}")
+    print(f"  Rows added: {len(extended_df) - len(base_df)}")
+    return extended_df
+def validate_extended_dataset(extended_df: pl.DataFrame):
+    """Validate extended dataset."""
+    print("\n" + "=" * 80)
+    print("VALIDATING EXTENDED DATASET")
+    print("=" * 80)
+    expected_rows = 17880  # 24 months + 14 days
+    expected_cols = 2553   # From metadata
+    print(f"\nShape validation:")
+    print(f"  Rows: {len(extended_df)} (expected {expected_rows})")
+    print(f"  Columns: {len(extended_df.columns)} (expected {expected_cols})")
+    # Check for duplicates
+    duplicates = extended_df.filter(pl.col('timestamp').is_duplicated())
+    print(f"\nDuplicate timestamps: {len(duplicates)}")
+    # Check for gaps (skip - Duration comparison not supported in this Polars version)
+    # Just verify continuous hourly data by checking row count matches expected
+    expected_hours = (extended_df['timestamp'].max() - extended_df['timestamp'].min()).total_seconds() / 3600 + 1
+    actual_hours = len(extended_df)
+    print(f"Time continuity: {actual_hours} hours (expected ~{int(expected_hours)})")
+    # Null counts
+    total_nulls = extended_df.null_count().sum_horizontal().to_list()[0]
+    print(f"\nTotal null values: {total_nulls}")
+    # Date range
+    date_start = extended_df['timestamp'].min()
+    date_end = extended_df['timestamp'].max()
+    print(f"\nDate range:")
+    print(f"  Start: {date_start}")
+    print(f"  End: {date_end}")
+    # Validation result
+    issues = []
+    if len(extended_df) != expected_rows:
+        issues.append(f"Row count mismatch: {len(extended_df)} != {expected_rows}")
+    if len(duplicates) > 0:
+        issues.append(f"Found {len(duplicates)} duplicate timestamps")
+    if issues:
+        print("\n[WARNING] Validation issues:")
+        for issue in issues:
+            print(f"  - {issue}")
+        return False
+    else:
+        print("\n[OK] All validation checks passed!")
+        return True
+def main():
+    """Main execution: Merge October features and extend dataset."""
+    print("\n" + "=" * 80)
+    print("DATASET EXTENSION: October 2025")
+    print("Extending 24-month dataset (17,544 -> 17,880 rows)")
+    print("=" * 80)
+    try:
+        # Merge October features
+        october_features = merge_october_features()
+        # Extend dataset
+        extended_df = extend_dataset(october_features)
+        # Validate
+        validation_passed = validate_extended_dataset(extended_df)
+        if validation_passed:
+            # Save extended dataset
+            output_file = Path("data/processed/features_unified_extended.parquet")
+            extended_df.write_parquet(output_file)
+            print("\n" + "=" * 80)
+            print("SUCCESS: Dataset extension complete!")
+            print("=" * 80)
+            print(f"\nExtended dataset saved:")
+            print(f"  File: {output_file}")
+            print(f"  Shape: {extended_df.shape}")
+            print(f"  Size: {output_file.stat().st_size / 1024 / 1024:.1f} MB")
+            print("\nNext steps:")
+            print("  1. Upload to HuggingFace Datasets")
+            print("  2. Create inference notebooks")
+            print("  3. Deploy to HF Space")
+        else:
+            print("\n[ERROR] Validation failed - please review issues")
+            sys.exit(1)
+    except Exception as e:
+        error_msg = str(e).encode('ascii', 'replace').decode('ascii')
+        print(f"\n[ERROR] Dataset extension failed: {error_msg}")
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
+if __name__ == "__main__":
+    main()

	@@ -0,0 +1,68 @@

+{% extends "page.html" %}
+{% block stylesheet %}
+{% endblock %}
+{% block site %}
+<div id="jupyter-main-app" class="container">
+    <img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" alt="Hugging Face Logo">
+    <h4>Welcome to JupyterLab</h4>
+    <h5>The default token is <span style="color:orange;">huggingface</span></h5>
+    {% if login_available %}
+    {# login_available means password-login is allowed. Show the form. #}
+    <div class="row">
+        <div class="navbar col-sm-8">
+            <div class="navbar-inner">
+                <div class="container">
+                    <div class="center-nav">
+                        <form action="{{base_url}}login?next={{next}}" method="post" class="navbar-form pull-left">
+                            {{ xsrf_form_html() | safe }}
+                            {% if token_available %}
+                            <label for="password_input"><strong>{% trans %}Jupyter token <span title="This is the secret you set up when deploying your JupyterLab space">ⓘ</span> {% endtrans
+                                    %}</strong></label>
+                            {% else %}
+                            <label for="password_input"><strong>{% trans %}Jupyter password:{% endtrans %}</strong></label>
+                            {% endif %}
+                            <input type="password" name="password" id="password_input" class="form-control">
+                            <button type="submit" class="btn btn-default" id="login_submit">{% trans %}Log in{% endtrans
+                                %}</button>
+                        </form>
+                    </div>
+                </div>
+            </div>
+        </div>
+    </div>
+    {% else %}
+    <p>{% trans %}No login available, you shouldn't be seeing this page.{% endtrans %}</p>
+    {% endif %}
+      <h5>If you don't have the credentials for this Jupyter space, <a target="_blank" href="https://huggingface.co/spaces/SpacesExamples/jupyterlab?duplicate=true">create your own.</a></h5>
+    <br>
+    <p>This template was created by <a href="https://twitter.com/camenduru" target="_blank" >camenduru</a> and <a href="https://huggingface.co/nateraw" target="_blank" >nateraw</a>, with contributions of <a href="https://huggingface.co/osanseviero" target="_blank" >osanseviero</a> and <a href="https://huggingface.co/azzr" target="_blank" >azzr</a> </p>
+    {% if message %}
+    <div class="row">
+        {% for key in message %}
+        <div class="message {{key}}">
+            {{message[key]}}
+        </div>
+        {% endfor %}
+    </div>
+    {% endif %}
+    {% if token_available %}
+    {% block token_message %}
+    {% endblock token_message %}
+    {% endif %}
+</div>
+{% endblock %}
+{% block script %}
+{% endblock %}

on_startup.sh CHANGED Viewed

@@ -1,3 +1,5 @@
 #!/bin/bash
-# Startup script - runs as root before container starts

 #!/bin/bash
+# Write some commands here that will run on root user before startup.
+# For example, to clone transformers and install it in dev mode:
+# git clone https://github.com/huggingface/transformers.git
+# cd transformers && pip install -e ".[dev]"

packages.txt CHANGED Viewed

	@@ -0,0 +1 @@


1	+ tree

process_october_features.py ADDED Viewed

	@@ -0,0 +1,388 @@

+"""Process October 2025 raw data into features for dataset extension.
+This script processes the October 2025 raw data (downloaded Nov 13) and generates
+feature files matching the 24-month dataset schema:
+- Weather features: 375 features
+- ENTSO-E features: ~1,863 features
+- JAO features: 276 features (if October data exists)
+Output files will be saved to data/processed/ with "_october" suffix.
+Author: Claude
+Date: 2025-11-14
+"""
+from pathlib import Path
+import polars as pl
+import sys
+# Add src to path for imports
+sys.path.append(str(Path(__file__).parent / "src"))
+from feature_engineering.engineer_weather_features import (
+    engineer_grid_level_features,
+    engineer_temporal_lags,
+    engineer_derived_features
+)
+from feature_engineering.engineer_entsoe_features import (
+    engineer_generation_features,
+    engineer_demand_features,
+    engineer_price_features,
+    engineer_hydro_storage_features,
+    engineer_pumped_storage_features,
+    engineer_load_forecast_features,
+    engineer_transmission_outage_features
+)
+def process_october_weather() -> pl.DataFrame:
+    """Process October weather data into 375 features."""
+    print("\n" + "=" * 80)
+    print("PROCESSING OCTOBER WEATHER DATA")
+    print("=" * 80)
+    raw_file = Path("data/raw/weather_october_2025.parquet")
+    if not raw_file.exists():
+        raise FileNotFoundError(f"Missing: {raw_file}")
+    # Load October weather data
+    weather_df = pl.read_parquet(raw_file)
+    print(f"\nLoaded weather data: {weather_df.shape}")
+    print(f"Date range: {weather_df['timestamp'].min()} to {weather_df['timestamp'].max()}")
+    # Engineer features using existing modules
+    features = engineer_grid_level_features(weather_df)
+    features = engineer_temporal_lags(features)
+    features = engineer_derived_features(features)
+    # Save to processed directory
+    output_file = Path("data/processed/features_weather_october.parquet")
+    features.write_parquet(output_file)
+    print(f"\n[OK] Weather features saved: {output_file}")
+    print(f"  Shape: {features.shape}")
+    print(f"  Features: {len(features.columns) - 1} (+ timestamp)")
+    return features
+def process_october_entsoe() -> pl.DataFrame:
+    """Process October ENTSO-E data into ~1,863 features."""
+    print("\n" + "=" * 80)
+    print("PROCESSING OCTOBER ENTSO-E DATA")
+    print("=" * 80)
+    # Check which ENTSO-E files exist
+    raw_dir = Path("data/raw")
+    processed_dir = Path("data/processed")
+    required_files = {
+        'generation': raw_dir / "entsoe_generation_october_2025.parquet",
+        'demand': raw_dir / "entsoe_demand_october_2025.parquet",
+        'prices': raw_dir / "entsoe_prices_october_2025.parquet",
+        'hydro_storage': raw_dir / "entsoe_hydro_storage_october_2025.parquet",
+        'pumped_storage': raw_dir / "entsoe_pumped_storage_october_2025.parquet",
+        'load_forecast': raw_dir / "entsoe_load_forecast_october_2025.parquet",
+        'transmission_outages': raw_dir / "entsoe_transmission_outages_october_2025.parquet"
+    }
+    # Load CNEC master list (required for transmission outage features)
+    cnec_master_path = processed_dir / "cnecs_master_176.csv"
+    if not cnec_master_path.exists():
+        raise FileNotFoundError(f"Missing CNEC master list: {cnec_master_path}")
+    cnec_master_df = pl.read_csv(cnec_master_path)
+    print(f"\nLoaded CNEC master list: {cnec_master_df.shape}")
+    # Verify all files exist
+    for name, file_path in required_files.items():
+        if not file_path.exists():
+            print(f"WARNING: Missing {name} file: {file_path}")
+    # Load all datasets
+    print("\nLoading ENTSO-E datasets...")
+    generation_df = pl.read_parquet(required_files['generation'])
+    demand_df = pl.read_parquet(required_files['demand'])
+    prices_df = pl.read_parquet(required_files['prices'])
+    hydro_storage_df = pl.read_parquet(required_files['hydro_storage'])
+    pumped_storage_df = pl.read_parquet(required_files['pumped_storage'])
+    load_forecast_df = pl.read_parquet(required_files['load_forecast'])
+    transmission_outages_df = pl.read_parquet(required_files['transmission_outages'])
+    print(f"  Generation: {generation_df.shape}")
+    print(f"  Demand: {demand_df.shape}")
+    print(f"  Prices: {prices_df.shape}")
+    print(f"  Hydro storage: {hydro_storage_df.shape}")
+    print(f"  Pumped storage: {pumped_storage_df.shape}")
+    print(f"  Load forecast: {load_forecast_df.shape}")
+    print(f"  Transmission outages: {transmission_outages_df.shape}")
+    # Engineer features for each category
+    print("\nEngineering ENTSO-E features...")
+    # Generation features (~228 features)
+    gen_features = engineer_generation_features(generation_df)
+    # Demand features (24 features)
+    demand_features = engineer_demand_features(demand_df)
+    # Price features (24 features)
+    price_features = engineer_price_features(prices_df)
+    # Hydro storage features (12 features)
+    hydro_features = engineer_hydro_storage_features(hydro_storage_df)
+    # Pumped storage features (10 features)
+    pumped_features = engineer_pumped_storage_features(pumped_storage_df)
+    # Load forecast features (12 features)
+    load_forecast_features = engineer_load_forecast_features(load_forecast_df)
+    # Transmission outage features (176 features - ALL CNECs)
+    # Create hourly range for October (Oct 1-14, 2025)
+    import datetime
+    october_start = datetime.datetime(2025, 10, 1, 0, 0)
+    october_end = datetime.datetime(2025, 10, 14, 23, 0)
+    hourly_range = pl.DataFrame({
+        'timestamp': pl.datetime_range(
+            october_start,
+            october_end,
+            interval='1h',
+            eager=True
+        )
+    })
+    transmission_features = engineer_transmission_outage_features(
+        transmission_outages_df,
+        cnec_master_df,
+        hourly_range
+    )
+    # Merge all features
+    print("\nMerging all ENTSO-E features...")
+    features = gen_features
+    # Fix timezone and precision issues - ensure all timestamps are timezone-naive and nanosecond precision
+    features = features.with_columns([
+        pl.col('timestamp').dt.replace_time_zone(None).dt.cast_time_unit('ns').alias('timestamp')
+    ])
+    for feat_df, name in [
+        (demand_features, "demand"),
+        (price_features, "prices"),
+        (hydro_features, "hydro_storage"),
+        (pumped_features, "pumped_storage"),
+        (load_forecast_features, "load_forecast"),
+        (transmission_features, "transmission_outages")
+    ]:
+        # Ensure timezone and precision consistency
+        if 'timestamp' in feat_df.columns:
+            feat_df = feat_df.with_columns([
+                pl.col('timestamp').dt.replace_time_zone(None).dt.cast_time_unit('ns').alias('timestamp')
+            ])
+        features = features.join(feat_df, on='timestamp', how='left', coalesce=True)
+        print(f"  Added {name}: {len(feat_df.columns) - 1} features")
+    # Resample to hourly (some datasets have sub-hourly data)
+    print("\nResampling to hourly...")
+    features = features.with_columns([
+        pl.col('timestamp').dt.truncate('1h').alias('timestamp')
+    ])
+    # Group by hour and take mean (for any sub-hourly values)
+    agg_exprs = [pl.col(c).mean().alias(c) for c in features.columns if c != 'timestamp']
+    features = features.group_by('timestamp').agg(agg_exprs).sort('timestamp')
+    print(f"  Resampled to {len(features)} hourly rows")
+    # Ensure complete 336-hour range (Oct 1-14) - fill missing hours with forward-fill
+    october_start = datetime.datetime(2025, 10, 1, 0, 0)
+    october_end = datetime.datetime(2025, 10, 14, 23, 0)
+    complete_range = pl.DataFrame({
+        'timestamp': pl.datetime_range(
+            october_start,
+            october_end,
+            interval='1h',
+            eager=True
+        )
+    })
+    # Cast complete_range timestamp to match features precision
+    complete_range = complete_range.with_columns([
+        pl.col('timestamp').dt.cast_time_unit('ns').alias('timestamp')
+    ])
+    # Join to complete range and forward-fill missing values
+    features = complete_range.join(features, on='timestamp', how='left')
+    # Forward-fill missing values
+    fill_exprs = []
+    for col in features.columns:
+        if col != 'timestamp':
+            fill_exprs.append(pl.col(col).forward_fill().alias(col))
+    if fill_exprs:
+        features = features.with_columns(fill_exprs)
+    missing_count = 336 - len(features.filter(pl.all_horizontal(pl.all().is_not_null())))
+    if missing_count > 0:
+        print(f"  Forward-filled {missing_count} missing hours")
+    print(f"  Final shape: {len(features)} hourly rows (Oct 1-14)")
+    # Save to processed directory
+    output_file = Path("data/processed/features_entsoe_october.parquet")
+    features.write_parquet(output_file)
+    print(f"\n[OK] ENTSO-E features saved: {output_file}")
+    print(f"  Shape: {features.shape}")
+    print(f"  Features: {len(features.columns) - 1} (+ timestamp)")
+    return features
+def process_october_jao() -> pl.DataFrame | None:
+    """Process October JAO data into 276 features (if data exists)."""
+    print("\n" + "=" * 80)
+    print("PROCESSING OCTOBER JAO DATA")
+    print("=" * 80)
+    # Check if October JAO data exists
+    raw_file = Path("data/raw/jao_october_2025.parquet")
+    if not raw_file.exists():
+        print(f"\nINFO: No October JAO data found at {raw_file}")
+        print("This is expected - JAO features may be historical only.")
+        print("Skipping JAO feature engineering for October.")
+        return None
+    # If data exists, process it
+    from feature_engineering.engineer_jao_features import (
+        engineer_jao_features_all
+    )
+    jao_df = pl.read_parquet(raw_file)
+    print(f"\nLoaded JAO data: {jao_df.shape}")
+    features = engineer_jao_features_all(jao_df)
+    # Save to processed directory
+    output_file = Path("data/processed/features_jao_october.parquet")
+    features.write_parquet(output_file)
+    print(f"\n[OK] JAO features saved: {output_file}")
+    print(f"  Shape: {features.shape}")
+    return features
+def validate_october_features():
+    """Validate October feature files match expected schema."""
+    print("\n" + "=" * 80)
+    print("VALIDATING OCTOBER FEATURES")
+    print("=" * 80)
+    # Load October feature files
+    weather_file = Path("data/processed/features_weather_october.parquet")
+    entsoe_file = Path("data/processed/features_entsoe_october.parquet")
+    jao_file = Path("data/processed/features_jao_october.parquet")
+    weather_df = pl.read_parquet(weather_file)
+    entsoe_df = pl.read_parquet(entsoe_file)
+    print(f"\nWeather features: {weather_df.shape}")
+    print(f"  Rows (expected 336): {len(weather_df)}")
+    print(f"  Features (expected 375): {len(weather_df.columns) - 1}")
+    print(f"\nENTSO-E features: {entsoe_df.shape}")
+    print(f"  Rows (expected 336): {len(entsoe_df)}")
+    print(f"  Features (expected ~1,863): {len(entsoe_df.columns) - 1}")
+    if jao_file.exists():
+        jao_df = pl.read_parquet(jao_file)
+        print(f"\nJAO features: {jao_df.shape}")
+        print(f"  Rows (expected 336): {len(jao_df)}")
+        print(f"  Features (expected 276): {len(jao_df.columns) - 1}")
+    else:
+        print("\nJAO features: Not generated (no October JAO data)")
+    # Validate row count (14 days × 24 hours = 336)
+    expected_rows = 336
+    issues = []
+    if len(weather_df) != expected_rows:
+        issues.append(f"Weather rows: {len(weather_df)} (expected {expected_rows})")
+    if len(entsoe_df) != expected_rows:
+        issues.append(f"ENTSO-E rows: {len(entsoe_df)} (expected {expected_rows})")
+    # Validate date range (Oct 1-14, 2025)
+    weather_start = weather_df['timestamp'].min()
+    weather_end = weather_df['timestamp'].max()
+    entsoe_start = entsoe_df['timestamp'].min()
+    entsoe_end = entsoe_df['timestamp'].max()
+    print(f"\nDate ranges:")
+    print(f"  Weather: {weather_start} to {weather_end}")
+    print(f"  ENTSO-E: {entsoe_start} to {entsoe_end}")
+    # Check for null values
+    weather_nulls = weather_df.null_count().sum_horizontal().to_list()[0]
+    entsoe_nulls = entsoe_df.null_count().sum_horizontal().to_list()[0]
+    print(f"\nNull value counts:")
+    print(f"  Weather: {weather_nulls} nulls")
+    print(f"  ENTSO-E: {entsoe_nulls} nulls")
+    # Report validation results
+    if issues:
+        print("\n[WARNING] Validation issues found:")
+        for issue in issues:
+            print(f"  - {issue}")
+    else:
+        print("\n[OK] All validation checks passed!")
+    return len(issues) == 0
+def main():
+    """Main execution: Process all October data."""
+    print("\n" + "=" * 80)
+    print("OCTOBER 2025 FEATURE ENGINEERING")
+    print("Processing raw data into features for dataset extension")
+    print("=" * 80)
+    try:
+        # Process each feature category
+        weather_features = process_october_weather()
+        entsoe_features = process_october_entsoe()
+        jao_features = process_october_jao()  # May return None
+        # Validate features
+        validation_passed = validate_october_features()
+        if validation_passed:
+            print("\n" + "=" * 80)
+            print("SUCCESS: October feature engineering complete!")
+            print("=" * 80)
+            print("\nGenerated files:")
+            print("  - data/processed/features_weather_october.parquet")
+            print("  - data/processed/features_entsoe_october.parquet")
+            if jao_features is not None:
+                print("  - data/processed/features_jao_october.parquet")
+            print("\nNext steps:")
+            print("  1. Merge October features into unified dataset")
+            print("  2. Append to 24-month dataset (17,544 -> 17,880 rows)")
+            print("  3. Upload extended dataset to HuggingFace")
+        else:
+            print("\n[ERROR] Validation failed - please review issues above")
+            sys.exit(1)
+    except Exception as e:
+        # Avoid Unicode errors on Windows console
+        error_msg = str(e).encode('ascii', 'replace').decode('ascii')
+        print(f"\n[ERROR] Feature engineering failed: {error_msg}")
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
+if __name__ == "__main__":
+    main()

requirements_hf_space.txt ADDED Viewed

	@@ -0,0 +1,26 @@

+# HuggingFace Space Requirements for FBMC Chronos-2 Forecasting
+# GPU-optimized dependencies for JupyterLab SDK
+# Core ML/Data
+torch>=2.0.0
+transformers>=4.35.0
+chronos-forecasting>=1.2.0
+datasets>=2.14.0
+polars>=0.19.0
+pyarrow>=13.0.0
+# HuggingFace
+huggingface-hub>=0.19.0
+# Visualization
+altair>=5.0.0
+vega-datasets
+# Jupyter
+ipykernel
+jupyter
+jupyterlab
+# Utilities
+python-dotenv
+tqdm

start_server.sh CHANGED Viewed

@@ -1,20 +1,19 @@
 #!/bin/bash
 JUPYTER_TOKEN="${JUPYTER_TOKEN:=huggingface}"
-NOTEBOOK_DIR="/home/user/app"
 jupyter labextension disable "@jupyterlab/apputils-extension:announcements"
 jupyter-lab \
---ip 0.0.0.0 \
---port 7860 \
---no-browser \
---allow-root \
---ServerApp.token="$JUPYTER_TOKEN" \
---ServerApp.tornado_settings="{'headers': {'Content-Security-Policy': 'frame-ancestors *'}}" \
---ServerApp.cookie_options="{'SameSite': 'None', 'Secure': True}" \
---ServerApp.disable_check_xsrf=True \
---LabApp.news_url=None \
---LabApp.check_for_updates_class="jupyterlab.NeverCheckForUpdate" \
---notebook-dir=$NOTEBOOK_DIR

 #!/bin/bash
 JUPYTER_TOKEN="${JUPYTER_TOKEN:=huggingface}"
+NOTEBOOK_DIR="/data"
 jupyter labextension disable "@jupyterlab/apputils-extension:announcements"
 jupyter-lab \
+    --ip 0.0.0.0 \
+    --port 7860 \
+    --no-browser \
+    --allow-root \
+    --ServerApp.token="$JUPYTER_TOKEN" \
+    --ServerApp.tornado_settings="{'headers': {'Content-Security-Policy': 'frame-ancestors *'}}" \
+    --ServerApp.cookie_options="{'SameSite': 'None', 'Secure': True}" \
+    --ServerApp.disable_check_xsrf=True \
+    --LabApp.news_url=None \
+    --LabApp.check_for_updates_class="jupyterlab.NeverCheckForUpdate" \
+    --notebook-dir=$NOTEBOOK_DIR

upload_to_hf.py ADDED Viewed

	@@ -0,0 +1,158 @@

+"""Upload extended dataset to HuggingFace Datasets.
+Uploads features_unified_extended.parquet (17,880 rows) to replace existing
+24-month dataset (17,544 rows) on HuggingFace.
+Dataset: evgueni-p/fbmc-features-24month
+New date range: Oct 1, 2023 - Oct 14, 2025
+Author: Claude
+Date: 2025-11-14
+"""
+from pathlib import Path
+import os
+from datasets import Dataset
+import polars as pl
+from huggingface_hub import login
+import sys
+# Load environment variables from .env file
+from dotenv import load_dotenv
+load_dotenv()
+def upload_extended_dataset():
+    """Upload extended dataset to HuggingFace."""
+    print("\n" + "=" * 80)
+    print("UPLOADING EXTENDED DATASET TO HUGGINGFACE")
+    print("=" * 80)
+    # Load HF token
+    hf_token = os.getenv("HF_TOKEN")
+    if not hf_token:
+        raise ValueError("HF_TOKEN environment variable not set - check .env file")
+    # Login to HuggingFace
+    print("\nAuthenticating with HuggingFace...")
+    login(token=hf_token)
+    print("  [OK] Logged in")
+    # Load extended dataset
+    extended_file = Path("data/processed/features_unified_extended.parquet")
+    if not extended_file.exists():
+        raise FileNotFoundError(f"Extended dataset not found: {extended_file}")
+    print(f"\nLoading extended dataset...")
+    df = pl.read_parquet(extended_file)
+    print(f"  Shape: {df.shape}")
+    print(f"  Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
+    print(f"  File size: {extended_file.stat().st_size / 1024 / 1024:.1f} MB")
+    # Convert to HuggingFace Dataset
+    print("\nConverting to HuggingFace Dataset format...")
+    hf_dataset = Dataset.from_polars(df)
+    print(f"  [OK] Converted: {hf_dataset}")
+    # Upload to HuggingFace
+    dataset_name = "evgueni-p/fbmc-features-24month"
+    print(f"\nUploading to HuggingFace: {dataset_name}")
+    print("  This may take a few minutes...")
+    hf_dataset.push_to_hub(
+        dataset_name,
+        token=hf_token,
+        private=False  # Make public
+    )
+    print(f"\n[OK] Dataset uploaded successfully!")
+    print(f"  URL: https://huggingface.co/datasets/{dataset_name}")
+    print(f"  Rows: {len(hf_dataset)}")
+    print(f"  Columns: {len(hf_dataset.column_names)}")
+    return dataset_name
+def verify_upload(dataset_name: str):
+    """Verify uploaded dataset by downloading and checking shape."""
+    print("\n" + "=" * 80)
+    print("VERIFYING UPLOAD")
+    print("=" * 80)
+    from datasets import load_dataset
+    hf_token = os.getenv("HF_TOKEN")
+    print(f"\nDownloading dataset from HuggingFace...")
+    print(f"  Dataset: {dataset_name}")
+    downloaded = load_dataset(
+        dataset_name,
+        split="train",
+        token=hf_token
+    )
+    print(f"\n[OK] Downloaded successfully!")
+    print(f"  Shape: {downloaded.shape}")
+    # Convert to Polars for inspection
+    df_check = pl.from_arrow(downloaded.data.table)
+    print(f"  Date range: {df_check['timestamp'].min()} to {df_check['timestamp'].max()}")
+    # Validate
+    expected_rows = 17880
+    expected_cols = 2553
+    issues = []
+    if downloaded.shape[0] != expected_rows:
+        issues.append(f"Row mismatch: {downloaded.shape[0]} != {expected_rows}")
+    if downloaded.shape[1] != expected_cols:
+        issues.append(f"Column mismatch: {downloaded.shape[1]} != {expected_cols}")
+    if issues:
+        print("\n[WARNING] Validation issues:")
+        for issue in issues:
+            print(f"  - {issue}")
+        return False
+    else:
+        print("\n[OK] Upload verified successfully!")
+        return True
+def main():
+    """Main execution: Upload and verify extended dataset."""
+    print("\n" + "=" * 80)
+    print("HUGGINGFACE DATASET UPLOAD")
+    print("Uploading extended dataset (17,880 rows)")
+    print("=" * 80)
+    try:
+        # Upload dataset
+        dataset_name = upload_extended_dataset()
+        # Verify upload
+        verification_passed = verify_upload(dataset_name)
+        if verification_passed:
+            print("\n" + "=" * 80)
+            print("SUCCESS: Dataset uploaded and verified!")
+            print("=" * 80)
+            print(f"\nDataset URL: https://huggingface.co/datasets/{dataset_name}")
+            print("\nNext steps:")
+            print("  1. Create inference notebooks (.ipynb)")
+            print("  2. Create HF Space README.md")
+            print("  3. Deploy notebooks to HF Space")
+            print("  4. Test inference on GPU")
+        else:
+            print("\n[ERROR] Verification failed")
+            sys.exit(1)
+    except Exception as e:
+        error_msg = str(e).encode('ascii', 'replace').decode('ascii')
+        print(f"\n[ERROR] Upload failed: {error_msg}")
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
+if __name__ == "__main__":
+    main()