Evgueni Poloukarov committed
Commit 330e408 · 2 Parent(s): e1a549f b2dda0c

chore: merge with HF Space template - keep our README and requirements
.claude/settings.local.json CHANGED
@@ -43,7 +43,15 @@
    "Bash(/c/Users/evgue/.local/bin/uv.exe pip install:*)",
    "WebFetch(domain:eepublicdownloads.blob.core.windows.net)",
    "Bash(curl:*)",
-   "WebFetch(domain:www.eex-transparency.com)"
+   "WebFetch(domain:www.eex-transparency.com)",
+   "Bash(cat:*)",
+   "Bash(scp:*)",
+   "Bash(git commit:*)",
+   "WebFetch(domain:www.claude.com)",
+   "Bash(xargs ls:*)",
+   "Bash(pgrep:*)",
+   "Bash(test:*)",
+   "WebFetch(domain:jupyter-docker-stacks.readthedocs.io)"
  ],
  "deny": [],
  "ask": [],
.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -1,7 +1,7 @@
1
  FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04
2
 
3
  ENV DEBIAN_FRONTEND=noninteractive \
4
- TZ=Europe/Paris
5
 
6
  # Remove any third-party apt sources to avoid issues with expiring keys.
7
  # Install some basic utilities
@@ -24,68 +24,82 @@ RUN rm -f /etc/apt/sources.list.d/*.list && \
24
  build-essential \
25
  libsndfile-dev \
26
  software-properties-common \
27
- && rm -rf /var/lib/apt/lists/*
28
 
29
  RUN add-apt-repository ppa:flexiondotorg/nvtop && \
30
  apt-get upgrade -y && \
31
  apt-get install -y --no-install-recommends nvtop
32
 
33
- RUN curl -sL https://deb.nodesource.com/setup_21.x | bash - && \
34
  apt-get install -y nodejs && \
35
  npm install -g configurable-http-proxy
36
 
 
37
  WORKDIR /app
38
 
 
39
  RUN adduser --disabled-password --gecos '' --shell /bin/bash user \
40
- && chown -R user:user /app
41
-
42
  RUN echo "user ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-user
43
-
44
  USER user
45
 
 
46
  ENV HOME=/home/user
47
-
48
  RUN mkdir $HOME/.cache $HOME/.config \
49
- && chmod -R 777 $HOME
50
 
 
51
  ENV CONDA_AUTO_UPDATE_CONDA=false \
52
  PATH=$HOME/miniconda/bin:$PATH
53
-
54
  RUN curl -sLo ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh \
55
- && chmod +x ~/miniconda.sh \
56
- && ~/miniconda.sh -b -p ~/miniconda \
57
- && rm ~/miniconda.sh \
58
- && conda clean -ya
59
 
60
  WORKDIR $HOME/app
61
 
 
 
 
 
62
  USER root
63
 
 
 
64
  RUN --mount=target=/root/packages.txt,source=packages.txt \
65
  apt-get update && \
66
  xargs -r -a /root/packages.txt apt-get install -y --no-install-recommends \
67
  && rm -rf /var/lib/apt/lists/*
68
 
69
  RUN --mount=target=/root/on_startup.sh,source=on_startup.sh,readwrite \
70
- bash /root/on_startup.sh
71
 
72
  RUN mkdir /data && chown user:user /data
73
 
 
 
 
 
74
  USER user
75
 
 
76
  RUN --mount=target=requirements.txt,source=requirements.txt \
77
  pip install --no-cache-dir --upgrade -r requirements.txt
78
 
 
79
  COPY --chown=user . $HOME/app
80
 
81
  RUN chmod +x start_server.sh
82
 
 
 
83
  ENV PYTHONUNBUFFERED=1 \
84
- GRADIO_ALLOW_FLAGGING=never \
85
- GRADIO_NUM_PORTS=1 \
86
- GRADIO_SERVER_NAME=0.0.0.0 \
87
- GRADIO_THEME=huggingface \
88
- SYSTEM=spaces \
89
- SHELL=/bin/bash
90
 
91
  CMD ["./start_server.sh"]
 
1
  FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04
2
 
3
  ENV DEBIAN_FRONTEND=noninteractive \
4
+ TZ=Europe/Paris
5
 
6
  # Remove any third-party apt sources to avoid issues with expiring keys.
7
  # Install some basic utilities
 
24
  build-essential \
25
  libsndfile-dev \
26
  software-properties-common \
27
+ && rm -rf /var/lib/apt/lists/*
28
 
29
  RUN add-apt-repository ppa:flexiondotorg/nvtop && \
30
  apt-get upgrade -y && \
31
  apt-get install -y --no-install-recommends nvtop
32
 
33
+ RUN curl -sL https://deb.nodesource.com/setup_21.x | bash - && \
34
  apt-get install -y nodejs && \
35
  npm install -g configurable-http-proxy
36
 
37
+ # Create a working directory
38
  WORKDIR /app
39
 
40
+ # Create a non-root user and switch to it
41
  RUN adduser --disabled-password --gecos '' --shell /bin/bash user \
42
+ && chown -R user:user /app
 
43
  RUN echo "user ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-user
 
44
  USER user
45
 
46
+ # All users can use /home/user as their home directory
47
  ENV HOME=/home/user
 
48
  RUN mkdir $HOME/.cache $HOME/.config \
49
+ && chmod -R 777 $HOME
50
 
51
+ # Set up the Conda environment
52
  ENV CONDA_AUTO_UPDATE_CONDA=false \
53
  PATH=$HOME/miniconda/bin:$PATH
 
54
  RUN curl -sLo ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh \
55
+ && chmod +x ~/miniconda.sh \
56
+ && ~/miniconda.sh -b -p ~/miniconda \
57
+ && rm ~/miniconda.sh \
58
+ && conda clean -ya
59
 
60
  WORKDIR $HOME/app
61
 
62
+ #######################################
63
+ # Start root user section
64
+ #######################################
65
+
66
  USER root
67
 
68
+ # User Debian packages
69
+ ## Security warning : Potential user code executed as root (build time)
70
  RUN --mount=target=/root/packages.txt,source=packages.txt \
71
  apt-get update && \
72
  xargs -r -a /root/packages.txt apt-get install -y --no-install-recommends \
73
  && rm -rf /var/lib/apt/lists/*
74
 
75
  RUN --mount=target=/root/on_startup.sh,source=on_startup.sh,readwrite \
76
+ bash /root/on_startup.sh
77
 
78
  RUN mkdir /data && chown user:user /data
79
 
80
+ #######################################
81
+ # End root user section
82
+ #######################################
83
+
84
  USER user
85
 
86
+ # Python packages
87
  RUN --mount=target=requirements.txt,source=requirements.txt \
88
  pip install --no-cache-dir --upgrade -r requirements.txt
89
 
90
+ # Copy the current directory contents into the container at $HOME/app setting the owner to the user
91
  COPY --chown=user . $HOME/app
92
 
93
  RUN chmod +x start_server.sh
94
 
95
+ COPY --chown=user login.html /home/user/miniconda/lib/python3.9/site-packages/jupyter_server/templates/login.html
96
+
97
  ENV PYTHONUNBUFFERED=1 \
98
+ GRADIO_ALLOW_FLAGGING=never \
99
+ GRADIO_NUM_PORTS=1 \
100
+ GRADIO_SERVER_NAME=0.0.0.0 \
101
+ GRADIO_THEME=huggingface \
102
+ SYSTEM=spaces \
103
+ SHELL=/bin/bash
104
 
105
  CMD ["./start_server.sh"]
README_HF_SPACE.md ADDED
@@ -0,0 +1,115 @@
+ ---
+ title: FBMC Chronos-2 Zero-Shot Forecasting
+ emoji: ⚡
+ colorFrom: blue
+ colorTo: green
+ sdk: jupyterlab
+ sdk_version: "4.0.0"
+ app_file: inference_smoke_test.ipynb
+ pinned: false
+ license: mit
+ hardware: a10g-small
+ ---
+
+ # FBMC Flow-Based Market Coupling Forecasting
+
+ Zero-shot electricity cross-border flow forecasting for 38 European FBMC borders using Amazon Chronos 2.
+
+ ## 🚀 Quick Start
+
+ This HuggingFace Space provides interactive Jupyter notebooks for running zero-shot forecasts on GPU.
+
+ ### Available Notebooks
+
+ 1. **`inference_smoke_test.ipynb`** - Quick validation (1 border × 7 days, ~1 min)
+ 2. **`inference_full_14day.ipynb`** - Production forecast (38 borders × 14 days, ~5 min)
+ 3. **`evaluation.ipynb`** - Performance analysis vs actuals
+
+ ### How to Use
+
+ 1. Open any notebook in JupyterLab
+ 2. Run all cells (Cell → Run All)
+ 3. View results and visualizations inline
+
+ ## 📊 Dataset
+
+ **Source**: [evgueni-p/fbmc-features-24month](https://huggingface.co/datasets/evgueni-p/fbmc-features-24month)
+
+ - **Rows**: 17,880 hourly observations
+ - **Date Range**: Oct 1, 2023 - Oct 14, 2025
+ - **Features**: 2,553 engineered features
+   - Weather: 375 features (52 grid points)
+   - ENTSO-E: ~1,863 features (generation, demand, prices, outages)
+   - JAO: 276 features (CNEC binding, RAM, utilization, LTA, net positions)
+   - Temporal: 39 features (hour, day, month, etc.)
+
+ - **Targets**: 38 FBMC cross-border flows (MW)
+
+ ## 🔬 Model
+
+ **Amazon Chronos 2 Large** (710M parameters)
+ - Pre-trained foundation model for time series
+ - Zero-shot inference (no fine-tuning)
+ - Multivariate forecasting with future covariates
+ - Dynamic time-aware data extraction (prevents leakage)
+
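The covariate-aware Chronos-2 interface used by the notebooks is not part of this commit, so the following is only a minimal zero-shot sketch built on the base `chronos-forecasting` pipeline pinned in `requirements_hf_space.txt`. The checkpoint name, the random context series, and the 64-hour horizon are placeholder assumptions; the multivariate, future-covariate path may look different.

```python
# Minimal zero-shot sketch with the base chronos-forecasting API (an assumption;
# the notebooks' covariate-aware Chronos-2 interface may differ).
import torch
from chronos import ChronosPipeline

pipeline = ChronosPipeline.from_pretrained(
    "amazon/chronos-t5-large",   # placeholder checkpoint name
    device_map="cuda",           # A10G GPU in the Space
    torch_dtype=torch.bfloat16,
)

context = torch.randn(512)                                   # stand-in for a 512-hour flow series
forecast = pipeline.predict(context, prediction_length=64)   # [series, samples, horizon]
point_forecast = forecast[0].median(dim=0).values            # per-hour point forecast
print(point_forecast.shape)
```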
+ ## ⚡ Hardware
+
+ **GPU**: NVIDIA A10G (24GB VRAM)
+ - Model inference: ~5 minutes for complete 14-day forecast
+ - Recommended for production workloads
+
+ ## 📈 Performance Target
+
+ **D+1 MAE Goal**: <150 MW per border
+
+ This is a zero-shot baseline. Fine-tuning (Phase 2) is expected to improve accuracy by 20-40%.
+
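As a quick illustration, the D+1 check is just the mean absolute error over the first 24 forecast hours, grouped by border. The column names below (`border`, `horizon_hour`, `y_true`, `y_pred`) are hypothetical; the evaluation notebook may organise its results differently.

```python
# Hedged sketch of the D+1 MAE target check; column names are hypothetical.
import polars as pl

def d1_mae_per_border(results: pl.DataFrame) -> pl.DataFrame:
    """MAE over the first 24 forecast hours, per border (target: < 150 MW)."""
    return (
        results.filter(pl.col("horizon_hour") < 24)
        .group_by("border")
        .agg((pl.col("y_true") - pl.col("y_pred")).abs().mean().alias("mae_mw"))
        .sort("mae_mw")
    )
```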
+ ## 🔐 Requirements
+
+ Set `HF_TOKEN` in Space secrets to access the private dataset.
+
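Inside a notebook, a minimal loading sketch mirrors the verification step in `upload_to_hf.py`: read the token from the environment and pass it to `datasets.load_dataset`.

```python
# Load the feature dataset inside the Space; HF_TOKEN comes from the Space secrets.
import os
import polars as pl
from datasets import load_dataset

ds = load_dataset(
    "evgueni-p/fbmc-features-24month",
    split="train",
    token=os.getenv("HF_TOKEN"),
)
df = pl.from_arrow(ds.data.table)   # expected: 17,880 rows x 2,553 columns
print(df["timestamp"].min(), df["timestamp"].max())
```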
+ ## 🛠️ Technical Details
+
+ ### Feature Availability Windows
+
+ The system implements time-aware forecasting to prevent data leakage:
+
+ - **Full-horizon D+14** (603 features): Weather, CNEC outages, LTA
+ - **Partial D+1** (12 features): Load forecasts (masked D+2-D+14)
+ - **Historical only** (1,899 features): Prices, generation, demand
+
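A minimal sketch of the masking idea: covariates in the partial-availability group keep their values for the first 24 forecast hours and are nulled afterwards. The column names and the 24-hour cutoff helper below are illustrative assumptions; the real column-to-group assignment lives in the feature-engineering modules.

```python
# Illustrative masking of partial-availability covariates beyond D+1.
import datetime as dt
import polars as pl

PARTIAL_D1 = ["load_forecast_de", "load_forecast_fr"]   # hypothetical column names

def mask_partial_covariates(future: pl.DataFrame, run_start: dt.datetime) -> pl.DataFrame:
    """Null out partial-availability covariates after the first 24 forecast hours."""
    cutoff = run_start + dt.timedelta(hours=24)
    return future.with_columns([
        pl.when(pl.col("timestamp") < cutoff).then(pl.col(c)).otherwise(None).alias(c)
        for c in PARTIAL_D1
        if c in future.columns
    ])
```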
+ ### Dynamic Forecast System
+
+ Uses the `DynamicForecast` module to extract context and future covariates based on the run date:
+ - Context window: 512 hours (historical data)
+ - Forecast horizon: 336 hours (14 days)
+ - Automatic masking for partial availability
+
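The `DynamicForecast` implementation itself is not included in this commit; a rough sketch of the window arithmetic it describes, with hypothetical names, might look like this.

```python
# Sketch of context/future extraction around a run date (names are assumptions).
import datetime as dt
import polars as pl

CONTEXT_HOURS = 512   # historical context window
HORIZON_HOURS = 336   # 14-day forecast horizon

def split_context_and_future(df: pl.DataFrame, run_date: dt.datetime):
    """Return (context, future): the 512 h before run_date and the 336 h after it."""
    context_start = run_date - dt.timedelta(hours=CONTEXT_HOURS)
    horizon_end = run_date + dt.timedelta(hours=HORIZON_HOURS)
    context = df.filter(
        (pl.col("timestamp") >= context_start) & (pl.col("timestamp") < run_date)
    )
    future = df.filter(
        (pl.col("timestamp") >= run_date) & (pl.col("timestamp") < horizon_end)
    )
    return context, future
```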
+ ## 📚 Documentation
+
+ - [Project Repository](https://github.com/evgspacdmy/fbmc_chronos2)
+ - [Activity Log](https://github.com/evgspacdmy/fbmc_chronos2/blob/main/doc/activity.md)
+ - [Feature Engineering Details](https://github.com/evgspacdmy/fbmc_chronos2/tree/main/src/feature_engineering)
+
+ ## 🔄 Phase 2 Roadmap
+
+ Future improvements (not included in zero-shot MVP):
+ - Fine-tuning on FBMC data
+ - Ensemble methods
+ - Probabilistic forecasting
+ - Real-time data pipeline
+ - Production API
+
+ ## 👤 Author
+
+ **Evgueni Poloukarov**
+
+ ## 📄 License
+
+ MIT License - See LICENSE file for details
+
+ ---
+
+ **Last Updated**: 2025-11-14
+ **Version**: 1.0.0 (Zero-Shot MVP)
extend_dataset.py ADDED
@@ -0,0 +1,224 @@
1
+ """Extend 24-month dataset with October 2025 features.
2
+
3
+ Merges October feature files and appends to existing 24-month unified dataset.
4
+ Creates extended dataset: 17,544 + 336 = 17,880 rows (Oct 2023 - Oct 14, 2025)
5
+
6
+ Author: Claude
7
+ Date: 2025-11-14
8
+ """
9
+ from pathlib import Path
10
+ import polars as pl
11
+ import sys
12
+
13
+
14
+ def merge_october_features() -> pl.DataFrame:
15
+ """Merge October feature files into single dataframe."""
16
+ print("\n" + "=" * 80)
17
+ print("MERGING OCTOBER FEATURES")
18
+ print("=" * 80)
19
+
20
+ processed_dir = Path("data/processed")
21
+
22
+ # Load October feature files
23
+ weather_file = processed_dir / "features_weather_october.parquet"
24
+ entsoe_file = processed_dir / "features_entsoe_october.parquet"
25
+ jao_file = processed_dir / "features_jao_october.parquet"
26
+
27
+ print("\nLoading October features...")
28
+ weather_df = pl.read_parquet(weather_file)
29
+ # Cast timestamp to nanosecond precision for consistency
30
+ weather_df = weather_df.with_columns([
31
+ pl.col('timestamp').dt.cast_time_unit('ns').alias('timestamp')
32
+ ])
33
+ print(f" Weather: {weather_df.shape}")
34
+
35
+ entsoe_df = pl.read_parquet(entsoe_file)
36
+ # Ensure timestamp is nanosecond precision
37
+ entsoe_df = entsoe_df.with_columns([
38
+ pl.col('timestamp').dt.cast_time_unit('ns').alias('timestamp')
39
+ ])
40
+ print(f" ENTSO-E: {entsoe_df.shape}")
41
+
42
+ # Check if JAO features exist
43
+ if jao_file.exists():
44
+ jao_df = pl.read_parquet(jao_file)
45
+ print(f" JAO: {jao_df.shape}")
46
+ else:
47
+ jao_df = None
48
+ print(f" JAO: Not available (will use zeros)")
49
+
50
+ # Merge features
51
+ print("\nMerging features...")
52
+ unified = weather_df.join(entsoe_df, on='timestamp', how='left', coalesce=True)
53
+ print(f" Weather + ENTSO-E: {unified.shape}")
54
+
55
+ if jao_df is not None:
56
+ unified = unified.join(jao_df, on='timestamp', how='left', coalesce=True)
57
+ print(f" + JAO: {unified.shape}")
58
+
59
+ print(f"\n[OK] October unified features: {unified.shape}")
60
+ return unified
61
+
62
+
63
+ def extend_dataset(october_features: pl.DataFrame) -> pl.DataFrame:
64
+ """Append October features to 24-month dataset."""
65
+ print("\n" + "=" * 80)
66
+ print("EXTENDING 24-MONTH DATASET")
67
+ print("=" * 80)
68
+
69
+ processed_dir = Path("data/processed")
70
+ base_file = processed_dir / "features_unified_24month.parquet"
71
+
72
+ print("\nLoading 24-month dataset...")
73
+ base_df = pl.read_parquet(base_file)
74
+ print(f" Shape: {base_df.shape}")
75
+ print(f" Date range: {base_df['timestamp'].min()} to {base_df['timestamp'].max()}")
76
+
77
+ # Match October timestamp precision to base dataset
78
+ base_timestamp_dtype = base_df['timestamp'].dtype
79
+ october_features = october_features.with_columns([
80
+ pl.col('timestamp').cast(base_timestamp_dtype).alias('timestamp')
81
+ ])
82
+ print(f" Matched timestamp precision: {base_timestamp_dtype}")
83
+
84
+ # Get column lists
85
+ base_cols = set(base_df.columns)
86
+ october_cols = set(october_features.columns)
87
+
88
+ # Find missing columns in October (JAO features likely missing)
89
+ missing_in_october = base_cols - october_cols
90
+ if missing_in_october:
91
+ print(f"\n Adding {len(missing_in_october)} missing columns to October (fill with nulls)")
92
+ for col in missing_in_october:
93
+ if col != 'timestamp':
94
+ october_features = october_features.with_columns([
95
+ pl.lit(None).cast(base_df[col].dtype).alias(col)
96
+ ])
97
+
98
+ # Ensure ALL column dtypes match exactly (not just missing ones)
99
+ print("\n Matching column dtypes...")
100
+ dtype_fixes = []
101
+ for col in base_df.columns:
102
+ if col in october_features.columns:
103
+ base_dtype = base_df[col].dtype
104
+ october_dtype = october_features[col].dtype
105
+ if base_dtype != october_dtype:
106
+ dtype_fixes.append(col)
107
+ october_features = october_features.with_columns([
108
+ pl.col(col).cast(base_dtype).alias(col)
109
+ ])
110
+
111
+ if dtype_fixes:
112
+ print(f" Fixed {len(dtype_fixes)} dtype mismatches")
113
+
114
+ # Ensure column order matches
115
+ october_features = october_features.select(base_df.columns)
116
+
117
+ print("\nAppending October features...")
118
+ extended_df = pl.concat([base_df, october_features], how='vertical')
119
+
120
+ print(f" Extended shape: {extended_df.shape}")
121
+ print(f" Date range: {extended_df['timestamp'].min()} to {extended_df['timestamp'].max()}")
122
+ print(f" Rows added: {len(extended_df) - len(base_df)}")
123
+
124
+ return extended_df
125
+
126
+
127
+ def validate_extended_dataset(extended_df: pl.DataFrame):
128
+ """Validate extended dataset."""
129
+ print("\n" + "=" * 80)
130
+ print("VALIDATING EXTENDED DATASET")
131
+ print("=" * 80)
132
+
133
+ expected_rows = 17880 # 24 months + 14 days
134
+ expected_cols = 2553 # From metadata
135
+
136
+ print(f"\nShape validation:")
137
+ print(f" Rows: {len(extended_df)} (expected {expected_rows})")
138
+ print(f" Columns: {len(extended_df.columns)} (expected {expected_cols})")
139
+
140
+ # Check for duplicates
141
+ duplicates = extended_df.filter(pl.col('timestamp').is_duplicated())
142
+ print(f"\nDuplicate timestamps: {len(duplicates)}")
143
+
144
+ # Check for gaps (skip - Duration comparison not supported in this Polars version)
145
+ # Just verify continuous hourly data by checking row count matches expected
146
+ expected_hours = (extended_df['timestamp'].max() - extended_df['timestamp'].min()).total_seconds() / 3600 + 1
147
+ actual_hours = len(extended_df)
148
+ print(f"Time continuity: {actual_hours} hours (expected ~{int(expected_hours)})")
149
+
150
+ # Null counts
151
+ total_nulls = extended_df.null_count().sum_horizontal().to_list()[0]
152
+ print(f"\nTotal null values: {total_nulls}")
153
+
154
+ # Date range
155
+ date_start = extended_df['timestamp'].min()
156
+ date_end = extended_df['timestamp'].max()
157
+ print(f"\nDate range:")
158
+ print(f" Start: {date_start}")
159
+ print(f" End: {date_end}")
160
+
161
+ # Validation result
162
+ issues = []
163
+ if len(extended_df) != expected_rows:
164
+ issues.append(f"Row count mismatch: {len(extended_df)} != {expected_rows}")
165
+ if len(duplicates) > 0:
166
+ issues.append(f"Found {len(duplicates)} duplicate timestamps")
167
+
168
+ if issues:
169
+ print("\n[WARNING] Validation issues:")
170
+ for issue in issues:
171
+ print(f" - {issue}")
172
+ return False
173
+ else:
174
+ print("\n[OK] All validation checks passed!")
175
+ return True
176
+
177
+
178
+ def main():
179
+ """Main execution: Merge October features and extend dataset."""
180
+ print("\n" + "=" * 80)
181
+ print("DATASET EXTENSION: October 2025")
182
+ print("Extending 24-month dataset (17,544 -> 17,880 rows)")
183
+ print("=" * 80)
184
+
185
+ try:
186
+ # Merge October features
187
+ october_features = merge_october_features()
188
+
189
+ # Extend dataset
190
+ extended_df = extend_dataset(october_features)
191
+
192
+ # Validate
193
+ validation_passed = validate_extended_dataset(extended_df)
194
+
195
+ if validation_passed:
196
+ # Save extended dataset
197
+ output_file = Path("data/processed/features_unified_extended.parquet")
198
+ extended_df.write_parquet(output_file)
199
+
200
+ print("\n" + "=" * 80)
201
+ print("SUCCESS: Dataset extension complete!")
202
+ print("=" * 80)
203
+ print(f"\nExtended dataset saved:")
204
+ print(f" File: {output_file}")
205
+ print(f" Shape: {extended_df.shape}")
206
+ print(f" Size: {output_file.stat().st_size / 1024 / 1024:.1f} MB")
207
+ print("\nNext steps:")
208
+ print(" 1. Upload to HuggingFace Datasets")
209
+ print(" 2. Create inference notebooks")
210
+ print(" 3. Deploy to HF Space")
211
+ else:
212
+ print("\n[ERROR] Validation failed - please review issues")
213
+ sys.exit(1)
214
+
215
+ except Exception as e:
216
+ error_msg = str(e).encode('ascii', 'replace').decode('ascii')
217
+ print(f"\n[ERROR] Dataset extension failed: {error_msg}")
218
+ import traceback
219
+ traceback.print_exc()
220
+ sys.exit(1)
221
+
222
+
223
+ if __name__ == "__main__":
224
+ main()
login.html ADDED
@@ -0,0 +1,68 @@
1
+ {% extends "page.html" %}
2
+
3
+
4
+ {% block stylesheet %}
5
+ {% endblock %}
6
+
7
+ {% block site %}
8
+
9
+ <div id="jupyter-main-app" class="container">
10
+
11
+ <img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" alt="Hugging Face Logo">
12
+ <h4>Welcome to JupyterLab</h4>
13
+
14
+ <h5>The default token is <span style="color:orange;">huggingface</span></h5>
15
+
16
+ {% if login_available %}
17
+ {# login_available means password-login is allowed. Show the form. #}
18
+ <div class="row">
19
+ <div class="navbar col-sm-8">
20
+ <div class="navbar-inner">
21
+ <div class="container">
22
+ <div class="center-nav">
23
+ <form action="{{base_url}}login?next={{next}}" method="post" class="navbar-form pull-left">
24
+ {{ xsrf_form_html() | safe }}
25
+ {% if token_available %}
26
+ <label for="password_input"><strong>{% trans %}Jupyter token <span title="This is the secret you set up when deploying your JupyterLab space">ⓘ</span> {% endtrans
27
+ %}</strong></label>
28
+ {% else %}
29
+ <label for="password_input"><strong>{% trans %}Jupyter password:{% endtrans %}</strong></label>
30
+ {% endif %}
31
+ <input type="password" name="password" id="password_input" class="form-control">
32
+ <button type="submit" class="btn btn-default" id="login_submit">{% trans %}Log in{% endtrans
33
+ %}</button>
34
+ </form>
35
+ </div>
36
+ </div>
37
+ </div>
38
+ </div>
39
+ </div>
40
+ {% else %}
41
+ <p>{% trans %}No login available, you shouldn't be seeing this page.{% endtrans %}</p>
42
+ {% endif %}
43
+
44
+ <h5>If you don't have the credentials for this Jupyter space, <a target="_blank" href="https://huggingface.co/spaces/SpacesExamples/jupyterlab?duplicate=true">create your own.</a></h5>
45
+ <br>
46
+
47
+ <p>This template was created by <a href="https://twitter.com/camenduru" target="_blank" >camenduru</a> and <a href="https://huggingface.co/nateraw" target="_blank" >nateraw</a>, with contributions of <a href="https://huggingface.co/osanseviero" target="_blank" >osanseviero</a> and <a href="https://huggingface.co/azzr" target="_blank" >azzr</a> </p>
48
+ {% if message %}
49
+ <div class="row">
50
+ {% for key in message %}
51
+ <div class="message {{key}}">
52
+ {{message[key]}}
53
+ </div>
54
+ {% endfor %}
55
+ </div>
56
+ {% endif %}
57
+ {% if token_available %}
58
+ {% block token_message %}
59
+
60
+ {% endblock token_message %}
61
+ {% endif %}
62
+ </div>
63
+
64
+ {% endblock %}
65
+
66
+
67
+ {% block script %}
68
+ {% endblock %}
on_startup.sh CHANGED
@@ -1,3 +1,5 @@
  #!/bin/bash
-
- # Startup script - runs as root before container starts
+ # Write some commands here that will run on root user before startup.
+ # For example, to clone transformers and install it in dev mode:
+ # git clone https://github.com/huggingface/transformers.git
+ # cd transformers && pip install -e ".[dev]"
packages.txt CHANGED
@@ -0,0 +1 @@
+ tree
process_october_features.py ADDED
@@ -0,0 +1,388 @@
1
+ """Process October 2025 raw data into features for dataset extension.
2
+
3
+ This script processes the October 2025 raw data (downloaded Nov 13) and generates
4
+ feature files matching the 24-month dataset schema:
5
+ - Weather features: 375 features
6
+ - ENTSO-E features: ~1,863 features
7
+ - JAO features: 276 features (if October data exists)
8
+
9
+ Output files will be saved to data/processed/ with "_october" suffix.
10
+
11
+ Author: Claude
12
+ Date: 2025-11-14
13
+ """
14
+ from pathlib import Path
15
+ import polars as pl
16
+ import sys
17
+
18
+ # Add src to path for imports
19
+ sys.path.append(str(Path(__file__).parent / "src"))
20
+
21
+ from feature_engineering.engineer_weather_features import (
22
+ engineer_grid_level_features,
23
+ engineer_temporal_lags,
24
+ engineer_derived_features
25
+ )
26
+ from feature_engineering.engineer_entsoe_features import (
27
+ engineer_generation_features,
28
+ engineer_demand_features,
29
+ engineer_price_features,
30
+ engineer_hydro_storage_features,
31
+ engineer_pumped_storage_features,
32
+ engineer_load_forecast_features,
33
+ engineer_transmission_outage_features
34
+ )
35
+
36
+
37
+ def process_october_weather() -> pl.DataFrame:
38
+ """Process October weather data into 375 features."""
39
+ print("\n" + "=" * 80)
40
+ print("PROCESSING OCTOBER WEATHER DATA")
41
+ print("=" * 80)
42
+
43
+ raw_file = Path("data/raw/weather_october_2025.parquet")
44
+ if not raw_file.exists():
45
+ raise FileNotFoundError(f"Missing: {raw_file}")
46
+
47
+ # Load October weather data
48
+ weather_df = pl.read_parquet(raw_file)
49
+ print(f"\nLoaded weather data: {weather_df.shape}")
50
+ print(f"Date range: {weather_df['timestamp'].min()} to {weather_df['timestamp'].max()}")
51
+
52
+ # Engineer features using existing modules
53
+ features = engineer_grid_level_features(weather_df)
54
+ features = engineer_temporal_lags(features)
55
+ features = engineer_derived_features(features)
56
+
57
+ # Save to processed directory
58
+ output_file = Path("data/processed/features_weather_october.parquet")
59
+ features.write_parquet(output_file)
60
+
61
+ print(f"\n[OK] Weather features saved: {output_file}")
62
+ print(f" Shape: {features.shape}")
63
+ print(f" Features: {len(features.columns) - 1} (+ timestamp)")
64
+
65
+ return features
66
+
67
+
68
+ def process_october_entsoe() -> pl.DataFrame:
69
+ """Process October ENTSO-E data into ~1,863 features."""
70
+ print("\n" + "=" * 80)
71
+ print("PROCESSING OCTOBER ENTSO-E DATA")
72
+ print("=" * 80)
73
+
74
+ # Check which ENTSO-E files exist
75
+ raw_dir = Path("data/raw")
76
+ processed_dir = Path("data/processed")
77
+ required_files = {
78
+ 'generation': raw_dir / "entsoe_generation_october_2025.parquet",
79
+ 'demand': raw_dir / "entsoe_demand_october_2025.parquet",
80
+ 'prices': raw_dir / "entsoe_prices_october_2025.parquet",
81
+ 'hydro_storage': raw_dir / "entsoe_hydro_storage_october_2025.parquet",
82
+ 'pumped_storage': raw_dir / "entsoe_pumped_storage_october_2025.parquet",
83
+ 'load_forecast': raw_dir / "entsoe_load_forecast_october_2025.parquet",
84
+ 'transmission_outages': raw_dir / "entsoe_transmission_outages_october_2025.parquet"
85
+ }
86
+
87
+ # Load CNEC master list (required for transmission outage features)
88
+ cnec_master_path = processed_dir / "cnecs_master_176.csv"
89
+ if not cnec_master_path.exists():
90
+ raise FileNotFoundError(f"Missing CNEC master list: {cnec_master_path}")
91
+ cnec_master_df = pl.read_csv(cnec_master_path)
92
+ print(f"\nLoaded CNEC master list: {cnec_master_df.shape}")
93
+
94
+ # Verify all files exist
95
+ for name, file_path in required_files.items():
96
+ if not file_path.exists():
97
+ print(f"WARNING: Missing {name} file: {file_path}")
98
+
99
+ # Load all datasets
100
+ print("\nLoading ENTSO-E datasets...")
101
+ generation_df = pl.read_parquet(required_files['generation'])
102
+ demand_df = pl.read_parquet(required_files['demand'])
103
+ prices_df = pl.read_parquet(required_files['prices'])
104
+ hydro_storage_df = pl.read_parquet(required_files['hydro_storage'])
105
+ pumped_storage_df = pl.read_parquet(required_files['pumped_storage'])
106
+ load_forecast_df = pl.read_parquet(required_files['load_forecast'])
107
+ transmission_outages_df = pl.read_parquet(required_files['transmission_outages'])
108
+
109
+ print(f" Generation: {generation_df.shape}")
110
+ print(f" Demand: {demand_df.shape}")
111
+ print(f" Prices: {prices_df.shape}")
112
+ print(f" Hydro storage: {hydro_storage_df.shape}")
113
+ print(f" Pumped storage: {pumped_storage_df.shape}")
114
+ print(f" Load forecast: {load_forecast_df.shape}")
115
+ print(f" Transmission outages: {transmission_outages_df.shape}")
116
+
117
+ # Engineer features for each category
118
+ print("\nEngineering ENTSO-E features...")
119
+
120
+ # Generation features (~228 features)
121
+ gen_features = engineer_generation_features(generation_df)
122
+
123
+ # Demand features (24 features)
124
+ demand_features = engineer_demand_features(demand_df)
125
+
126
+ # Price features (24 features)
127
+ price_features = engineer_price_features(prices_df)
128
+
129
+ # Hydro storage features (12 features)
130
+ hydro_features = engineer_hydro_storage_features(hydro_storage_df)
131
+
132
+ # Pumped storage features (10 features)
133
+ pumped_features = engineer_pumped_storage_features(pumped_storage_df)
134
+
135
+ # Load forecast features (12 features)
136
+ load_forecast_features = engineer_load_forecast_features(load_forecast_df)
137
+
138
+ # Transmission outage features (176 features - ALL CNECs)
139
+ # Create hourly range for October (Oct 1-14, 2025)
140
+ import datetime
141
+ october_start = datetime.datetime(2025, 10, 1, 0, 0)
142
+ october_end = datetime.datetime(2025, 10, 14, 23, 0)
143
+ hourly_range = pl.DataFrame({
144
+ 'timestamp': pl.datetime_range(
145
+ october_start,
146
+ october_end,
147
+ interval='1h',
148
+ eager=True
149
+ )
150
+ })
151
+
152
+ transmission_features = engineer_transmission_outage_features(
153
+ transmission_outages_df,
154
+ cnec_master_df,
155
+ hourly_range
156
+ )
157
+
158
+ # Merge all features
159
+ print("\nMerging all ENTSO-E features...")
160
+ features = gen_features
161
+
162
+ # Fix timezone and precision issues - ensure all timestamps are timezone-naive and nanosecond precision
163
+ features = features.with_columns([
164
+ pl.col('timestamp').dt.replace_time_zone(None).dt.cast_time_unit('ns').alias('timestamp')
165
+ ])
166
+
167
+ for feat_df, name in [
168
+ (demand_features, "demand"),
169
+ (price_features, "prices"),
170
+ (hydro_features, "hydro_storage"),
171
+ (pumped_features, "pumped_storage"),
172
+ (load_forecast_features, "load_forecast"),
173
+ (transmission_features, "transmission_outages")
174
+ ]:
175
+ # Ensure timezone and precision consistency
176
+ if 'timestamp' in feat_df.columns:
177
+ feat_df = feat_df.with_columns([
178
+ pl.col('timestamp').dt.replace_time_zone(None).dt.cast_time_unit('ns').alias('timestamp')
179
+ ])
180
+
181
+ features = features.join(feat_df, on='timestamp', how='left', coalesce=True)
182
+ print(f" Added {name}: {len(feat_df.columns) - 1} features")
183
+
184
+ # Resample to hourly (some datasets have sub-hourly data)
185
+ print("\nResampling to hourly...")
186
+ features = features.with_columns([
187
+ pl.col('timestamp').dt.truncate('1h').alias('timestamp')
188
+ ])
189
+
190
+ # Group by hour and take mean (for any sub-hourly values)
191
+ agg_exprs = [pl.col(c).mean().alias(c) for c in features.columns if c != 'timestamp']
192
+ features = features.group_by('timestamp').agg(agg_exprs).sort('timestamp')
193
+
194
+ print(f" Resampled to {len(features)} hourly rows")
195
+
196
+ # Ensure complete 336-hour range (Oct 1-14) - fill missing hours with forward-fill
197
+ october_start = datetime.datetime(2025, 10, 1, 0, 0)
198
+ october_end = datetime.datetime(2025, 10, 14, 23, 0)
199
+ complete_range = pl.DataFrame({
200
+ 'timestamp': pl.datetime_range(
201
+ october_start,
202
+ october_end,
203
+ interval='1h',
204
+ eager=True
205
+ )
206
+ })
207
+
208
+ # Cast complete_range timestamp to match features precision
209
+ complete_range = complete_range.with_columns([
210
+ pl.col('timestamp').dt.cast_time_unit('ns').alias('timestamp')
211
+ ])
212
+
213
+ # Join to complete range and forward-fill missing values
214
+ features = complete_range.join(features, on='timestamp', how='left')
215
+
216
+ # Forward-fill missing values
217
+ fill_exprs = []
218
+ for col in features.columns:
219
+ if col != 'timestamp':
220
+ fill_exprs.append(pl.col(col).forward_fill().alias(col))
221
+
222
+ if fill_exprs:
223
+ features = features.with_columns(fill_exprs)
224
+
225
+ missing_count = 336 - len(features.filter(pl.all_horizontal(pl.all().is_not_null())))
226
+ if missing_count > 0:
227
+ print(f" Forward-filled {missing_count} missing hours")
228
+
229
+ print(f" Final shape: {len(features)} hourly rows (Oct 1-14)")
230
+
231
+ # Save to processed directory
232
+ output_file = Path("data/processed/features_entsoe_october.parquet")
233
+ features.write_parquet(output_file)
234
+
235
+ print(f"\n[OK] ENTSO-E features saved: {output_file}")
236
+ print(f" Shape: {features.shape}")
237
+ print(f" Features: {len(features.columns) - 1} (+ timestamp)")
238
+
239
+ return features
240
+
241
+
242
+ def process_october_jao() -> pl.DataFrame | None:
243
+ """Process October JAO data into 276 features (if data exists)."""
244
+ print("\n" + "=" * 80)
245
+ print("PROCESSING OCTOBER JAO DATA")
246
+ print("=" * 80)
247
+
248
+ # Check if October JAO data exists
249
+ raw_file = Path("data/raw/jao_october_2025.parquet")
250
+
251
+ if not raw_file.exists():
252
+ print(f"\nINFO: No October JAO data found at {raw_file}")
253
+ print("This is expected - JAO features may be historical only.")
254
+ print("Skipping JAO feature engineering for October.")
255
+ return None
256
+
257
+ # If data exists, process it
258
+ from feature_engineering.engineer_jao_features import (
259
+ engineer_jao_features_all
260
+ )
261
+
262
+ jao_df = pl.read_parquet(raw_file)
263
+ print(f"\nLoaded JAO data: {jao_df.shape}")
264
+
265
+ features = engineer_jao_features_all(jao_df)
266
+
267
+ # Save to processed directory
268
+ output_file = Path("data/processed/features_jao_october.parquet")
269
+ features.write_parquet(output_file)
270
+
271
+ print(f"\n[OK] JAO features saved: {output_file}")
272
+ print(f" Shape: {features.shape}")
273
+
274
+ return features
275
+
276
+
277
+ def validate_october_features():
278
+ """Validate October feature files match expected schema."""
279
+ print("\n" + "=" * 80)
280
+ print("VALIDATING OCTOBER FEATURES")
281
+ print("=" * 80)
282
+
283
+ # Load October feature files
284
+ weather_file = Path("data/processed/features_weather_october.parquet")
285
+ entsoe_file = Path("data/processed/features_entsoe_october.parquet")
286
+ jao_file = Path("data/processed/features_jao_october.parquet")
287
+
288
+ weather_df = pl.read_parquet(weather_file)
289
+ entsoe_df = pl.read_parquet(entsoe_file)
290
+
291
+ print(f"\nWeather features: {weather_df.shape}")
292
+ print(f" Rows (expected 336): {len(weather_df)}")
293
+ print(f" Features (expected 375): {len(weather_df.columns) - 1}")
294
+
295
+ print(f"\nENTSO-E features: {entsoe_df.shape}")
296
+ print(f" Rows (expected 336): {len(entsoe_df)}")
297
+ print(f" Features (expected ~1,863): {len(entsoe_df.columns) - 1}")
298
+
299
+ if jao_file.exists():
300
+ jao_df = pl.read_parquet(jao_file)
301
+ print(f"\nJAO features: {jao_df.shape}")
302
+ print(f" Rows (expected 336): {len(jao_df)}")
303
+ print(f" Features (expected 276): {len(jao_df.columns) - 1}")
304
+ else:
305
+ print("\nJAO features: Not generated (no October JAO data)")
306
+
307
+ # Validate row count (14 days × 24 hours = 336)
308
+ expected_rows = 336
309
+
310
+ issues = []
311
+ if len(weather_df) != expected_rows:
312
+ issues.append(f"Weather rows: {len(weather_df)} (expected {expected_rows})")
313
+ if len(entsoe_df) != expected_rows:
314
+ issues.append(f"ENTSO-E rows: {len(entsoe_df)} (expected {expected_rows})")
315
+
316
+ # Validate date range (Oct 1-14, 2025)
317
+ weather_start = weather_df['timestamp'].min()
318
+ weather_end = weather_df['timestamp'].max()
319
+ entsoe_start = entsoe_df['timestamp'].min()
320
+ entsoe_end = entsoe_df['timestamp'].max()
321
+
322
+ print(f"\nDate ranges:")
323
+ print(f" Weather: {weather_start} to {weather_end}")
324
+ print(f" ENTSO-E: {entsoe_start} to {entsoe_end}")
325
+
326
+ # Check for null values
327
+ weather_nulls = weather_df.null_count().sum_horizontal().to_list()[0]
328
+ entsoe_nulls = entsoe_df.null_count().sum_horizontal().to_list()[0]
329
+
330
+ print(f"\nNull value counts:")
331
+ print(f" Weather: {weather_nulls} nulls")
332
+ print(f" ENTSO-E: {entsoe_nulls} nulls")
333
+
334
+ # Report validation results
335
+ if issues:
336
+ print("\n[WARNING] Validation issues found:")
337
+ for issue in issues:
338
+ print(f" - {issue}")
339
+ else:
340
+ print("\n[OK] All validation checks passed!")
341
+
342
+ return len(issues) == 0
343
+
344
+
345
+ def main():
346
+ """Main execution: Process all October data."""
347
+ print("\n" + "=" * 80)
348
+ print("OCTOBER 2025 FEATURE ENGINEERING")
349
+ print("Processing raw data into features for dataset extension")
350
+ print("=" * 80)
351
+
352
+ try:
353
+ # Process each feature category
354
+ weather_features = process_october_weather()
355
+ entsoe_features = process_october_entsoe()
356
+ jao_features = process_october_jao() # May return None
357
+
358
+ # Validate features
359
+ validation_passed = validate_october_features()
360
+
361
+ if validation_passed:
362
+ print("\n" + "=" * 80)
363
+ print("SUCCESS: October feature engineering complete!")
364
+ print("=" * 80)
365
+ print("\nGenerated files:")
366
+ print(" - data/processed/features_weather_october.parquet")
367
+ print(" - data/processed/features_entsoe_october.parquet")
368
+ if jao_features is not None:
369
+ print(" - data/processed/features_jao_october.parquet")
370
+ print("\nNext steps:")
371
+ print(" 1. Merge October features into unified dataset")
372
+ print(" 2. Append to 24-month dataset (17,544 -> 17,880 rows)")
373
+ print(" 3. Upload extended dataset to HuggingFace")
374
+ else:
375
+ print("\n[ERROR] Validation failed - please review issues above")
376
+ sys.exit(1)
377
+
378
+ except Exception as e:
379
+ # Avoid Unicode errors on Windows console
380
+ error_msg = str(e).encode('ascii', 'replace').decode('ascii')
381
+ print(f"\n[ERROR] Feature engineering failed: {error_msg}")
382
+ import traceback
383
+ traceback.print_exc()
384
+ sys.exit(1)
385
+
386
+
387
+ if __name__ == "__main__":
388
+ main()
requirements_hf_space.txt ADDED
@@ -0,0 +1,26 @@
+ # HuggingFace Space Requirements for FBMC Chronos-2 Forecasting
+ # GPU-optimized dependencies for JupyterLab SDK
+
+ # Core ML/Data
+ torch>=2.0.0
+ transformers>=4.35.0
+ chronos-forecasting>=1.2.0
+ datasets>=2.14.0
+ polars>=0.19.0
+ pyarrow>=13.0.0
+
+ # HuggingFace
+ huggingface-hub>=0.19.0
+
+ # Visualization
+ altair>=5.0.0
+ vega-datasets
+
+ # Jupyter
+ ipykernel
+ jupyter
+ jupyterlab
+
+ # Utilities
+ python-dotenv
+ tqdm
start_server.sh CHANGED
@@ -1,20 +1,19 @@
  #!/bin/bash
-
  JUPYTER_TOKEN="${JUPYTER_TOKEN:=huggingface}"

- NOTEBOOK_DIR="/home/user/app"

  jupyter labextension disable "@jupyterlab/apputils-extension:announcements"

  jupyter-lab \
- --ip 0.0.0.0 \
- --port 7860 \
- --no-browser \
- --allow-root \
- --ServerApp.token="$JUPYTER_TOKEN" \
- --ServerApp.tornado_settings="{'headers': {'Content-Security-Policy': 'frame-ancestors *'}}" \
- --ServerApp.cookie_options="{'SameSite': 'None', 'Secure': True}" \
- --ServerApp.disable_check_xsrf=True \
- --LabApp.news_url=None \
- --LabApp.check_for_updates_class="jupyterlab.NeverCheckForUpdate" \
- --notebook-dir=$NOTEBOOK_DIR

  #!/bin/bash
  JUPYTER_TOKEN="${JUPYTER_TOKEN:=huggingface}"

+ NOTEBOOK_DIR="/data"

  jupyter labextension disable "@jupyterlab/apputils-extension:announcements"

  jupyter-lab \
+ --ip 0.0.0.0 \
+ --port 7860 \
+ --no-browser \
+ --allow-root \
+ --ServerApp.token="$JUPYTER_TOKEN" \
+ --ServerApp.tornado_settings="{'headers': {'Content-Security-Policy': 'frame-ancestors *'}}" \
+ --ServerApp.cookie_options="{'SameSite': 'None', 'Secure': True}" \
+ --ServerApp.disable_check_xsrf=True \
+ --LabApp.news_url=None \
+ --LabApp.check_for_updates_class="jupyterlab.NeverCheckForUpdate" \
+ --notebook-dir=$NOTEBOOK_DIR
upload_to_hf.py ADDED
@@ -0,0 +1,158 @@
1
+ """Upload extended dataset to HuggingFace Datasets.
2
+
3
+ Uploads features_unified_extended.parquet (17,880 rows) to replace existing
4
+ 24-month dataset (17,544 rows) on HuggingFace.
5
+
6
+ Dataset: evgueni-p/fbmc-features-24month
7
+ New date range: Oct 1, 2023 - Oct 14, 2025
8
+
9
+ Author: Claude
10
+ Date: 2025-11-14
11
+ """
12
+ from pathlib import Path
13
+ import os
14
+ from datasets import Dataset
15
+ import polars as pl
16
+ from huggingface_hub import login
17
+ import sys
18
+
19
+ # Load environment variables from .env file
20
+ from dotenv import load_dotenv
21
+ load_dotenv()
22
+
23
+
24
+ def upload_extended_dataset():
25
+ """Upload extended dataset to HuggingFace."""
26
+ print("\n" + "=" * 80)
27
+ print("UPLOADING EXTENDED DATASET TO HUGGINGFACE")
28
+ print("=" * 80)
29
+
30
+ # Load HF token
31
+ hf_token = os.getenv("HF_TOKEN")
32
+ if not hf_token:
33
+ raise ValueError("HF_TOKEN environment variable not set - check .env file")
34
+
35
+ # Login to HuggingFace
36
+ print("\nAuthenticating with HuggingFace...")
37
+ login(token=hf_token)
38
+ print(" [OK] Logged in")
39
+
40
+ # Load extended dataset
41
+ extended_file = Path("data/processed/features_unified_extended.parquet")
42
+ if not extended_file.exists():
43
+ raise FileNotFoundError(f"Extended dataset not found: {extended_file}")
44
+
45
+ print(f"\nLoading extended dataset...")
46
+ df = pl.read_parquet(extended_file)
47
+ print(f" Shape: {df.shape}")
48
+ print(f" Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
49
+ print(f" File size: {extended_file.stat().st_size / 1024 / 1024:.1f} MB")
50
+
51
+ # Convert to HuggingFace Dataset
52
+ print("\nConverting to HuggingFace Dataset format...")
53
+ hf_dataset = Dataset.from_polars(df)
54
+ print(f" [OK] Converted: {hf_dataset}")
55
+
56
+ # Upload to HuggingFace
57
+ dataset_name = "evgueni-p/fbmc-features-24month"
58
+ print(f"\nUploading to HuggingFace: {dataset_name}")
59
+ print(" This may take a few minutes...")
60
+
61
+ hf_dataset.push_to_hub(
62
+ dataset_name,
63
+ token=hf_token,
64
+ private=False # Make public
65
+ )
66
+
67
+ print(f"\n[OK] Dataset uploaded successfully!")
68
+ print(f" URL: https://huggingface.co/datasets/{dataset_name}")
69
+ print(f" Rows: {len(hf_dataset)}")
70
+ print(f" Columns: {len(hf_dataset.column_names)}")
71
+
72
+ return dataset_name
73
+
74
+
75
+ def verify_upload(dataset_name: str):
76
+ """Verify uploaded dataset by downloading and checking shape."""
77
+ print("\n" + "=" * 80)
78
+ print("VERIFYING UPLOAD")
79
+ print("=" * 80)
80
+
81
+ from datasets import load_dataset
82
+
83
+ hf_token = os.getenv("HF_TOKEN")
84
+
85
+ print(f"\nDownloading dataset from HuggingFace...")
86
+ print(f" Dataset: {dataset_name}")
87
+
88
+ downloaded = load_dataset(
89
+ dataset_name,
90
+ split="train",
91
+ token=hf_token
92
+ )
93
+
94
+ print(f"\n[OK] Downloaded successfully!")
95
+ print(f" Shape: {downloaded.shape}")
96
+
97
+ # Convert to Polars for inspection
98
+ df_check = pl.from_arrow(downloaded.data.table)
99
+ print(f" Date range: {df_check['timestamp'].min()} to {df_check['timestamp'].max()}")
100
+
101
+ # Validate
102
+ expected_rows = 17880
103
+ expected_cols = 2553
104
+
105
+ issues = []
106
+ if downloaded.shape[0] != expected_rows:
107
+ issues.append(f"Row mismatch: {downloaded.shape[0]} != {expected_rows}")
108
+ if downloaded.shape[1] != expected_cols:
109
+ issues.append(f"Column mismatch: {downloaded.shape[1]} != {expected_cols}")
110
+
111
+ if issues:
112
+ print("\n[WARNING] Validation issues:")
113
+ for issue in issues:
114
+ print(f" - {issue}")
115
+ return False
116
+ else:
117
+ print("\n[OK] Upload verified successfully!")
118
+ return True
119
+
120
+
121
+ def main():
122
+ """Main execution: Upload and verify extended dataset."""
123
+ print("\n" + "=" * 80)
124
+ print("HUGGINGFACE DATASET UPLOAD")
125
+ print("Uploading extended dataset (17,880 rows)")
126
+ print("=" * 80)
127
+
128
+ try:
129
+ # Upload dataset
130
+ dataset_name = upload_extended_dataset()
131
+
132
+ # Verify upload
133
+ verification_passed = verify_upload(dataset_name)
134
+
135
+ if verification_passed:
136
+ print("\n" + "=" * 80)
137
+ print("SUCCESS: Dataset uploaded and verified!")
138
+ print("=" * 80)
139
+ print(f"\nDataset URL: https://huggingface.co/datasets/{dataset_name}")
140
+ print("\nNext steps:")
141
+ print(" 1. Create inference notebooks (.ipynb)")
142
+ print(" 2. Create HF Space README.md")
143
+ print(" 3. Deploy notebooks to HF Space")
144
+ print(" 4. Test inference on GPU")
145
+ else:
146
+ print("\n[ERROR] Verification failed")
147
+ sys.exit(1)
148
+
149
+ except Exception as e:
150
+ error_msg = str(e).encode('ascii', 'replace').decode('ascii')
151
+ print(f"\n[ERROR] Upload failed: {error_msg}")
152
+ import traceback
153
+ traceback.print_exc()
154
+ sys.exit(1)
155
+
156
+
157
+ if __name__ == "__main__":
158
+ main()