Spaces:
Sleeping
Sleeping
chore: merge with HF Space template - keep our README and requirements
Browse files- .claude/settings.local.json +9 -1
- .gitattributes +34 -0
- Dockerfile +34 -20
- README_HF_SPACE.md +115 -0
- extend_dataset.py +224 -0
- login.html +68 -0
- on_startup.sh +4 -2
- packages.txt +1 -0
- process_october_features.py +388 -0
- requirements_hf_space.txt +26 -0
- start_server.sh +12 -13
- upload_to_hf.py +158 -0
.claude/settings.local.json
CHANGED
|
@@ -43,7 +43,15 @@
|
|
| 43 |
"Bash(/c/Users/evgue/.local/bin/uv.exe pip install:*)",
|
| 44 |
"WebFetch(domain:eepublicdownloads.blob.core.windows.net)",
|
| 45 |
"Bash(curl:*)",
|
| 46 |
-
"WebFetch(domain:www.eex-transparency.com)"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
],
|
| 48 |
"deny": [],
|
| 49 |
"ask": [],
|
|
|
|
| 43 |
"Bash(/c/Users/evgue/.local/bin/uv.exe pip install:*)",
|
| 44 |
"WebFetch(domain:eepublicdownloads.blob.core.windows.net)",
|
| 45 |
"Bash(curl:*)",
|
| 46 |
+
"WebFetch(domain:www.eex-transparency.com)",
|
| 47 |
+
"Bash(cat:*)",
|
| 48 |
+
"Bash(scp:*)",
|
| 49 |
+
"Bash(git commit:*)",
|
| 50 |
+
"WebFetch(domain:www.claude.com)",
|
| 51 |
+
"Bash(xargs ls:*)",
|
| 52 |
+
"Bash(pgrep:*)",
|
| 53 |
+
"Bash(test:*)",
|
| 54 |
+
"WebFetch(domain:jupyter-docker-stacks.readthedocs.io)"
|
| 55 |
],
|
| 56 |
"deny": [],
|
| 57 |
"ask": [],
|
.gitattributes
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04
|
| 2 |
|
| 3 |
ENV DEBIAN_FRONTEND=noninteractive \
|
| 4 |
-
|
| 5 |
|
| 6 |
# Remove any third-party apt sources to avoid issues with expiring keys.
|
| 7 |
# Install some basic utilities
|
|
@@ -24,68 +24,82 @@ RUN rm -f /etc/apt/sources.list.d/*.list && \
|
|
| 24 |
build-essential \
|
| 25 |
libsndfile-dev \
|
| 26 |
software-properties-common \
|
| 27 |
-
|
| 28 |
|
| 29 |
RUN add-apt-repository ppa:flexiondotorg/nvtop && \
|
| 30 |
apt-get upgrade -y && \
|
| 31 |
apt-get install -y --no-install-recommends nvtop
|
| 32 |
|
| 33 |
-
RUN curl -sL https://deb.nodesource.com/setup_21.x
|
| 34 |
apt-get install -y nodejs && \
|
| 35 |
npm install -g configurable-http-proxy
|
| 36 |
|
|
|
|
| 37 |
WORKDIR /app
|
| 38 |
|
|
|
|
| 39 |
RUN adduser --disabled-password --gecos '' --shell /bin/bash user \
|
| 40 |
-
|
| 41 |
-
|
| 42 |
RUN echo "user ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-user
|
| 43 |
-
|
| 44 |
USER user
|
| 45 |
|
|
|
|
| 46 |
ENV HOME=/home/user
|
| 47 |
-
|
| 48 |
RUN mkdir $HOME/.cache $HOME/.config \
|
| 49 |
-
|
| 50 |
|
|
|
|
| 51 |
ENV CONDA_AUTO_UPDATE_CONDA=false \
|
| 52 |
PATH=$HOME/miniconda/bin:$PATH
|
| 53 |
-
|
| 54 |
RUN curl -sLo ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh \
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
|
| 60 |
WORKDIR $HOME/app
|
| 61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
USER root
|
| 63 |
|
|
|
|
|
|
|
| 64 |
RUN --mount=target=/root/packages.txt,source=packages.txt \
|
| 65 |
apt-get update && \
|
| 66 |
xargs -r -a /root/packages.txt apt-get install -y --no-install-recommends \
|
| 67 |
&& rm -rf /var/lib/apt/lists/*
|
| 68 |
|
| 69 |
RUN --mount=target=/root/on_startup.sh,source=on_startup.sh,readwrite \
|
| 70 |
-
|
| 71 |
|
| 72 |
RUN mkdir /data && chown user:user /data
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
USER user
|
| 75 |
|
|
|
|
| 76 |
RUN --mount=target=requirements.txt,source=requirements.txt \
|
| 77 |
pip install --no-cache-dir --upgrade -r requirements.txt
|
| 78 |
|
|
|
|
| 79 |
COPY --chown=user . $HOME/app
|
| 80 |
|
| 81 |
RUN chmod +x start_server.sh
|
| 82 |
|
|
|
|
|
|
|
| 83 |
ENV PYTHONUNBUFFERED=1 \
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
|
| 91 |
CMD ["./start_server.sh"]
|
|
|
|
| 1 |
FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04
|
| 2 |
|
| 3 |
ENV DEBIAN_FRONTEND=noninteractive \
|
| 4 |
+
TZ=Europe/Paris
|
| 5 |
|
| 6 |
# Remove any third-party apt sources to avoid issues with expiring keys.
|
| 7 |
# Install some basic utilities
|
|
|
|
| 24 |
build-essential \
|
| 25 |
libsndfile-dev \
|
| 26 |
software-properties-common \
|
| 27 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 28 |
|
| 29 |
RUN add-apt-repository ppa:flexiondotorg/nvtop && \
|
| 30 |
apt-get upgrade -y && \
|
| 31 |
apt-get install -y --no-install-recommends nvtop
|
| 32 |
|
| 33 |
+
RUN curl -sL https://deb.nodesource.com/setup_21.x | bash - && \
|
| 34 |
apt-get install -y nodejs && \
|
| 35 |
npm install -g configurable-http-proxy
|
| 36 |
|
| 37 |
+
# Create a working directory
|
| 38 |
WORKDIR /app
|
| 39 |
|
| 40 |
+
# Create a non-root user and switch to it
|
| 41 |
RUN adduser --disabled-password --gecos '' --shell /bin/bash user \
|
| 42 |
+
&& chown -R user:user /app
|
|
|
|
| 43 |
RUN echo "user ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-user
|
|
|
|
| 44 |
USER user
|
| 45 |
|
| 46 |
+
# All users can use /home/user as their home directory
|
| 47 |
ENV HOME=/home/user
|
|
|
|
| 48 |
RUN mkdir $HOME/.cache $HOME/.config \
|
| 49 |
+
&& chmod -R 777 $HOME
|
| 50 |
|
| 51 |
+
# Set up the Conda environment
|
| 52 |
ENV CONDA_AUTO_UPDATE_CONDA=false \
|
| 53 |
PATH=$HOME/miniconda/bin:$PATH
|
|
|
|
| 54 |
RUN curl -sLo ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh \
|
| 55 |
+
&& chmod +x ~/miniconda.sh \
|
| 56 |
+
&& ~/miniconda.sh -b -p ~/miniconda \
|
| 57 |
+
&& rm ~/miniconda.sh \
|
| 58 |
+
&& conda clean -ya
|
| 59 |
|
| 60 |
WORKDIR $HOME/app
|
| 61 |
|
| 62 |
+
#######################################
|
| 63 |
+
# Start root user section
|
| 64 |
+
#######################################
|
| 65 |
+
|
| 66 |
USER root
|
| 67 |
|
| 68 |
+
# User Debian packages
|
| 69 |
+
## Security warning : Potential user code executed as root (build time)
|
| 70 |
RUN --mount=target=/root/packages.txt,source=packages.txt \
|
| 71 |
apt-get update && \
|
| 72 |
xargs -r -a /root/packages.txt apt-get install -y --no-install-recommends \
|
| 73 |
&& rm -rf /var/lib/apt/lists/*
|
| 74 |
|
| 75 |
RUN --mount=target=/root/on_startup.sh,source=on_startup.sh,readwrite \
|
| 76 |
+
bash /root/on_startup.sh
|
| 77 |
|
| 78 |
RUN mkdir /data && chown user:user /data
|
| 79 |
|
| 80 |
+
#######################################
|
| 81 |
+
# End root user section
|
| 82 |
+
#######################################
|
| 83 |
+
|
| 84 |
USER user
|
| 85 |
|
| 86 |
+
# Python packages
|
| 87 |
RUN --mount=target=requirements.txt,source=requirements.txt \
|
| 88 |
pip install --no-cache-dir --upgrade -r requirements.txt
|
| 89 |
|
| 90 |
+
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
| 91 |
COPY --chown=user . $HOME/app
|
| 92 |
|
| 93 |
RUN chmod +x start_server.sh
|
| 94 |
|
| 95 |
+
COPY --chown=user login.html /home/user/miniconda/lib/python3.9/site-packages/jupyter_server/templates/login.html
|
| 96 |
+
|
| 97 |
ENV PYTHONUNBUFFERED=1 \
|
| 98 |
+
GRADIO_ALLOW_FLAGGING=never \
|
| 99 |
+
GRADIO_NUM_PORTS=1 \
|
| 100 |
+
GRADIO_SERVER_NAME=0.0.0.0 \
|
| 101 |
+
GRADIO_THEME=huggingface \
|
| 102 |
+
SYSTEM=spaces \
|
| 103 |
+
SHELL=/bin/bash
|
| 104 |
|
| 105 |
CMD ["./start_server.sh"]
|
README_HF_SPACE.md
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: FBMC Chronos-2 Zero-Shot Forecasting
|
| 3 |
+
emoji: ⚡
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: jupyterlab
|
| 7 |
+
sdk_version: "4.0.0"
|
| 8 |
+
app_file: inference_smoke_test.ipynb
|
| 9 |
+
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
+
hardware: a10g-small
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# FBMC Flow-Based Market Coupling Forecasting
|
| 15 |
+
|
| 16 |
+
Zero-shot electricity cross-border flow forecasting for 38 European FBMC borders using Amazon Chronos 2.
|
| 17 |
+
|
| 18 |
+
## 🚀 Quick Start
|
| 19 |
+
|
| 20 |
+
This HuggingFace Space provides interactive Jupyter notebooks for running zero-shot forecasts on GPU.
|
| 21 |
+
|
| 22 |
+
### Available Notebooks
|
| 23 |
+
|
| 24 |
+
1. **`inference_smoke_test.ipynb`** - Quick validation (1 border × 7 days, ~1 min)
|
| 25 |
+
2. **`inference_full_14day.ipynb`** - Production forecast (38 borders × 14 days, ~5 min)
|
| 26 |
+
3. **`evaluation.ipynb`** - Performance analysis vs actuals
|
| 27 |
+
|
| 28 |
+
### How to Use
|
| 29 |
+
|
| 30 |
+
1. Open any notebook in JupyterLab
|
| 31 |
+
2. Run all cells (Cell → Run All)
|
| 32 |
+
3. View results and visualizations inline
|
| 33 |
+
|
| 34 |
+
## 📊 Dataset
|
| 35 |
+
|
| 36 |
+
**Source**: [evgueni-p/fbmc-features-24month](https://huggingface.co/datasets/evgueni-p/fbmc-features-24month)
|
| 37 |
+
|
| 38 |
+
- **Rows**: 17,880 hourly observations
|
| 39 |
+
- **Date Range**: Oct 1, 2023 - Oct 14, 2025
|
| 40 |
+
- **Features**: 2,553 engineered features
|
| 41 |
+
- Weather: 375 features (52 grid points)
|
| 42 |
+
- ENTSO-E: ~1,863 features (generation, demand, prices, outages)
|
| 43 |
+
- JAO: 276 features (CNEC binding, RAM, utilization, LTA, net positions)
|
| 44 |
+
- Temporal: 39 features (hour, day, month, etc.)
|
| 45 |
+
|
| 46 |
+
- **Targets**: 38 FBMC cross-border flows (MW)
|
| 47 |
+
|
| 48 |
+
## 🔬 Model
|
| 49 |
+
|
| 50 |
+
**Amazon Chronos 2 Large** (710M parameters)
|
| 51 |
+
- Pre-trained foundation model for time series
|
| 52 |
+
- Zero-shot inference (no fine-tuning)
|
| 53 |
+
- Multivariate forecasting with future covariates
|
| 54 |
+
- Dynamic time-aware data extraction (prevents leakage)
|
| 55 |
+
|
| 56 |
+
## ⚡ Hardware
|
| 57 |
+
|
| 58 |
+
**GPU**: NVIDIA A10G (24GB VRAM)
|
| 59 |
+
- Model inference: ~5 minutes for complete 14-day forecast
|
| 60 |
+
- Recommended for production workloads
|
| 61 |
+
|
| 62 |
+
## 📈 Performance Target
|
| 63 |
+
|
| 64 |
+
**D+1 MAE Goal**: <150 MW per border
|
| 65 |
+
|
| 66 |
+
This is a zero-shot baseline. Fine-tuning (Phase 2) expected to improve accuracy by 20-40%.
|
| 67 |
+
|
| 68 |
+
## 🔐 Requirements
|
| 69 |
+
|
| 70 |
+
Set `HF_TOKEN` in Space secrets to access the private dataset.
|
| 71 |
+
|
| 72 |
+
## 🛠️ Technical Details
|
| 73 |
+
|
| 74 |
+
### Feature Availability Windows
|
| 75 |
+
|
| 76 |
+
The system implements time-aware forecasting to prevent data leakage:
|
| 77 |
+
|
| 78 |
+
- **Full-horizon D+14** (603 features): Weather, CNEC outages, LTA
|
| 79 |
+
- **Partial D+1** (12 features): Load forecasts (masked D+2-D+14)
|
| 80 |
+
- **Historical only** (1,899 features): Prices, generation, demand
|
| 81 |
+
|
| 82 |
+
### Dynamic Forecast System
|
| 83 |
+
|
| 84 |
+
Uses `DynamicForecast` module to extract context and future covariates based on run date:
|
| 85 |
+
- Context window: 512 hours (historical data)
|
| 86 |
+
- Forecast horizon: 336 hours (14 days)
|
| 87 |
+
- Automatic masking for partial availability
|
| 88 |
+
|
| 89 |
+
## 📚 Documentation
|
| 90 |
+
|
| 91 |
+
- [Project Repository](https://github.com/evgspacdmy/fbmc_chronos2)
|
| 92 |
+
- [Activity Log](https://github.com/evgspacdmy/fbmc_chronos2/blob/main/doc/activity.md)
|
| 93 |
+
- [Feature Engineering Details](https://github.com/evgspacdmy/fbmc_chronos2/tree/main/src/feature_engineering)
|
| 94 |
+
|
| 95 |
+
## 🔄 Phase 2 Roadmap
|
| 96 |
+
|
| 97 |
+
Future improvements (not included in zero-shot MVP):
|
| 98 |
+
- Fine-tuning on FBMC data
|
| 99 |
+
- Ensemble methods
|
| 100 |
+
- Probabilistic forecasting
|
| 101 |
+
- Real-time data pipeline
|
| 102 |
+
- Production API
|
| 103 |
+
|
| 104 |
+
## 👤 Author
|
| 105 |
+
|
| 106 |
+
**Evgueni Poloukarov**
|
| 107 |
+
|
| 108 |
+
## 📄 License
|
| 109 |
+
|
| 110 |
+
MIT License - See LICENSE file for details
|
| 111 |
+
|
| 112 |
+
---
|
| 113 |
+
|
| 114 |
+
**Last Updated**: 2025-11-14
|
| 115 |
+
**Version**: 1.0.0 (Zero-Shot MVP)
|
extend_dataset.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Extend 24-month dataset with October 2025 features.
|
| 2 |
+
|
| 3 |
+
Merges October feature files and appends to existing 24-month unified dataset.
|
| 4 |
+
Creates extended dataset: 17,544 + 336 = 17,880 rows (Oct 2023 - Oct 14, 2025)
|
| 5 |
+
|
| 6 |
+
Author: Claude
|
| 7 |
+
Date: 2025-11-14
|
| 8 |
+
"""
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
import polars as pl
|
| 11 |
+
import sys
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def merge_october_features() -> pl.DataFrame:
|
| 15 |
+
"""Merge October feature files into single dataframe."""
|
| 16 |
+
print("\n" + "=" * 80)
|
| 17 |
+
print("MERGING OCTOBER FEATURES")
|
| 18 |
+
print("=" * 80)
|
| 19 |
+
|
| 20 |
+
processed_dir = Path("data/processed")
|
| 21 |
+
|
| 22 |
+
# Load October feature files
|
| 23 |
+
weather_file = processed_dir / "features_weather_october.parquet"
|
| 24 |
+
entsoe_file = processed_dir / "features_entsoe_october.parquet"
|
| 25 |
+
jao_file = processed_dir / "features_jao_october.parquet"
|
| 26 |
+
|
| 27 |
+
print("\nLoading October features...")
|
| 28 |
+
weather_df = pl.read_parquet(weather_file)
|
| 29 |
+
# Cast timestamp to nanosecond precision for consistency
|
| 30 |
+
weather_df = weather_df.with_columns([
|
| 31 |
+
pl.col('timestamp').dt.cast_time_unit('ns').alias('timestamp')
|
| 32 |
+
])
|
| 33 |
+
print(f" Weather: {weather_df.shape}")
|
| 34 |
+
|
| 35 |
+
entsoe_df = pl.read_parquet(entsoe_file)
|
| 36 |
+
# Ensure timestamp is nanosecond precision
|
| 37 |
+
entsoe_df = entsoe_df.with_columns([
|
| 38 |
+
pl.col('timestamp').dt.cast_time_unit('ns').alias('timestamp')
|
| 39 |
+
])
|
| 40 |
+
print(f" ENTSO-E: {entsoe_df.shape}")
|
| 41 |
+
|
| 42 |
+
# Check if JAO features exist
|
| 43 |
+
if jao_file.exists():
|
| 44 |
+
jao_df = pl.read_parquet(jao_file)
|
| 45 |
+
print(f" JAO: {jao_df.shape}")
|
| 46 |
+
else:
|
| 47 |
+
jao_df = None
|
| 48 |
+
print(f" JAO: Not available (will use zeros)")
|
| 49 |
+
|
| 50 |
+
# Merge features
|
| 51 |
+
print("\nMerging features...")
|
| 52 |
+
unified = weather_df.join(entsoe_df, on='timestamp', how='left', coalesce=True)
|
| 53 |
+
print(f" Weather + ENTSO-E: {unified.shape}")
|
| 54 |
+
|
| 55 |
+
if jao_df is not None:
|
| 56 |
+
unified = unified.join(jao_df, on='timestamp', how='left', coalesce=True)
|
| 57 |
+
print(f" + JAO: {unified.shape}")
|
| 58 |
+
|
| 59 |
+
print(f"\n[OK] October unified features: {unified.shape}")
|
| 60 |
+
return unified
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def extend_dataset(october_features: pl.DataFrame) -> pl.DataFrame:
|
| 64 |
+
"""Append October features to 24-month dataset."""
|
| 65 |
+
print("\n" + "=" * 80)
|
| 66 |
+
print("EXTENDING 24-MONTH DATASET")
|
| 67 |
+
print("=" * 80)
|
| 68 |
+
|
| 69 |
+
processed_dir = Path("data/processed")
|
| 70 |
+
base_file = processed_dir / "features_unified_24month.parquet"
|
| 71 |
+
|
| 72 |
+
print("\nLoading 24-month dataset...")
|
| 73 |
+
base_df = pl.read_parquet(base_file)
|
| 74 |
+
print(f" Shape: {base_df.shape}")
|
| 75 |
+
print(f" Date range: {base_df['timestamp'].min()} to {base_df['timestamp'].max()}")
|
| 76 |
+
|
| 77 |
+
# Match October timestamp precision to base dataset
|
| 78 |
+
base_timestamp_dtype = base_df['timestamp'].dtype
|
| 79 |
+
october_features = october_features.with_columns([
|
| 80 |
+
pl.col('timestamp').cast(base_timestamp_dtype).alias('timestamp')
|
| 81 |
+
])
|
| 82 |
+
print(f" Matched timestamp precision: {base_timestamp_dtype}")
|
| 83 |
+
|
| 84 |
+
# Get column lists
|
| 85 |
+
base_cols = set(base_df.columns)
|
| 86 |
+
october_cols = set(october_features.columns)
|
| 87 |
+
|
| 88 |
+
# Find missing columns in October (JAO features likely missing)
|
| 89 |
+
missing_in_october = base_cols - october_cols
|
| 90 |
+
if missing_in_october:
|
| 91 |
+
print(f"\n Adding {len(missing_in_october)} missing columns to October (fill with nulls)")
|
| 92 |
+
for col in missing_in_october:
|
| 93 |
+
if col != 'timestamp':
|
| 94 |
+
october_features = october_features.with_columns([
|
| 95 |
+
pl.lit(None).cast(base_df[col].dtype).alias(col)
|
| 96 |
+
])
|
| 97 |
+
|
| 98 |
+
# Ensure ALL column dtypes match exactly (not just missing ones)
|
| 99 |
+
print("\n Matching column dtypes...")
|
| 100 |
+
dtype_fixes = []
|
| 101 |
+
for col in base_df.columns:
|
| 102 |
+
if col in october_features.columns:
|
| 103 |
+
base_dtype = base_df[col].dtype
|
| 104 |
+
october_dtype = october_features[col].dtype
|
| 105 |
+
if base_dtype != october_dtype:
|
| 106 |
+
dtype_fixes.append(col)
|
| 107 |
+
october_features = october_features.with_columns([
|
| 108 |
+
pl.col(col).cast(base_dtype).alias(col)
|
| 109 |
+
])
|
| 110 |
+
|
| 111 |
+
if dtype_fixes:
|
| 112 |
+
print(f" Fixed {len(dtype_fixes)} dtype mismatches")
|
| 113 |
+
|
| 114 |
+
# Ensure column order matches
|
| 115 |
+
october_features = october_features.select(base_df.columns)
|
| 116 |
+
|
| 117 |
+
print("\nAppending October features...")
|
| 118 |
+
extended_df = pl.concat([base_df, october_features], how='vertical')
|
| 119 |
+
|
| 120 |
+
print(f" Extended shape: {extended_df.shape}")
|
| 121 |
+
print(f" Date range: {extended_df['timestamp'].min()} to {extended_df['timestamp'].max()}")
|
| 122 |
+
print(f" Rows added: {len(extended_df) - len(base_df)}")
|
| 123 |
+
|
| 124 |
+
return extended_df
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def validate_extended_dataset(extended_df: pl.DataFrame):
|
| 128 |
+
"""Validate extended dataset."""
|
| 129 |
+
print("\n" + "=" * 80)
|
| 130 |
+
print("VALIDATING EXTENDED DATASET")
|
| 131 |
+
print("=" * 80)
|
| 132 |
+
|
| 133 |
+
expected_rows = 17880 # 24 months + 14 days
|
| 134 |
+
expected_cols = 2553 # From metadata
|
| 135 |
+
|
| 136 |
+
print(f"\nShape validation:")
|
| 137 |
+
print(f" Rows: {len(extended_df)} (expected {expected_rows})")
|
| 138 |
+
print(f" Columns: {len(extended_df.columns)} (expected {expected_cols})")
|
| 139 |
+
|
| 140 |
+
# Check for duplicates
|
| 141 |
+
duplicates = extended_df.filter(pl.col('timestamp').is_duplicated())
|
| 142 |
+
print(f"\nDuplicate timestamps: {len(duplicates)}")
|
| 143 |
+
|
| 144 |
+
# Check for gaps (skip - Duration comparison not supported in this Polars version)
|
| 145 |
+
# Just verify continuous hourly data by checking row count matches expected
|
| 146 |
+
expected_hours = (extended_df['timestamp'].max() - extended_df['timestamp'].min()).total_seconds() / 3600 + 1
|
| 147 |
+
actual_hours = len(extended_df)
|
| 148 |
+
print(f"Time continuity: {actual_hours} hours (expected ~{int(expected_hours)})")
|
| 149 |
+
|
| 150 |
+
# Null counts
|
| 151 |
+
total_nulls = extended_df.null_count().sum_horizontal().to_list()[0]
|
| 152 |
+
print(f"\nTotal null values: {total_nulls}")
|
| 153 |
+
|
| 154 |
+
# Date range
|
| 155 |
+
date_start = extended_df['timestamp'].min()
|
| 156 |
+
date_end = extended_df['timestamp'].max()
|
| 157 |
+
print(f"\nDate range:")
|
| 158 |
+
print(f" Start: {date_start}")
|
| 159 |
+
print(f" End: {date_end}")
|
| 160 |
+
|
| 161 |
+
# Validation result
|
| 162 |
+
issues = []
|
| 163 |
+
if len(extended_df) != expected_rows:
|
| 164 |
+
issues.append(f"Row count mismatch: {len(extended_df)} != {expected_rows}")
|
| 165 |
+
if len(duplicates) > 0:
|
| 166 |
+
issues.append(f"Found {len(duplicates)} duplicate timestamps")
|
| 167 |
+
|
| 168 |
+
if issues:
|
| 169 |
+
print("\n[WARNING] Validation issues:")
|
| 170 |
+
for issue in issues:
|
| 171 |
+
print(f" - {issue}")
|
| 172 |
+
return False
|
| 173 |
+
else:
|
| 174 |
+
print("\n[OK] All validation checks passed!")
|
| 175 |
+
return True
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def main():
|
| 179 |
+
"""Main execution: Merge October features and extend dataset."""
|
| 180 |
+
print("\n" + "=" * 80)
|
| 181 |
+
print("DATASET EXTENSION: October 2025")
|
| 182 |
+
print("Extending 24-month dataset (17,544 -> 17,880 rows)")
|
| 183 |
+
print("=" * 80)
|
| 184 |
+
|
| 185 |
+
try:
|
| 186 |
+
# Merge October features
|
| 187 |
+
october_features = merge_october_features()
|
| 188 |
+
|
| 189 |
+
# Extend dataset
|
| 190 |
+
extended_df = extend_dataset(october_features)
|
| 191 |
+
|
| 192 |
+
# Validate
|
| 193 |
+
validation_passed = validate_extended_dataset(extended_df)
|
| 194 |
+
|
| 195 |
+
if validation_passed:
|
| 196 |
+
# Save extended dataset
|
| 197 |
+
output_file = Path("data/processed/features_unified_extended.parquet")
|
| 198 |
+
extended_df.write_parquet(output_file)
|
| 199 |
+
|
| 200 |
+
print("\n" + "=" * 80)
|
| 201 |
+
print("SUCCESS: Dataset extension complete!")
|
| 202 |
+
print("=" * 80)
|
| 203 |
+
print(f"\nExtended dataset saved:")
|
| 204 |
+
print(f" File: {output_file}")
|
| 205 |
+
print(f" Shape: {extended_df.shape}")
|
| 206 |
+
print(f" Size: {output_file.stat().st_size / 1024 / 1024:.1f} MB")
|
| 207 |
+
print("\nNext steps:")
|
| 208 |
+
print(" 1. Upload to HuggingFace Datasets")
|
| 209 |
+
print(" 2. Create inference notebooks")
|
| 210 |
+
print(" 3. Deploy to HF Space")
|
| 211 |
+
else:
|
| 212 |
+
print("\n[ERROR] Validation failed - please review issues")
|
| 213 |
+
sys.exit(1)
|
| 214 |
+
|
| 215 |
+
except Exception as e:
|
| 216 |
+
error_msg = str(e).encode('ascii', 'replace').decode('ascii')
|
| 217 |
+
print(f"\n[ERROR] Dataset extension failed: {error_msg}")
|
| 218 |
+
import traceback
|
| 219 |
+
traceback.print_exc()
|
| 220 |
+
sys.exit(1)
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
if __name__ == "__main__":
|
| 224 |
+
main()
|
login.html
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{% extends "page.html" %}
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
{% block stylesheet %}
|
| 5 |
+
{% endblock %}
|
| 6 |
+
|
| 7 |
+
{% block site %}
|
| 8 |
+
|
| 9 |
+
<div id="jupyter-main-app" class="container">
|
| 10 |
+
|
| 11 |
+
<img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" alt="Hugging Face Logo">
|
| 12 |
+
<h4>Welcome to JupyterLab</h4>
|
| 13 |
+
|
| 14 |
+
<h5>The default token is <span style="color:orange;">huggingface</span></h5>
|
| 15 |
+
|
| 16 |
+
{% if login_available %}
|
| 17 |
+
{# login_available means password-login is allowed. Show the form. #}
|
| 18 |
+
<div class="row">
|
| 19 |
+
<div class="navbar col-sm-8">
|
| 20 |
+
<div class="navbar-inner">
|
| 21 |
+
<div class="container">
|
| 22 |
+
<div class="center-nav">
|
| 23 |
+
<form action="{{base_url}}login?next={{next}}" method="post" class="navbar-form pull-left">
|
| 24 |
+
{{ xsrf_form_html() | safe }}
|
| 25 |
+
{% if token_available %}
|
| 26 |
+
<label for="password_input"><strong>{% trans %}Jupyter token <span title="This is the secret you set up when deploying your JupyterLab space">ⓘ</span> {% endtrans
|
| 27 |
+
%}</strong></label>
|
| 28 |
+
{% else %}
|
| 29 |
+
<label for="password_input"><strong>{% trans %}Jupyter password:{% endtrans %}</strong></label>
|
| 30 |
+
{% endif %}
|
| 31 |
+
<input type="password" name="password" id="password_input" class="form-control">
|
| 32 |
+
<button type="submit" class="btn btn-default" id="login_submit">{% trans %}Log in{% endtrans
|
| 33 |
+
%}</button>
|
| 34 |
+
</form>
|
| 35 |
+
</div>
|
| 36 |
+
</div>
|
| 37 |
+
</div>
|
| 38 |
+
</div>
|
| 39 |
+
</div>
|
| 40 |
+
{% else %}
|
| 41 |
+
<p>{% trans %}No login available, you shouldn't be seeing this page.{% endtrans %}</p>
|
| 42 |
+
{% endif %}
|
| 43 |
+
|
| 44 |
+
<h5>If you don't have the credentials for this Jupyter space, <a target="_blank" href="https://huggingface.co/spaces/SpacesExamples/jupyterlab?duplicate=true">create your own.</a></h5>
|
| 45 |
+
<br>
|
| 46 |
+
|
| 47 |
+
<p>This template was created by <a href="https://twitter.com/camenduru" target="_blank" >camenduru</a> and <a href="https://huggingface.co/nateraw" target="_blank" >nateraw</a>, with contributions of <a href="https://huggingface.co/osanseviero" target="_blank" >osanseviero</a> and <a href="https://huggingface.co/azzr" target="_blank" >azzr</a> </p>
|
| 48 |
+
{% if message %}
|
| 49 |
+
<div class="row">
|
| 50 |
+
{% for key in message %}
|
| 51 |
+
<div class="message {{key}}">
|
| 52 |
+
{{message[key]}}
|
| 53 |
+
</div>
|
| 54 |
+
{% endfor %}
|
| 55 |
+
</div>
|
| 56 |
+
{% endif %}
|
| 57 |
+
{% if token_available %}
|
| 58 |
+
{% block token_message %}
|
| 59 |
+
|
| 60 |
+
{% endblock token_message %}
|
| 61 |
+
{% endif %}
|
| 62 |
+
</div>
|
| 63 |
+
|
| 64 |
+
{% endblock %}
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
{% block script %}
|
| 68 |
+
{% endblock %}
|
on_startup.sh
CHANGED
|
@@ -1,3 +1,5 @@
|
|
| 1 |
#!/bin/bash
|
| 2 |
-
|
| 3 |
-
#
|
|
|
|
|
|
|
|
|
| 1 |
#!/bin/bash
|
| 2 |
+
# Write some commands here that will run on root user before startup.
|
| 3 |
+
# For example, to clone transformers and install it in dev mode:
|
| 4 |
+
# git clone https://github.com/huggingface/transformers.git
|
| 5 |
+
# cd transformers && pip install -e ".[dev]"
|
packages.txt
CHANGED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
tree
|
process_october_features.py
ADDED
|
@@ -0,0 +1,388 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Process October 2025 raw data into features for dataset extension.
|
| 2 |
+
|
| 3 |
+
This script processes the October 2025 raw data (downloaded Nov 13) and generates
|
| 4 |
+
feature files matching the 24-month dataset schema:
|
| 5 |
+
- Weather features: 375 features
|
| 6 |
+
- ENTSO-E features: ~1,863 features
|
| 7 |
+
- JAO features: 276 features (if October data exists)
|
| 8 |
+
|
| 9 |
+
Output files will be saved to data/processed/ with "_october" suffix.
|
| 10 |
+
|
| 11 |
+
Author: Claude
|
| 12 |
+
Date: 2025-11-14
|
| 13 |
+
"""
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
import polars as pl
|
| 16 |
+
import sys
|
| 17 |
+
|
| 18 |
+
# Add src to path for imports
|
| 19 |
+
sys.path.append(str(Path(__file__).parent / "src"))
|
| 20 |
+
|
| 21 |
+
from feature_engineering.engineer_weather_features import (
|
| 22 |
+
engineer_grid_level_features,
|
| 23 |
+
engineer_temporal_lags,
|
| 24 |
+
engineer_derived_features
|
| 25 |
+
)
|
| 26 |
+
from feature_engineering.engineer_entsoe_features import (
|
| 27 |
+
engineer_generation_features,
|
| 28 |
+
engineer_demand_features,
|
| 29 |
+
engineer_price_features,
|
| 30 |
+
engineer_hydro_storage_features,
|
| 31 |
+
engineer_pumped_storage_features,
|
| 32 |
+
engineer_load_forecast_features,
|
| 33 |
+
engineer_transmission_outage_features
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def process_october_weather() -> pl.DataFrame:
|
| 38 |
+
"""Process October weather data into 375 features."""
|
| 39 |
+
print("\n" + "=" * 80)
|
| 40 |
+
print("PROCESSING OCTOBER WEATHER DATA")
|
| 41 |
+
print("=" * 80)
|
| 42 |
+
|
| 43 |
+
raw_file = Path("data/raw/weather_october_2025.parquet")
|
| 44 |
+
if not raw_file.exists():
|
| 45 |
+
raise FileNotFoundError(f"Missing: {raw_file}")
|
| 46 |
+
|
| 47 |
+
# Load October weather data
|
| 48 |
+
weather_df = pl.read_parquet(raw_file)
|
| 49 |
+
print(f"\nLoaded weather data: {weather_df.shape}")
|
| 50 |
+
print(f"Date range: {weather_df['timestamp'].min()} to {weather_df['timestamp'].max()}")
|
| 51 |
+
|
| 52 |
+
# Engineer features using existing modules
|
| 53 |
+
features = engineer_grid_level_features(weather_df)
|
| 54 |
+
features = engineer_temporal_lags(features)
|
| 55 |
+
features = engineer_derived_features(features)
|
| 56 |
+
|
| 57 |
+
# Save to processed directory
|
| 58 |
+
output_file = Path("data/processed/features_weather_october.parquet")
|
| 59 |
+
features.write_parquet(output_file)
|
| 60 |
+
|
| 61 |
+
print(f"\n[OK] Weather features saved: {output_file}")
|
| 62 |
+
print(f" Shape: {features.shape}")
|
| 63 |
+
print(f" Features: {len(features.columns) - 1} (+ timestamp)")
|
| 64 |
+
|
| 65 |
+
return features
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def process_october_entsoe() -> pl.DataFrame:
|
| 69 |
+
"""Process October ENTSO-E data into ~1,863 features."""
|
| 70 |
+
print("\n" + "=" * 80)
|
| 71 |
+
print("PROCESSING OCTOBER ENTSO-E DATA")
|
| 72 |
+
print("=" * 80)
|
| 73 |
+
|
| 74 |
+
# Check which ENTSO-E files exist
|
| 75 |
+
raw_dir = Path("data/raw")
|
| 76 |
+
processed_dir = Path("data/processed")
|
| 77 |
+
required_files = {
|
| 78 |
+
'generation': raw_dir / "entsoe_generation_october_2025.parquet",
|
| 79 |
+
'demand': raw_dir / "entsoe_demand_october_2025.parquet",
|
| 80 |
+
'prices': raw_dir / "entsoe_prices_october_2025.parquet",
|
| 81 |
+
'hydro_storage': raw_dir / "entsoe_hydro_storage_october_2025.parquet",
|
| 82 |
+
'pumped_storage': raw_dir / "entsoe_pumped_storage_october_2025.parquet",
|
| 83 |
+
'load_forecast': raw_dir / "entsoe_load_forecast_october_2025.parquet",
|
| 84 |
+
'transmission_outages': raw_dir / "entsoe_transmission_outages_october_2025.parquet"
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
# Load CNEC master list (required for transmission outage features)
|
| 88 |
+
cnec_master_path = processed_dir / "cnecs_master_176.csv"
|
| 89 |
+
if not cnec_master_path.exists():
|
| 90 |
+
raise FileNotFoundError(f"Missing CNEC master list: {cnec_master_path}")
|
| 91 |
+
cnec_master_df = pl.read_csv(cnec_master_path)
|
| 92 |
+
print(f"\nLoaded CNEC master list: {cnec_master_df.shape}")
|
| 93 |
+
|
| 94 |
+
# Verify all files exist
|
| 95 |
+
for name, file_path in required_files.items():
|
| 96 |
+
if not file_path.exists():
|
| 97 |
+
print(f"WARNING: Missing {name} file: {file_path}")
|
| 98 |
+
|
| 99 |
+
# Load all datasets
|
| 100 |
+
print("\nLoading ENTSO-E datasets...")
|
| 101 |
+
generation_df = pl.read_parquet(required_files['generation'])
|
| 102 |
+
demand_df = pl.read_parquet(required_files['demand'])
|
| 103 |
+
prices_df = pl.read_parquet(required_files['prices'])
|
| 104 |
+
hydro_storage_df = pl.read_parquet(required_files['hydro_storage'])
|
| 105 |
+
pumped_storage_df = pl.read_parquet(required_files['pumped_storage'])
|
| 106 |
+
load_forecast_df = pl.read_parquet(required_files['load_forecast'])
|
| 107 |
+
transmission_outages_df = pl.read_parquet(required_files['transmission_outages'])
|
| 108 |
+
|
| 109 |
+
print(f" Generation: {generation_df.shape}")
|
| 110 |
+
print(f" Demand: {demand_df.shape}")
|
| 111 |
+
print(f" Prices: {prices_df.shape}")
|
| 112 |
+
print(f" Hydro storage: {hydro_storage_df.shape}")
|
| 113 |
+
print(f" Pumped storage: {pumped_storage_df.shape}")
|
| 114 |
+
print(f" Load forecast: {load_forecast_df.shape}")
|
| 115 |
+
print(f" Transmission outages: {transmission_outages_df.shape}")
|
| 116 |
+
|
| 117 |
+
# Engineer features for each category
|
| 118 |
+
print("\nEngineering ENTSO-E features...")
|
| 119 |
+
|
| 120 |
+
# Generation features (~228 features)
|
| 121 |
+
gen_features = engineer_generation_features(generation_df)
|
| 122 |
+
|
| 123 |
+
# Demand features (24 features)
|
| 124 |
+
demand_features = engineer_demand_features(demand_df)
|
| 125 |
+
|
| 126 |
+
# Price features (24 features)
|
| 127 |
+
price_features = engineer_price_features(prices_df)
|
| 128 |
+
|
| 129 |
+
# Hydro storage features (12 features)
|
| 130 |
+
hydro_features = engineer_hydro_storage_features(hydro_storage_df)
|
| 131 |
+
|
| 132 |
+
# Pumped storage features (10 features)
|
| 133 |
+
pumped_features = engineer_pumped_storage_features(pumped_storage_df)
|
| 134 |
+
|
| 135 |
+
# Load forecast features (12 features)
|
| 136 |
+
load_forecast_features = engineer_load_forecast_features(load_forecast_df)
|
| 137 |
+
|
| 138 |
+
# Transmission outage features (176 features - ALL CNECs)
|
| 139 |
+
# Create hourly range for October (Oct 1-14, 2025)
|
| 140 |
+
import datetime
|
| 141 |
+
october_start = datetime.datetime(2025, 10, 1, 0, 0)
|
| 142 |
+
october_end = datetime.datetime(2025, 10, 14, 23, 0)
|
| 143 |
+
hourly_range = pl.DataFrame({
|
| 144 |
+
'timestamp': pl.datetime_range(
|
| 145 |
+
october_start,
|
| 146 |
+
october_end,
|
| 147 |
+
interval='1h',
|
| 148 |
+
eager=True
|
| 149 |
+
)
|
| 150 |
+
})
|
| 151 |
+
|
| 152 |
+
transmission_features = engineer_transmission_outage_features(
|
| 153 |
+
transmission_outages_df,
|
| 154 |
+
cnec_master_df,
|
| 155 |
+
hourly_range
|
| 156 |
+
)
|
| 157 |
+
|
| 158 |
+
# Merge all features
|
| 159 |
+
print("\nMerging all ENTSO-E features...")
|
| 160 |
+
features = gen_features
|
| 161 |
+
|
| 162 |
+
# Fix timezone and precision issues - ensure all timestamps are timezone-naive and nanosecond precision
|
| 163 |
+
features = features.with_columns([
|
| 164 |
+
pl.col('timestamp').dt.replace_time_zone(None).dt.cast_time_unit('ns').alias('timestamp')
|
| 165 |
+
])
|
| 166 |
+
|
| 167 |
+
for feat_df, name in [
|
| 168 |
+
(demand_features, "demand"),
|
| 169 |
+
(price_features, "prices"),
|
| 170 |
+
(hydro_features, "hydro_storage"),
|
| 171 |
+
(pumped_features, "pumped_storage"),
|
| 172 |
+
(load_forecast_features, "load_forecast"),
|
| 173 |
+
(transmission_features, "transmission_outages")
|
| 174 |
+
]:
|
| 175 |
+
# Ensure timezone and precision consistency
|
| 176 |
+
if 'timestamp' in feat_df.columns:
|
| 177 |
+
feat_df = feat_df.with_columns([
|
| 178 |
+
pl.col('timestamp').dt.replace_time_zone(None).dt.cast_time_unit('ns').alias('timestamp')
|
| 179 |
+
])
|
| 180 |
+
|
| 181 |
+
features = features.join(feat_df, on='timestamp', how='left', coalesce=True)
|
| 182 |
+
print(f" Added {name}: {len(feat_df.columns) - 1} features")
|
| 183 |
+
|
| 184 |
+
# Resample to hourly (some datasets have sub-hourly data)
|
| 185 |
+
print("\nResampling to hourly...")
|
| 186 |
+
features = features.with_columns([
|
| 187 |
+
pl.col('timestamp').dt.truncate('1h').alias('timestamp')
|
| 188 |
+
])
|
| 189 |
+
|
| 190 |
+
# Group by hour and take mean (for any sub-hourly values)
|
| 191 |
+
agg_exprs = [pl.col(c).mean().alias(c) for c in features.columns if c != 'timestamp']
|
| 192 |
+
features = features.group_by('timestamp').agg(agg_exprs).sort('timestamp')
|
| 193 |
+
|
| 194 |
+
print(f" Resampled to {len(features)} hourly rows")
|
| 195 |
+
|
| 196 |
+
# Ensure complete 336-hour range (Oct 1-14) - fill missing hours with forward-fill
|
| 197 |
+
october_start = datetime.datetime(2025, 10, 1, 0, 0)
|
| 198 |
+
october_end = datetime.datetime(2025, 10, 14, 23, 0)
|
| 199 |
+
complete_range = pl.DataFrame({
|
| 200 |
+
'timestamp': pl.datetime_range(
|
| 201 |
+
october_start,
|
| 202 |
+
october_end,
|
| 203 |
+
interval='1h',
|
| 204 |
+
eager=True
|
| 205 |
+
)
|
| 206 |
+
})
|
| 207 |
+
|
| 208 |
+
# Cast complete_range timestamp to match features precision
|
| 209 |
+
complete_range = complete_range.with_columns([
|
| 210 |
+
pl.col('timestamp').dt.cast_time_unit('ns').alias('timestamp')
|
| 211 |
+
])
|
| 212 |
+
|
| 213 |
+
# Join to complete range and forward-fill missing values
|
| 214 |
+
features = complete_range.join(features, on='timestamp', how='left')
|
| 215 |
+
|
| 216 |
+
# Forward-fill missing values
|
| 217 |
+
fill_exprs = []
|
| 218 |
+
for col in features.columns:
|
| 219 |
+
if col != 'timestamp':
|
| 220 |
+
fill_exprs.append(pl.col(col).forward_fill().alias(col))
|
| 221 |
+
|
| 222 |
+
if fill_exprs:
|
| 223 |
+
features = features.with_columns(fill_exprs)
|
| 224 |
+
|
| 225 |
+
missing_count = 336 - len(features.filter(pl.all_horizontal(pl.all().is_not_null())))
|
| 226 |
+
if missing_count > 0:
|
| 227 |
+
print(f" Forward-filled {missing_count} missing hours")
|
| 228 |
+
|
| 229 |
+
print(f" Final shape: {len(features)} hourly rows (Oct 1-14)")
|
| 230 |
+
|
| 231 |
+
# Save to processed directory
|
| 232 |
+
output_file = Path("data/processed/features_entsoe_october.parquet")
|
| 233 |
+
features.write_parquet(output_file)
|
| 234 |
+
|
| 235 |
+
print(f"\n[OK] ENTSO-E features saved: {output_file}")
|
| 236 |
+
print(f" Shape: {features.shape}")
|
| 237 |
+
print(f" Features: {len(features.columns) - 1} (+ timestamp)")
|
| 238 |
+
|
| 239 |
+
return features
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
def process_october_jao() -> pl.DataFrame | None:
|
| 243 |
+
"""Process October JAO data into 276 features (if data exists)."""
|
| 244 |
+
print("\n" + "=" * 80)
|
| 245 |
+
print("PROCESSING OCTOBER JAO DATA")
|
| 246 |
+
print("=" * 80)
|
| 247 |
+
|
| 248 |
+
# Check if October JAO data exists
|
| 249 |
+
raw_file = Path("data/raw/jao_october_2025.parquet")
|
| 250 |
+
|
| 251 |
+
if not raw_file.exists():
|
| 252 |
+
print(f"\nINFO: No October JAO data found at {raw_file}")
|
| 253 |
+
print("This is expected - JAO features may be historical only.")
|
| 254 |
+
print("Skipping JAO feature engineering for October.")
|
| 255 |
+
return None
|
| 256 |
+
|
| 257 |
+
# If data exists, process it
|
| 258 |
+
from feature_engineering.engineer_jao_features import (
|
| 259 |
+
engineer_jao_features_all
|
| 260 |
+
)
|
| 261 |
+
|
| 262 |
+
jao_df = pl.read_parquet(raw_file)
|
| 263 |
+
print(f"\nLoaded JAO data: {jao_df.shape}")
|
| 264 |
+
|
| 265 |
+
features = engineer_jao_features_all(jao_df)
|
| 266 |
+
|
| 267 |
+
# Save to processed directory
|
| 268 |
+
output_file = Path("data/processed/features_jao_october.parquet")
|
| 269 |
+
features.write_parquet(output_file)
|
| 270 |
+
|
| 271 |
+
print(f"\n[OK] JAO features saved: {output_file}")
|
| 272 |
+
print(f" Shape: {features.shape}")
|
| 273 |
+
|
| 274 |
+
return features
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
def validate_october_features():
|
| 278 |
+
"""Validate October feature files match expected schema."""
|
| 279 |
+
print("\n" + "=" * 80)
|
| 280 |
+
print("VALIDATING OCTOBER FEATURES")
|
| 281 |
+
print("=" * 80)
|
| 282 |
+
|
| 283 |
+
# Load October feature files
|
| 284 |
+
weather_file = Path("data/processed/features_weather_october.parquet")
|
| 285 |
+
entsoe_file = Path("data/processed/features_entsoe_october.parquet")
|
| 286 |
+
jao_file = Path("data/processed/features_jao_october.parquet")
|
| 287 |
+
|
| 288 |
+
weather_df = pl.read_parquet(weather_file)
|
| 289 |
+
entsoe_df = pl.read_parquet(entsoe_file)
|
| 290 |
+
|
| 291 |
+
print(f"\nWeather features: {weather_df.shape}")
|
| 292 |
+
print(f" Rows (expected 336): {len(weather_df)}")
|
| 293 |
+
print(f" Features (expected 375): {len(weather_df.columns) - 1}")
|
| 294 |
+
|
| 295 |
+
print(f"\nENTSO-E features: {entsoe_df.shape}")
|
| 296 |
+
print(f" Rows (expected 336): {len(entsoe_df)}")
|
| 297 |
+
print(f" Features (expected ~1,863): {len(entsoe_df.columns) - 1}")
|
| 298 |
+
|
| 299 |
+
if jao_file.exists():
|
| 300 |
+
jao_df = pl.read_parquet(jao_file)
|
| 301 |
+
print(f"\nJAO features: {jao_df.shape}")
|
| 302 |
+
print(f" Rows (expected 336): {len(jao_df)}")
|
| 303 |
+
print(f" Features (expected 276): {len(jao_df.columns) - 1}")
|
| 304 |
+
else:
|
| 305 |
+
print("\nJAO features: Not generated (no October JAO data)")
|
| 306 |
+
|
| 307 |
+
# Validate row count (14 days × 24 hours = 336)
|
| 308 |
+
expected_rows = 336
|
| 309 |
+
|
| 310 |
+
issues = []
|
| 311 |
+
if len(weather_df) != expected_rows:
|
| 312 |
+
issues.append(f"Weather rows: {len(weather_df)} (expected {expected_rows})")
|
| 313 |
+
if len(entsoe_df) != expected_rows:
|
| 314 |
+
issues.append(f"ENTSO-E rows: {len(entsoe_df)} (expected {expected_rows})")
|
| 315 |
+
|
| 316 |
+
# Validate date range (Oct 1-14, 2025)
|
| 317 |
+
weather_start = weather_df['timestamp'].min()
|
| 318 |
+
weather_end = weather_df['timestamp'].max()
|
| 319 |
+
entsoe_start = entsoe_df['timestamp'].min()
|
| 320 |
+
entsoe_end = entsoe_df['timestamp'].max()
|
| 321 |
+
|
| 322 |
+
print(f"\nDate ranges:")
|
| 323 |
+
print(f" Weather: {weather_start} to {weather_end}")
|
| 324 |
+
print(f" ENTSO-E: {entsoe_start} to {entsoe_end}")
|
| 325 |
+
|
| 326 |
+
# Check for null values
|
| 327 |
+
weather_nulls = weather_df.null_count().sum_horizontal().to_list()[0]
|
| 328 |
+
entsoe_nulls = entsoe_df.null_count().sum_horizontal().to_list()[0]
|
| 329 |
+
|
| 330 |
+
print(f"\nNull value counts:")
|
| 331 |
+
print(f" Weather: {weather_nulls} nulls")
|
| 332 |
+
print(f" ENTSO-E: {entsoe_nulls} nulls")
|
| 333 |
+
|
| 334 |
+
# Report validation results
|
| 335 |
+
if issues:
|
| 336 |
+
print("\n[WARNING] Validation issues found:")
|
| 337 |
+
for issue in issues:
|
| 338 |
+
print(f" - {issue}")
|
| 339 |
+
else:
|
| 340 |
+
print("\n[OK] All validation checks passed!")
|
| 341 |
+
|
| 342 |
+
return len(issues) == 0
|
| 343 |
+
|
| 344 |
+
|
| 345 |
+
def main():
|
| 346 |
+
"""Main execution: Process all October data."""
|
| 347 |
+
print("\n" + "=" * 80)
|
| 348 |
+
print("OCTOBER 2025 FEATURE ENGINEERING")
|
| 349 |
+
print("Processing raw data into features for dataset extension")
|
| 350 |
+
print("=" * 80)
|
| 351 |
+
|
| 352 |
+
try:
|
| 353 |
+
# Process each feature category
|
| 354 |
+
weather_features = process_october_weather()
|
| 355 |
+
entsoe_features = process_october_entsoe()
|
| 356 |
+
jao_features = process_october_jao() # May return None
|
| 357 |
+
|
| 358 |
+
# Validate features
|
| 359 |
+
validation_passed = validate_october_features()
|
| 360 |
+
|
| 361 |
+
if validation_passed:
|
| 362 |
+
print("\n" + "=" * 80)
|
| 363 |
+
print("SUCCESS: October feature engineering complete!")
|
| 364 |
+
print("=" * 80)
|
| 365 |
+
print("\nGenerated files:")
|
| 366 |
+
print(" - data/processed/features_weather_october.parquet")
|
| 367 |
+
print(" - data/processed/features_entsoe_october.parquet")
|
| 368 |
+
if jao_features is not None:
|
| 369 |
+
print(" - data/processed/features_jao_october.parquet")
|
| 370 |
+
print("\nNext steps:")
|
| 371 |
+
print(" 1. Merge October features into unified dataset")
|
| 372 |
+
print(" 2. Append to 24-month dataset (17,544 -> 17,880 rows)")
|
| 373 |
+
print(" 3. Upload extended dataset to HuggingFace")
|
| 374 |
+
else:
|
| 375 |
+
print("\n[ERROR] Validation failed - please review issues above")
|
| 376 |
+
sys.exit(1)
|
| 377 |
+
|
| 378 |
+
except Exception as e:
|
| 379 |
+
# Avoid Unicode errors on Windows console
|
| 380 |
+
error_msg = str(e).encode('ascii', 'replace').decode('ascii')
|
| 381 |
+
print(f"\n[ERROR] Feature engineering failed: {error_msg}")
|
| 382 |
+
import traceback
|
| 383 |
+
traceback.print_exc()
|
| 384 |
+
sys.exit(1)
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
if __name__ == "__main__":
|
| 388 |
+
main()
|
requirements_hf_space.txt
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# HuggingFace Space Requirements for FBMC Chronos-2 Forecasting
|
| 2 |
+
# GPU-optimized dependencies for JupyterLab SDK
|
| 3 |
+
|
| 4 |
+
# Core ML/Data
|
| 5 |
+
torch>=2.0.0
|
| 6 |
+
transformers>=4.35.0
|
| 7 |
+
chronos-forecasting>=1.2.0
|
| 8 |
+
datasets>=2.14.0
|
| 9 |
+
polars>=0.19.0
|
| 10 |
+
pyarrow>=13.0.0
|
| 11 |
+
|
| 12 |
+
# HuggingFace
|
| 13 |
+
huggingface-hub>=0.19.0
|
| 14 |
+
|
| 15 |
+
# Visualization
|
| 16 |
+
altair>=5.0.0
|
| 17 |
+
vega-datasets
|
| 18 |
+
|
| 19 |
+
# Jupyter
|
| 20 |
+
ipykernel
|
| 21 |
+
jupyter
|
| 22 |
+
jupyterlab
|
| 23 |
+
|
| 24 |
+
# Utilities
|
| 25 |
+
python-dotenv
|
| 26 |
+
tqdm
|
start_server.sh
CHANGED
|
@@ -1,20 +1,19 @@
|
|
| 1 |
#!/bin/bash
|
| 2 |
-
|
| 3 |
JUPYTER_TOKEN="${JUPYTER_TOKEN:=huggingface}"
|
| 4 |
|
| 5 |
-
NOTEBOOK_DIR="/
|
| 6 |
|
| 7 |
jupyter labextension disable "@jupyterlab/apputils-extension:announcements"
|
| 8 |
|
| 9 |
jupyter-lab \
|
| 10 |
-
--ip 0.0.0.0 \
|
| 11 |
-
--port 7860 \
|
| 12 |
-
--no-browser \
|
| 13 |
-
--allow-root \
|
| 14 |
-
--ServerApp.token="$JUPYTER_TOKEN" \
|
| 15 |
-
--ServerApp.tornado_settings="{'headers': {'Content-Security-Policy': 'frame-ancestors *'}}" \
|
| 16 |
-
--ServerApp.cookie_options="{'SameSite': 'None', 'Secure': True}" \
|
| 17 |
-
--ServerApp.disable_check_xsrf=True \
|
| 18 |
-
--LabApp.news_url=None \
|
| 19 |
-
--LabApp.check_for_updates_class="jupyterlab.NeverCheckForUpdate" \
|
| 20 |
-
--notebook-dir=$NOTEBOOK_DIR
|
|
|
|
| 1 |
#!/bin/bash
|
|
|
|
| 2 |
JUPYTER_TOKEN="${JUPYTER_TOKEN:=huggingface}"
|
| 3 |
|
| 4 |
+
NOTEBOOK_DIR="/data"
|
| 5 |
|
| 6 |
jupyter labextension disable "@jupyterlab/apputils-extension:announcements"
|
| 7 |
|
| 8 |
jupyter-lab \
|
| 9 |
+
--ip 0.0.0.0 \
|
| 10 |
+
--port 7860 \
|
| 11 |
+
--no-browser \
|
| 12 |
+
--allow-root \
|
| 13 |
+
--ServerApp.token="$JUPYTER_TOKEN" \
|
| 14 |
+
--ServerApp.tornado_settings="{'headers': {'Content-Security-Policy': 'frame-ancestors *'}}" \
|
| 15 |
+
--ServerApp.cookie_options="{'SameSite': 'None', 'Secure': True}" \
|
| 16 |
+
--ServerApp.disable_check_xsrf=True \
|
| 17 |
+
--LabApp.news_url=None \
|
| 18 |
+
--LabApp.check_for_updates_class="jupyterlab.NeverCheckForUpdate" \
|
| 19 |
+
--notebook-dir=$NOTEBOOK_DIR
|
upload_to_hf.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Upload extended dataset to HuggingFace Datasets.
|
| 2 |
+
|
| 3 |
+
Uploads features_unified_extended.parquet (17,880 rows) to replace existing
|
| 4 |
+
24-month dataset (17,544 rows) on HuggingFace.
|
| 5 |
+
|
| 6 |
+
Dataset: evgueni-p/fbmc-features-24month
|
| 7 |
+
New date range: Oct 1, 2023 - Oct 14, 2025
|
| 8 |
+
|
| 9 |
+
Author: Claude
|
| 10 |
+
Date: 2025-11-14
|
| 11 |
+
"""
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
import os
|
| 14 |
+
from datasets import Dataset
|
| 15 |
+
import polars as pl
|
| 16 |
+
from huggingface_hub import login
|
| 17 |
+
import sys
|
| 18 |
+
|
| 19 |
+
# Load environment variables from .env file
|
| 20 |
+
from dotenv import load_dotenv
|
| 21 |
+
load_dotenv()
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def upload_extended_dataset():
|
| 25 |
+
"""Upload extended dataset to HuggingFace."""
|
| 26 |
+
print("\n" + "=" * 80)
|
| 27 |
+
print("UPLOADING EXTENDED DATASET TO HUGGINGFACE")
|
| 28 |
+
print("=" * 80)
|
| 29 |
+
|
| 30 |
+
# Load HF token
|
| 31 |
+
hf_token = os.getenv("HF_TOKEN")
|
| 32 |
+
if not hf_token:
|
| 33 |
+
raise ValueError("HF_TOKEN environment variable not set - check .env file")
|
| 34 |
+
|
| 35 |
+
# Login to HuggingFace
|
| 36 |
+
print("\nAuthenticating with HuggingFace...")
|
| 37 |
+
login(token=hf_token)
|
| 38 |
+
print(" [OK] Logged in")
|
| 39 |
+
|
| 40 |
+
# Load extended dataset
|
| 41 |
+
extended_file = Path("data/processed/features_unified_extended.parquet")
|
| 42 |
+
if not extended_file.exists():
|
| 43 |
+
raise FileNotFoundError(f"Extended dataset not found: {extended_file}")
|
| 44 |
+
|
| 45 |
+
print(f"\nLoading extended dataset...")
|
| 46 |
+
df = pl.read_parquet(extended_file)
|
| 47 |
+
print(f" Shape: {df.shape}")
|
| 48 |
+
print(f" Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
|
| 49 |
+
print(f" File size: {extended_file.stat().st_size / 1024 / 1024:.1f} MB")
|
| 50 |
+
|
| 51 |
+
# Convert to HuggingFace Dataset
|
| 52 |
+
print("\nConverting to HuggingFace Dataset format...")
|
| 53 |
+
hf_dataset = Dataset.from_polars(df)
|
| 54 |
+
print(f" [OK] Converted: {hf_dataset}")
|
| 55 |
+
|
| 56 |
+
# Upload to HuggingFace
|
| 57 |
+
dataset_name = "evgueni-p/fbmc-features-24month"
|
| 58 |
+
print(f"\nUploading to HuggingFace: {dataset_name}")
|
| 59 |
+
print(" This may take a few minutes...")
|
| 60 |
+
|
| 61 |
+
hf_dataset.push_to_hub(
|
| 62 |
+
dataset_name,
|
| 63 |
+
token=hf_token,
|
| 64 |
+
private=False # Make public
|
| 65 |
+
)
|
| 66 |
+
|
| 67 |
+
print(f"\n[OK] Dataset uploaded successfully!")
|
| 68 |
+
print(f" URL: https://huggingface.co/datasets/{dataset_name}")
|
| 69 |
+
print(f" Rows: {len(hf_dataset)}")
|
| 70 |
+
print(f" Columns: {len(hf_dataset.column_names)}")
|
| 71 |
+
|
| 72 |
+
return dataset_name
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def verify_upload(dataset_name: str):
|
| 76 |
+
"""Verify uploaded dataset by downloading and checking shape."""
|
| 77 |
+
print("\n" + "=" * 80)
|
| 78 |
+
print("VERIFYING UPLOAD")
|
| 79 |
+
print("=" * 80)
|
| 80 |
+
|
| 81 |
+
from datasets import load_dataset
|
| 82 |
+
|
| 83 |
+
hf_token = os.getenv("HF_TOKEN")
|
| 84 |
+
|
| 85 |
+
print(f"\nDownloading dataset from HuggingFace...")
|
| 86 |
+
print(f" Dataset: {dataset_name}")
|
| 87 |
+
|
| 88 |
+
downloaded = load_dataset(
|
| 89 |
+
dataset_name,
|
| 90 |
+
split="train",
|
| 91 |
+
token=hf_token
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
print(f"\n[OK] Downloaded successfully!")
|
| 95 |
+
print(f" Shape: {downloaded.shape}")
|
| 96 |
+
|
| 97 |
+
# Convert to Polars for inspection
|
| 98 |
+
df_check = pl.from_arrow(downloaded.data.table)
|
| 99 |
+
print(f" Date range: {df_check['timestamp'].min()} to {df_check['timestamp'].max()}")
|
| 100 |
+
|
| 101 |
+
# Validate
|
| 102 |
+
expected_rows = 17880
|
| 103 |
+
expected_cols = 2553
|
| 104 |
+
|
| 105 |
+
issues = []
|
| 106 |
+
if downloaded.shape[0] != expected_rows:
|
| 107 |
+
issues.append(f"Row mismatch: {downloaded.shape[0]} != {expected_rows}")
|
| 108 |
+
if downloaded.shape[1] != expected_cols:
|
| 109 |
+
issues.append(f"Column mismatch: {downloaded.shape[1]} != {expected_cols}")
|
| 110 |
+
|
| 111 |
+
if issues:
|
| 112 |
+
print("\n[WARNING] Validation issues:")
|
| 113 |
+
for issue in issues:
|
| 114 |
+
print(f" - {issue}")
|
| 115 |
+
return False
|
| 116 |
+
else:
|
| 117 |
+
print("\n[OK] Upload verified successfully!")
|
| 118 |
+
return True
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def main():
|
| 122 |
+
"""Main execution: Upload and verify extended dataset."""
|
| 123 |
+
print("\n" + "=" * 80)
|
| 124 |
+
print("HUGGINGFACE DATASET UPLOAD")
|
| 125 |
+
print("Uploading extended dataset (17,880 rows)")
|
| 126 |
+
print("=" * 80)
|
| 127 |
+
|
| 128 |
+
try:
|
| 129 |
+
# Upload dataset
|
| 130 |
+
dataset_name = upload_extended_dataset()
|
| 131 |
+
|
| 132 |
+
# Verify upload
|
| 133 |
+
verification_passed = verify_upload(dataset_name)
|
| 134 |
+
|
| 135 |
+
if verification_passed:
|
| 136 |
+
print("\n" + "=" * 80)
|
| 137 |
+
print("SUCCESS: Dataset uploaded and verified!")
|
| 138 |
+
print("=" * 80)
|
| 139 |
+
print(f"\nDataset URL: https://huggingface.co/datasets/{dataset_name}")
|
| 140 |
+
print("\nNext steps:")
|
| 141 |
+
print(" 1. Create inference notebooks (.ipynb)")
|
| 142 |
+
print(" 2. Create HF Space README.md")
|
| 143 |
+
print(" 3. Deploy notebooks to HF Space")
|
| 144 |
+
print(" 4. Test inference on GPU")
|
| 145 |
+
else:
|
| 146 |
+
print("\n[ERROR] Verification failed")
|
| 147 |
+
sys.exit(1)
|
| 148 |
+
|
| 149 |
+
except Exception as e:
|
| 150 |
+
error_msg = str(e).encode('ascii', 'replace').decode('ascii')
|
| 151 |
+
print(f"\n[ERROR] Upload failed: {error_msg}")
|
| 152 |
+
import traceback
|
| 153 |
+
traceback.print_exc()
|
| 154 |
+
sys.exit(1)
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
if __name__ == "__main__":
|
| 158 |
+
main()
|