# fbmc-chronos2 / upload_to_hf.py
# Author: Evgueni Poloukarov
# Commit a57b996 - feat: add 396 volatility features for zero-shot forecast improvement
"""Upload extended dataset to HuggingFace Datasets.
Uploads features_unified_24month.parquet (17,880 rows) to replace the existing
24-month dataset (17,544 rows) on HuggingFace.
Dataset: evgueni-p/fbmc-features-24month
New date range: Oct 1, 2023 - Oct 14, 2025
Author: Claude
Date: 2025-11-14
"""
from pathlib import Path
import os
from datasets import Dataset
import polars as pl
from huggingface_hub import login
import sys
# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv()
def upload_extended_dataset():
    """Push the extended feature parquet to the HuggingFace Hub.

    Reads the unified 24-month parquet from disk, converts it to a
    HuggingFace ``Dataset``, and pushes it to the public dataset repo.

    Returns:
        str: The HuggingFace dataset repo id that was uploaded to.

    Raises:
        ValueError: If HF_TOKEN is not set in the environment.
        FileNotFoundError: If the source parquet file is missing.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("UPLOADING EXTENDED DATASET TO HUGGINGFACE")
    print(banner)

    # Credentials are loaded from .env by load_dotenv() at import time.
    token = os.getenv("HF_TOKEN")
    if not token:
        raise ValueError("HF_TOKEN environment variable not set - check .env file")

    print("\nAuthenticating with HuggingFace...")
    login(token=token)
    print(" [OK] Logged in")

    # Source parquet produced by the feature-engineering pipeline.
    source = Path("data/processed/features_unified_24month.parquet")
    if not source.exists():
        raise FileNotFoundError(f"Unified dataset not found: {source}")

    print("\nLoading unified dataset with volatility features...")
    frame = pl.read_parquet(source)
    print(f" Shape: {frame.shape}")
    print(f" Date range: {frame['timestamp'].min()} to {frame['timestamp'].max()}")
    print(f" File size: {source.stat().st_size / 1024 / 1024:.1f} MB")

    print("\nConverting to HuggingFace Dataset format...")
    dataset = Dataset.from_polars(frame)
    print(f" [OK] Converted: {dataset}")

    repo_id = "evgueni-p/fbmc-features-24month"
    print(f"\nUploading to HuggingFace: {repo_id}")
    print(" This may take a few minutes...")
    # private=False keeps the dataset publicly accessible.
    dataset.push_to_hub(repo_id, token=token, private=False)

    print("\n[OK] Dataset uploaded successfully!")
    print(f" URL: https://huggingface.co/datasets/{repo_id}")
    print(f" Rows: {len(dataset)}")
    print(f" Columns: {len(dataset.column_names)}")
    return repo_id
def verify_upload(dataset_name: str, expected_rows: int = 17880, expected_cols: int = 2553) -> bool:
    """Verify an uploaded dataset by re-downloading it and checking its shape.

    Args:
        dataset_name: HuggingFace dataset repo id (e.g. "user/repo").
        expected_rows: Row count the downloaded dataset must match
            (default matches the extended 24-month dataset).
        expected_cols: Column count the downloaded dataset must match.

    Returns:
        bool: True if the downloaded shape matches expectations, else False.
    """
    print("\n" + "=" * 80)
    print("VERIFYING UPLOAD")
    print("=" * 80)

    from datasets import load_dataset
    hf_token = os.getenv("HF_TOKEN")

    print(f"\nDownloading dataset from HuggingFace...")
    print(f" Dataset: {dataset_name}")
    downloaded = load_dataset(
        dataset_name,
        split="train",
        token=hf_token
    )
    print(f"\n[OK] Downloaded successfully!")
    print(f" Shape: {downloaded.shape}")

    # Use the public Dataset.to_polars() API instead of reaching into the
    # private `.data.table` attribute, which may change between versions.
    df_check = downloaded.to_polars()
    print(f" Date range: {df_check['timestamp'].min()} to {df_check['timestamp'].max()}")

    # Collect all shape mismatches before reporting so both are shown at once.
    issues = []
    if downloaded.shape[0] != expected_rows:
        issues.append(f"Row mismatch: {downloaded.shape[0]} != {expected_rows}")
    if downloaded.shape[1] != expected_cols:
        issues.append(f"Column mismatch: {downloaded.shape[1]} != {expected_cols}")

    if issues:
        print("\n[WARNING] Validation issues:")
        for issue in issues:
            print(f" - {issue}")
        return False
    else:
        print("\n[OK] Upload verified successfully!")
        return True
def main():
    """Entry point: upload the extended dataset, then verify the upload.

    Exits with status 1 when verification fails or any exception is raised.
    """
    bar = "=" * 80
    print("\n" + bar)
    print("HUGGINGFACE DATASET UPLOAD")
    print("Uploading extended dataset (17,880 rows)")
    print(bar)
    try:
        repo_id = upload_extended_dataset()
        if verify_upload(repo_id):
            print("\n" + bar)
            print("SUCCESS: Dataset uploaded and verified!")
            print(bar)
            print(f"\nDataset URL: https://huggingface.co/datasets/{repo_id}")
            print("\nNext steps:")
            print(" 1. Create inference notebooks (.ipynb)")
            print(" 2. Create HF Space README.md")
            print(" 3. Deploy notebooks to HF Space")
            print(" 4. Test inference on GPU")
        else:
            print("\n[ERROR] Verification failed")
            sys.exit(1)
    except Exception as exc:
        # Replace non-ASCII characters so the message prints on limited
        # consoles (e.g. Windows code pages).
        safe_msg = str(exc).encode('ascii', 'replace').decode('ascii')
        print(f"\n[ERROR] Upload failed: {safe_msg}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()