| """Upload extended dataset to HuggingFace Datasets. | |
| Uploads features_unified_extended.parquet (17,880 rows) to replace existing | |
| 24-month dataset (17,544 rows) on HuggingFace. | |
| Dataset: evgueni-p/fbmc-features-24month | |
| New date range: Oct 1, 2023 - Oct 14, 2025 | |
| Author: Claude | |
| Date: 2025-11-14 | |
| """ | |
import os
import sys
from pathlib import Path

import polars as pl
from datasets import Dataset
from dotenv import load_dotenv
from huggingface_hub import login

# Load environment variables (including HF_TOKEN) from the .env file
load_dotenv()


def upload_extended_dataset():
    """Upload extended dataset to HuggingFace."""
    print("\n" + "=" * 80)
    print("UPLOADING EXTENDED DATASET TO HUGGINGFACE")
    print("=" * 80)

    # Load HF token
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        raise ValueError("HF_TOKEN environment variable not set - check .env file")

    # Login to HuggingFace
    print("\nAuthenticating with HuggingFace...")
    login(token=hf_token)
    print(" [OK] Logged in")

    # Load unified dataset with volatility features
    unified_file = Path("data/processed/features_unified_24month.parquet")
    if not unified_file.exists():
        raise FileNotFoundError(f"Unified dataset not found: {unified_file}")

    print("\nLoading unified dataset with volatility features...")
    df = pl.read_parquet(unified_file)
    print(f" Shape: {df.shape}")
    print(f" Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
    print(f" File size: {unified_file.stat().st_size / 1024 / 1024:.1f} MB")

    # Convert to HuggingFace Dataset
    print("\nConverting to HuggingFace Dataset format...")
    hf_dataset = Dataset.from_polars(df)
    print(f" [OK] Converted: {hf_dataset}")

    # Upload to HuggingFace
    dataset_name = "evgueni-p/fbmc-features-24month"
    print(f"\nUploading to HuggingFace: {dataset_name}")
    print(" This may take a few minutes...")
    hf_dataset.push_to_hub(
        dataset_name,
        token=hf_token,
        private=False,  # Make public
    )

    print("\n[OK] Dataset uploaded successfully!")
    print(f" URL: https://huggingface.co/datasets/{dataset_name}")
    print(f" Rows: {len(hf_dataset)}")
    print(f" Columns: {len(hf_dataset.column_names)}")

    return dataset_name


def verify_upload(dataset_name: str):
    """Verify uploaded dataset by downloading and checking shape."""
    print("\n" + "=" * 80)
    print("VERIFYING UPLOAD")
    print("=" * 80)

    from datasets import load_dataset

    hf_token = os.getenv("HF_TOKEN")

    print("\nDownloading dataset from HuggingFace...")
    print(f" Dataset: {dataset_name}")
    downloaded = load_dataset(
        dataset_name,
        split="train",
        token=hf_token,
    )

    print("\n[OK] Downloaded successfully!")
    print(f" Shape: {downloaded.shape}")

    # Convert to Polars for inspection
    df_check = pl.from_arrow(downloaded.data.table)
    print(f" Date range: {df_check['timestamp'].min()} to {df_check['timestamp'].max()}")

    # Validate
    expected_rows = 17880
    expected_cols = 2553

    issues = []
    if downloaded.shape[0] != expected_rows:
        issues.append(f"Row mismatch: {downloaded.shape[0]} != {expected_rows}")
    if downloaded.shape[1] != expected_cols:
        issues.append(f"Column mismatch: {downloaded.shape[1]} != {expected_cols}")

    if issues:
        print("\n[WARNING] Validation issues:")
        for issue in issues:
            print(f" - {issue}")
        return False
    else:
        print("\n[OK] Upload verified successfully!")
        return True


def main():
    """Main execution: Upload and verify extended dataset."""
    print("\n" + "=" * 80)
    print("HUGGINGFACE DATASET UPLOAD")
    print("Uploading extended dataset (17,880 rows)")
    print("=" * 80)

    try:
        # Upload dataset
        dataset_name = upload_extended_dataset()

        # Verify upload
        verification_passed = verify_upload(dataset_name)

        if verification_passed:
            print("\n" + "=" * 80)
            print("SUCCESS: Dataset uploaded and verified!")
            print("=" * 80)
            print(f"\nDataset URL: https://huggingface.co/datasets/{dataset_name}")
            print("\nNext steps:")
            print(" 1. Create inference notebooks (.ipynb)")
            print(" 2. Create HF Space README.md")
            print(" 3. Deploy notebooks to HF Space")
            print(" 4. Test inference on GPU")
        else:
            print("\n[ERROR] Verification failed")
            sys.exit(1)
    except Exception as e:
        # Strip non-ASCII characters so the error prints cleanly on consoles
        # without UTF-8 support (the replacement keeps the message readable)
        error_msg = str(e).encode('ascii', 'replace').decode('ascii')
        print(f"\n[ERROR] Upload failed: {error_msg}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()