"""Upload extended dataset to HuggingFace Datasets.
Uploads features_unified_extended.parquet (17,880 rows) to replace existing
24-month dataset (17,544 rows) on HuggingFace.
Dataset: evgueni-p/fbmc-features-24month
New date range: Oct 1, 2023 - Oct 14, 2025
Author: Claude
Date: 2025-11-14
"""
import os
import sys
import traceback
from pathlib import Path

import polars as pl
from datasets import Dataset, load_dataset
from dotenv import load_dotenv
from huggingface_hub import login

# Load environment variables (HF_TOKEN) from the .env file
load_dotenv()
def upload_extended_dataset():
    """Upload extended dataset to HuggingFace."""
    print("\n" + "=" * 80)
    print("UPLOADING EXTENDED DATASET TO HUGGINGFACE")
    print("=" * 80)

    # Load HF token
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        raise ValueError("HF_TOKEN environment variable not set - check .env file")

    # Login to HuggingFace
    print("\nAuthenticating with HuggingFace...")
    login(token=hf_token)
    print(" [OK] Logged in")
    # Load unified dataset with volatility features
    unified_file = Path("data/processed/features_unified_24month.parquet")
    if not unified_file.exists():
        raise FileNotFoundError(f"Unified dataset not found: {unified_file}")

    print("\nLoading unified dataset with volatility features...")
    df = pl.read_parquet(unified_file)
    print(f" Shape: {df.shape}")
    print(f" Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
    print(f" File size: {unified_file.stat().st_size / 1024 / 1024:.1f} MB")

    # Convert to HuggingFace Dataset
    print("\nConverting to HuggingFace Dataset format...")
    hf_dataset = Dataset.from_polars(df)
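    # The conversion above goes through Arrow, so column dtypes (including
    # the timestamp column) should survive the round trip largely uncopied.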
print(f" [OK] Converted: {hf_dataset}")
# Upload to HuggingFace
dataset_name = "evgueni-p/fbmc-features-24month"
print(f"\nUploading to HuggingFace: {dataset_name}")
print(" This may take a few minutes...")
hf_dataset.push_to_hub(
dataset_name,
token=hf_token,
private=False # Make public
)
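    # The call above creates the repo if it does not exist yet and writes the
    # data as Parquet under the default "train" split, which is the split
    # verify_upload() reads back below.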
print(f"\n[OK] Dataset uploaded successfully!")
print(f" URL: https://huggingface.co/datasets/{dataset_name}")
print(f" Rows: {len(hf_dataset)}")
print(f" Columns: {len(hf_dataset.column_names)}")
return dataset_name
def verify_upload(dataset_name: str):
    """Verify uploaded dataset by downloading and checking shape."""
    print("\n" + "=" * 80)
    print("VERIFYING UPLOAD")
    print("=" * 80)

    hf_token = os.getenv("HF_TOKEN")

    print("\nDownloading dataset from HuggingFace...")
    print(f" Dataset: {dataset_name}")
    downloaded = load_dataset(
        dataset_name,
        split="train",
        token=hf_token,
    )
    print("\n[OK] Downloaded successfully!")
    print(f" Shape: {downloaded.shape}")

    # Convert to Polars for inspection (wraps the underlying Arrow table)
    df_check = pl.from_arrow(downloaded.data.table)
    print(f" Date range: {df_check['timestamp'].min()} to {df_check['timestamp'].max()}")

    # Validate
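    # 17,880 rows matches the stated Oct 1, 2023 - Oct 14, 2025 range
    # (745 days x 24, assuming hourly resolution); 2,553 columns is the
    # unified feature set.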
    expected_rows = 17880
    expected_cols = 2553

    issues = []
    if downloaded.shape[0] != expected_rows:
        issues.append(f"Row mismatch: {downloaded.shape[0]} != {expected_rows}")
    if downloaded.shape[1] != expected_cols:
        issues.append(f"Column mismatch: {downloaded.shape[1]} != {expected_cols}")

    if issues:
        print("\n[WARNING] Validation issues:")
        for issue in issues:
            print(f" - {issue}")
        return False
    else:
        print("\n[OK] Upload verified successfully!")
        return True
def main():
    """Main execution: Upload and verify extended dataset."""
    print("\n" + "=" * 80)
    print("HUGGINGFACE DATASET UPLOAD")
    print("Uploading extended dataset (17,880 rows)")
    print("=" * 80)

    try:
        # Upload dataset
        dataset_name = upload_extended_dataset()

        # Verify upload
        verification_passed = verify_upload(dataset_name)

        if verification_passed:
            print("\n" + "=" * 80)
            print("SUCCESS: Dataset uploaded and verified!")
            print("=" * 80)
            print(f"\nDataset URL: https://huggingface.co/datasets/{dataset_name}")
            print("\nNext steps:")
            print(" 1. Create inference notebooks (.ipynb)")
            print(" 2. Create HF Space README.md")
            print(" 3. Deploy notebooks to HF Space")
            print(" 4. Test inference on GPU")
        else:
            print("\n[ERROR] Verification failed")
            sys.exit(1)
    except Exception as e:
        # Replace non-ASCII characters so the message prints safely on
        # consoles with limited encodings (e.g., Windows code pages)
        error_msg = str(e).encode('ascii', 'replace').decode('ascii')
        print(f"\n[ERROR] Upload failed: {error_msg}")
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
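# Usage sketch (comments only, not executed): downstream code can pull the
# published dataset straight into Polars the same way verify_upload() does:
#
#   from datasets import load_dataset
#   import polars as pl
#
#   ds = load_dataset("evgueni-p/fbmc-features-24month", split="train")
#   df = pl.from_arrow(ds.data.table)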