"""Upload extended dataset to HuggingFace Datasets.

Uploads features_unified_extended.parquet (17,880 rows) to replace the existing
24-month dataset (17,544 rows) on HuggingFace.

Dataset: evgueni-p/fbmc-features-24month
New date range: Oct 1, 2023 - Oct 14, 2025

Author: Claude
Date: 2025-11-14
"""
import os
import sys
from pathlib import Path

import polars as pl
from datasets import Dataset
from huggingface_hub import login

# Load environment variables from .env file
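# (the .env file is assumed to contain a line like HF_TOKEN=<hub token>, which
#  the upload and verification steps below read via os.getenv)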
from dotenv import load_dotenv
load_dotenv()


def upload_extended_dataset():
    """Upload extended dataset to HuggingFace."""
    print("\n" + "=" * 80)
    print("UPLOADING EXTENDED DATASET TO HUGGINGFACE")
    print("=" * 80)

    # Load HF token
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        raise ValueError("HF_TOKEN environment variable not set - check .env file")

    # Login to HuggingFace
    print("\nAuthenticating with HuggingFace...")
    login(token=hf_token)
    print("  [OK] Logged in")

    # Load unified dataset with volatility features
    unified_file = Path("data/processed/features_unified_24month.parquet")
    if not unified_file.exists():
        raise FileNotFoundError(f"Unified dataset not found: {unified_file}")

    print(f"\nLoading unified dataset with volatility features...")
    df = pl.read_parquet(unified_file)
    print(f"  Shape: {df.shape}")
    print(f"  Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
    print(f"  File size: {unified_file.stat().st_size / 1024 / 1024:.1f} MB")

    # Convert to HuggingFace Dataset
    print("\nConverting to HuggingFace Dataset format...")
    hf_dataset = Dataset.from_polars(df)
    print(f"  [OK] Converted: {hf_dataset}")

    # Upload to HuggingFace
    dataset_name = "evgueni-p/fbmc-features-24month"
    print(f"\nUploading to HuggingFace: {dataset_name}")
    print("  This may take a few minutes...")

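    # push_to_hub writes the dataset as Parquet shard(s) to the Hub repo (creating it
    # if needed); because the repo already holds the 24-month data, this commit
    # effectively replaces the previous upload.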
    hf_dataset.push_to_hub(
        dataset_name,
        token=hf_token,
        private=False  # Make public
    )

    print(f"\n[OK] Dataset uploaded successfully!")
    print(f"  URL: https://huggingface.co/datasets/{dataset_name}")
    print(f"  Rows: {len(hf_dataset)}")
    print(f"  Columns: {len(hf_dataset.column_names)}")

    return dataset_name


def verify_upload(dataset_name: str):
    """Verify uploaded dataset by downloading and checking shape."""
    print("\n" + "=" * 80)
    print("VERIFYING UPLOAD")
    print("=" * 80)

    from datasets import load_dataset

    hf_token = os.getenv("HF_TOKEN")

    print(f"\nDownloading dataset from HuggingFace...")
    print(f"  Dataset: {dataset_name}")

    downloaded = load_dataset(
        dataset_name,
        split="train",
        token=hf_token
    )

    print(f"\n[OK] Downloaded successfully!")
    print(f"  Shape: {downloaded.shape}")

    # Convert to Polars for inspection
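    # downloaded.data wraps the underlying pyarrow Table; .table exposes it so
    # pl.from_arrow can consume it directly (no pandas round-trip).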
    df_check = pl.from_arrow(downloaded.data.table)
    print(f"  Date range: {df_check['timestamp'].min()} to {df_check['timestamp'].max()}")

    # Validate
    expected_rows = 17880
    expected_cols = 2553

    issues = []
    if downloaded.shape[0] != expected_rows:
        issues.append(f"Row mismatch: {downloaded.shape[0]} != {expected_rows}")
    if downloaded.shape[1] != expected_cols:
        issues.append(f"Column mismatch: {downloaded.shape[1]} != {expected_cols}")

    if issues:
        print("\n[WARNING] Validation issues:")
        for issue in issues:
            print(f"  - {issue}")
        return False
    else:
        print("\n[OK] Upload verified successfully!")
        return True


def main():
    """Main execution: Upload and verify extended dataset."""
    print("\n" + "=" * 80)
    print("HUGGINGFACE DATASET UPLOAD")
    print("Uploading extended dataset (17,880 rows)")
    print("=" * 80)

    try:
        # Upload dataset
        dataset_name = upload_extended_dataset()

        # Verify upload
        verification_passed = verify_upload(dataset_name)

        if verification_passed:
            print("\n" + "=" * 80)
            print("SUCCESS: Dataset uploaded and verified!")
            print("=" * 80)
            print(f"\nDataset URL: https://huggingface.co/datasets/{dataset_name}")
            print("\nNext steps:")
            print("  1. Create inference notebooks (.ipynb)")
            print("  2. Create HF Space README.md")
            print("  3. Deploy notebooks to HF Space")
            print("  4. Test inference on GPU")
        else:
            print("\n[ERROR] Verification failed")
            sys.exit(1)

    except Exception as e:
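        # Replace non-ASCII characters so the message prints safely on consoles
        # without UTF-8 support (e.g. legacy Windows code pages).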
        error_msg = str(e).encode('ascii', 'replace').decode('ascii')
        print(f"\n[ERROR] Upload failed: {error_msg}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()