# fbmc-chronos2/extend_dataset.py
"""Extend 24-month dataset with October 2025 features.
Merges the October feature files and appends them to the existing 24-month unified dataset.
Creates the extended dataset: 17,544 + 336 = 17,880 rows (Oct 2023 - Oct 14, 2025).
Author: Claude
Date: 2025-11-14
"""
import sys
from pathlib import Path

import polars as pl
def merge_october_features() -> pl.DataFrame:
"""Merge October feature files into single dataframe."""
print("\n" + "=" * 80)
print("MERGING OCTOBER FEATURES")
print("=" * 80)
processed_dir = Path("data/processed")
# Load October feature files
weather_file = processed_dir / "features_weather_october.parquet"
entsoe_file = processed_dir / "features_entsoe_october.parquet"
jao_file = processed_dir / "features_jao_october.parquet"
print("\nLoading October features...")
weather_df = pl.read_parquet(weather_file)
# Cast timestamp to nanosecond precision for consistency
weather_df = weather_df.with_columns([
pl.col('timestamp').dt.cast_time_unit('ns').alias('timestamp')
])
print(f" Weather: {weather_df.shape}")
entsoe_df = pl.read_parquet(entsoe_file)
# Ensure timestamp is nanosecond precision
entsoe_df = entsoe_df.with_columns([
pl.col('timestamp').dt.cast_time_unit('ns').alias('timestamp')
])
print(f" ENTSO-E: {entsoe_df.shape}")
# Check if JAO features exist
if jao_file.exists():
jao_df = pl.read_parquet(jao_file)
print(f" JAO: {jao_df.shape}")
else:
jao_df = None
print(f" JAO: Not available (will use zeros)")
# Merge features
print("\nMerging features...")
unified = weather_df.join(entsoe_df, on='timestamp', how='left', coalesce=True)
print(f" Weather + ENTSO-E: {unified.shape}")
if jao_df is not None:
unified = unified.join(jao_df, on='timestamp', how='left', coalesce=True)
print(f" + JAO: {unified.shape}")
print(f"\n[OK] October unified features: {unified.shape}")
return unified
def extend_dataset(october_features: pl.DataFrame) -> pl.DataFrame:
"""Append October features to 24-month dataset."""
print("\n" + "=" * 80)
print("EXTENDING 24-MONTH DATASET")
print("=" * 80)
processed_dir = Path("data/processed")
base_file = processed_dir / "features_unified_24month.parquet"
print("\nLoading 24-month dataset...")
base_df = pl.read_parquet(base_file)
print(f" Shape: {base_df.shape}")
print(f" Date range: {base_df['timestamp'].min()} to {base_df['timestamp'].max()}")
# Match October timestamp precision to base dataset
base_timestamp_dtype = base_df['timestamp'].dtype
october_features = october_features.with_columns([
pl.col('timestamp').cast(base_timestamp_dtype).alias('timestamp')
])
print(f" Matched timestamp precision: {base_timestamp_dtype}")
# Get column lists
base_cols = set(base_df.columns)
october_cols = set(october_features.columns)
# Find missing columns in October (JAO features likely missing)
missing_in_october = base_cols - october_cols
if missing_in_october:
print(f"\n Adding {len(missing_in_october)} missing columns to October (fill with nulls)")
for col in missing_in_october:
if col != 'timestamp':
october_features = october_features.with_columns([
pl.lit(None).cast(base_df[col].dtype).alias(col)
])
# Ensure ALL column dtypes match exactly (not just missing ones)
print("\n Matching column dtypes...")
dtype_fixes = []
for col in base_df.columns:
if col in october_features.columns:
base_dtype = base_df[col].dtype
october_dtype = october_features[col].dtype
if base_dtype != october_dtype:
dtype_fixes.append(col)
october_features = october_features.with_columns([
pl.col(col).cast(base_dtype).alias(col)
])
if dtype_fixes:
print(f" Fixed {len(dtype_fixes)} dtype mismatches")
# Ensure column order matches
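    # select() with the base column list drops any October-only columns and
    # reorders the remainder to match the base schema exactly.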
october_features = october_features.select(base_df.columns)
print("\nAppending October features...")
extended_df = pl.concat([base_df, october_features], how='vertical')
print(f" Extended shape: {extended_df.shape}")
print(f" Date range: {extended_df['timestamp'].min()} to {extended_df['timestamp'].max()}")
print(f" Rows added: {len(extended_df) - len(base_df)}")
return extended_df
def validate_extended_dataset(extended_df: pl.DataFrame) -> bool:
"""Validate extended dataset."""
print("\n" + "=" * 80)
print("VALIDATING EXTENDED DATASET")
print("=" * 80)
expected_rows = 17880 # 24 months + 14 days
expected_cols = 2553 # From metadata
print(f"\nShape validation:")
print(f" Rows: {len(extended_df)} (expected {expected_rows})")
print(f" Columns: {len(extended_df.columns)} (expected {expected_cols})")
# Check for duplicates
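    # is_duplicated() flags every row whose timestamp occurs more than once,
    # so this counts all offending rows, not just the distinct repeated values.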
duplicates = extended_df.filter(pl.col('timestamp').is_duplicated())
print(f"\nDuplicate timestamps: {len(duplicates)}")
    # Gap check via timestamp diffs is skipped (Duration comparisons are not
    # supported in this Polars version); instead, verify hourly continuity by
    # comparing the row count against the number of hours spanned.
expected_hours = (extended_df['timestamp'].max() - extended_df['timestamp'].min()).total_seconds() / 3600 + 1
actual_hours = len(extended_df)
print(f"Time continuity: {actual_hours} hours (expected ~{int(expected_hours)})")
# Null counts
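    # null_count() returns per-column null totals as a single-row frame;
    # sum_horizontal() collapses that row into one grand total.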
total_nulls = extended_df.null_count().sum_horizontal().to_list()[0]
print(f"\nTotal null values: {total_nulls}")
# Date range
date_start = extended_df['timestamp'].min()
date_end = extended_df['timestamp'].max()
print(f"\nDate range:")
print(f" Start: {date_start}")
print(f" End: {date_end}")
# Validation result
issues = []
    if len(extended_df) != expected_rows:
        issues.append(f"Row count mismatch: {len(extended_df)} != {expected_rows}")
    if len(extended_df.columns) != expected_cols:
        issues.append(f"Column count mismatch: {len(extended_df.columns)} != {expected_cols}")
    if len(duplicates) > 0:
        issues.append(f"Found {len(duplicates)} duplicate timestamps")
if issues:
print("\n[WARNING] Validation issues:")
for issue in issues:
print(f" - {issue}")
return False
else:
print("\n[OK] All validation checks passed!")
return True
def main():
"""Main execution: Merge October features and extend dataset."""
print("\n" + "=" * 80)
print("DATASET EXTENSION: October 2025")
print("Extending 24-month dataset (17,544 -> 17,880 rows)")
print("=" * 80)
try:
# Merge October features
october_features = merge_october_features()
# Extend dataset
extended_df = extend_dataset(october_features)
# Validate
validation_passed = validate_extended_dataset(extended_df)
if validation_passed:
# Save extended dataset
output_file = Path("data/processed/features_unified_extended.parquet")
extended_df.write_parquet(output_file)
print("\n" + "=" * 80)
print("SUCCESS: Dataset extension complete!")
print("=" * 80)
print(f"\nExtended dataset saved:")
print(f" File: {output_file}")
print(f" Shape: {extended_df.shape}")
print(f" Size: {output_file.stat().st_size / 1024 / 1024:.1f} MB")
print("\nNext steps:")
print(" 1. Upload to HuggingFace Datasets")
print(" 2. Create inference notebooks")
print(" 3. Deploy to HF Space")
else:
print("\n[ERROR] Validation failed - please review issues")
sys.exit(1)
except Exception as e:
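        # Replace non-ASCII characters so the message prints safely on consoles
        # with limited encodings (e.g. Windows code pages).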
error_msg = str(e).encode('ascii', 'replace').decode('ascii')
print(f"\n[ERROR] Dataset extension failed: {error_msg}")
import traceback
traceback.print_exc()
sys.exit(1)
if __name__ == "__main__":
main()