"""Extend 24-month dataset with October 2025 features. Merges October feature files and appends to existing 24-month unified dataset. Creates extended dataset: 17,544 + 336 = 17,880 rows (Oct 2023 - Oct 14, 2025) Author: Claude Date: 2025-11-14 """ from pathlib import Path import polars as pl import sys def merge_october_features() -> pl.DataFrame: """Merge October feature files into single dataframe.""" print("\n" + "=" * 80) print("MERGING OCTOBER FEATURES") print("=" * 80) processed_dir = Path("data/processed") # Load October feature files weather_file = processed_dir / "features_weather_october.parquet" entsoe_file = processed_dir / "features_entsoe_october.parquet" jao_file = processed_dir / "features_jao_october.parquet" print("\nLoading October features...") weather_df = pl.read_parquet(weather_file) # Cast timestamp to nanosecond precision for consistency weather_df = weather_df.with_columns([ pl.col('timestamp').dt.cast_time_unit('ns').alias('timestamp') ]) print(f" Weather: {weather_df.shape}") entsoe_df = pl.read_parquet(entsoe_file) # Ensure timestamp is nanosecond precision entsoe_df = entsoe_df.with_columns([ pl.col('timestamp').dt.cast_time_unit('ns').alias('timestamp') ]) print(f" ENTSO-E: {entsoe_df.shape}") # Check if JAO features exist if jao_file.exists(): jao_df = pl.read_parquet(jao_file) print(f" JAO: {jao_df.shape}") else: jao_df = None print(f" JAO: Not available (will use zeros)") # Merge features print("\nMerging features...") unified = weather_df.join(entsoe_df, on='timestamp', how='left', coalesce=True) print(f" Weather + ENTSO-E: {unified.shape}") if jao_df is not None: unified = unified.join(jao_df, on='timestamp', how='left', coalesce=True) print(f" + JAO: {unified.shape}") print(f"\n[OK] October unified features: {unified.shape}") return unified def extend_dataset(october_features: pl.DataFrame) -> pl.DataFrame: """Append October features to 24-month dataset.""" print("\n" + "=" * 80) print("EXTENDING 24-MONTH DATASET") print("=" * 80) processed_dir = Path("data/processed") base_file = processed_dir / "features_unified_24month.parquet" print("\nLoading 24-month dataset...") base_df = pl.read_parquet(base_file) print(f" Shape: {base_df.shape}") print(f" Date range: {base_df['timestamp'].min()} to {base_df['timestamp'].max()}") # Match October timestamp precision to base dataset base_timestamp_dtype = base_df['timestamp'].dtype october_features = october_features.with_columns([ pl.col('timestamp').cast(base_timestamp_dtype).alias('timestamp') ]) print(f" Matched timestamp precision: {base_timestamp_dtype}") # Get column lists base_cols = set(base_df.columns) october_cols = set(october_features.columns) # Find missing columns in October (JAO features likely missing) missing_in_october = base_cols - october_cols if missing_in_october: print(f"\n Adding {len(missing_in_october)} missing columns to October (fill with nulls)") for col in missing_in_october: if col != 'timestamp': october_features = october_features.with_columns([ pl.lit(None).cast(base_df[col].dtype).alias(col) ]) # Ensure ALL column dtypes match exactly (not just missing ones) print("\n Matching column dtypes...") dtype_fixes = [] for col in base_df.columns: if col in october_features.columns: base_dtype = base_df[col].dtype october_dtype = october_features[col].dtype if base_dtype != october_dtype: dtype_fixes.append(col) october_features = october_features.with_columns([ pl.col(col).cast(base_dtype).alias(col) ]) if dtype_fixes: print(f" Fixed {len(dtype_fixes)} dtype mismatches") # Ensure 
column order matches october_features = october_features.select(base_df.columns) print("\nAppending October features...") extended_df = pl.concat([base_df, october_features], how='vertical') print(f" Extended shape: {extended_df.shape}") print(f" Date range: {extended_df['timestamp'].min()} to {extended_df['timestamp'].max()}") print(f" Rows added: {len(extended_df) - len(base_df)}") return extended_df def validate_extended_dataset(extended_df: pl.DataFrame): """Validate extended dataset.""" print("\n" + "=" * 80) print("VALIDATING EXTENDED DATASET") print("=" * 80) expected_rows = 17880 # 24 months + 14 days expected_cols = 2553 # From metadata print(f"\nShape validation:") print(f" Rows: {len(extended_df)} (expected {expected_rows})") print(f" Columns: {len(extended_df.columns)} (expected {expected_cols})") # Check for duplicates duplicates = extended_df.filter(pl.col('timestamp').is_duplicated()) print(f"\nDuplicate timestamps: {len(duplicates)}") # Check for gaps (skip - Duration comparison not supported in this Polars version) # Just verify continuous hourly data by checking row count matches expected expected_hours = (extended_df['timestamp'].max() - extended_df['timestamp'].min()).total_seconds() / 3600 + 1 actual_hours = len(extended_df) print(f"Time continuity: {actual_hours} hours (expected ~{int(expected_hours)})") # Null counts total_nulls = extended_df.null_count().sum_horizontal().to_list()[0] print(f"\nTotal null values: {total_nulls}") # Date range date_start = extended_df['timestamp'].min() date_end = extended_df['timestamp'].max() print(f"\nDate range:") print(f" Start: {date_start}") print(f" End: {date_end}") # Validation result issues = [] if len(extended_df) != expected_rows: issues.append(f"Row count mismatch: {len(extended_df)} != {expected_rows}") if len(duplicates) > 0: issues.append(f"Found {len(duplicates)} duplicate timestamps") if issues: print("\n[WARNING] Validation issues:") for issue in issues: print(f" - {issue}") return False else: print("\n[OK] All validation checks passed!") return True def main(): """Main execution: Merge October features and extend dataset.""" print("\n" + "=" * 80) print("DATASET EXTENSION: October 2025") print("Extending 24-month dataset (17,544 -> 17,880 rows)") print("=" * 80) try: # Merge October features october_features = merge_october_features() # Extend dataset extended_df = extend_dataset(october_features) # Validate validation_passed = validate_extended_dataset(extended_df) if validation_passed: # Save extended dataset output_file = Path("data/processed/features_unified_extended.parquet") extended_df.write_parquet(output_file) print("\n" + "=" * 80) print("SUCCESS: Dataset extension complete!") print("=" * 80) print(f"\nExtended dataset saved:") print(f" File: {output_file}") print(f" Shape: {extended_df.shape}") print(f" Size: {output_file.stat().st_size / 1024 / 1024:.1f} MB") print("\nNext steps:") print(" 1. Upload to HuggingFace Datasets") print(" 2. Create inference notebooks") print(" 3. Deploy to HF Space") else: print("\n[ERROR] Validation failed - please review issues") sys.exit(1) except Exception as e: error_msg = str(e).encode('ascii', 'replace').decode('ascii') print(f"\n[ERROR] Dataset extension failed: {error_msg}") import traceback traceback.print_exc() sys.exit(1) if __name__ == "__main__": main()
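

# A minimal post-run sanity check (a sketch only, not part of the pipeline above;
# it assumes the script has already been run from the repository root so the
# output parquet below exists):
#
#   import polars as pl
#
#   df = pl.read_parquet("data/processed/features_unified_extended.parquet")
#   assert len(df) == 17_880                      # 17,544 base rows + 336 October rows
#   assert df["timestamp"].is_duplicated().sum() == 0
#   print(df["timestamp"].min(), df["timestamp"].max())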