| """Extend 24-month dataset with October 2025 features. | |
| Merges October feature files and appends to existing 24-month unified dataset. | |
| Creates extended dataset: 17,544 + 336 = 17,880 rows (Oct 2023 - Oct 14, 2025) | |
| Author: Claude | |
| Date: 2025-11-14 | |
| """ | |
| from pathlib import Path | |
| import polars as pl | |
| import sys | |
def merge_october_features() -> pl.DataFrame:
    """Merge October feature files into a single dataframe."""
    print("\n" + "=" * 80)
    print("MERGING OCTOBER FEATURES")
    print("=" * 80)

    processed_dir = Path("data/processed")

    # Load October feature files
    weather_file = processed_dir / "features_weather_october.parquet"
    entsoe_file = processed_dir / "features_entsoe_october.parquet"
    jao_file = processed_dir / "features_jao_october.parquet"

    print("\nLoading October features...")
    weather_df = pl.read_parquet(weather_file)
    # Cast timestamp to nanosecond precision for consistency
    weather_df = weather_df.with_columns([
        pl.col('timestamp').dt.cast_time_unit('ns').alias('timestamp')
    ])
    print(f" Weather: {weather_df.shape}")

    entsoe_df = pl.read_parquet(entsoe_file)
    # Ensure timestamp is nanosecond precision
    entsoe_df = entsoe_df.with_columns([
        pl.col('timestamp').dt.cast_time_unit('ns').alias('timestamp')
    ])
    print(f" ENTSO-E: {entsoe_df.shape}")

    # Check whether JAO features exist
    if jao_file.exists():
        jao_df = pl.read_parquet(jao_file)
        print(f" JAO: {jao_df.shape}")
    else:
        jao_df = None
        print(" JAO: Not available (missing columns are filled with nulls when extending)")

    # Merge features on timestamp
    print("\nMerging features...")
    unified = weather_df.join(entsoe_df, on='timestamp', how='left', coalesce=True)
    print(f" Weather + ENTSO-E: {unified.shape}")

    if jao_df is not None:
        unified = unified.join(jao_df, on='timestamp', how='left', coalesce=True)
        print(f" + JAO: {unified.shape}")

    print(f"\n[OK] October unified features: {unified.shape}")
    return unified
def extend_dataset(october_features: pl.DataFrame) -> pl.DataFrame:
    """Append October features to the 24-month dataset."""
    print("\n" + "=" * 80)
    print("EXTENDING 24-MONTH DATASET")
    print("=" * 80)

    processed_dir = Path("data/processed")
    base_file = processed_dir / "features_unified_24month.parquet"

    print("\nLoading 24-month dataset...")
    base_df = pl.read_parquet(base_file)
    print(f" Shape: {base_df.shape}")
    print(f" Date range: {base_df['timestamp'].min()} to {base_df['timestamp'].max()}")

    # Match October timestamp precision to the base dataset
    base_timestamp_dtype = base_df['timestamp'].dtype
    october_features = october_features.with_columns([
        pl.col('timestamp').cast(base_timestamp_dtype).alias('timestamp')
    ])
    print(f" Matched timestamp precision: {base_timestamp_dtype}")

    # Get column lists
    base_cols = set(base_df.columns)
    october_cols = set(october_features.columns)

    # Find columns missing in October (JAO features likely missing)
    missing_in_october = base_cols - october_cols
    if missing_in_october:
        print(f"\n Adding {len(missing_in_october)} missing columns to October (fill with nulls)")
        for col in missing_in_october:
            if col != 'timestamp':
                october_features = october_features.with_columns([
                    pl.lit(None).cast(base_df[col].dtype).alias(col)
                ])

    # Ensure ALL column dtypes match exactly (not just the missing ones)
    print("\n Matching column dtypes...")
    dtype_fixes = []
    for col in base_df.columns:
        if col in october_features.columns:
            base_dtype = base_df[col].dtype
            october_dtype = october_features[col].dtype
            if base_dtype != october_dtype:
                dtype_fixes.append(col)
                october_features = october_features.with_columns([
                    pl.col(col).cast(base_dtype).alias(col)
                ])
    if dtype_fixes:
        print(f" Fixed {len(dtype_fixes)} dtype mismatches")

    # Ensure column order matches the base dataset
    october_features = october_features.select(base_df.columns)

    print("\nAppending October features...")
    extended_df = pl.concat([base_df, october_features], how='vertical')
    print(f" Extended shape: {extended_df.shape}")
    print(f" Date range: {extended_df['timestamp'].min()} to {extended_df['timestamp'].max()}")
    print(f" Rows added: {len(extended_df) - len(base_df)}")

    return extended_df
def validate_extended_dataset(extended_df: pl.DataFrame) -> bool:
    """Validate the extended dataset."""
    print("\n" + "=" * 80)
    print("VALIDATING EXTENDED DATASET")
    print("=" * 80)

    expected_rows = 17880  # 24 months + 14 days of hourly data
    expected_cols = 2553   # From metadata

    print("\nShape validation:")
    print(f" Rows: {len(extended_df)} (expected {expected_rows})")
    print(f" Columns: {len(extended_df.columns)} (expected {expected_cols})")

    # Check for duplicate timestamps
    duplicates = extended_df.filter(pl.col('timestamp').is_duplicated())
    print(f"\nDuplicate timestamps: {len(duplicates)}")

    # Gap check skipped (Duration comparison not supported in this Polars version).
    # Instead, verify continuous hourly data by comparing the row count with the
    # number of hours spanned by the timestamp range.
    expected_hours = (extended_df['timestamp'].max() - extended_df['timestamp'].min()).total_seconds() / 3600 + 1
    actual_hours = len(extended_df)
    print(f"Time continuity: {actual_hours} hours (expected ~{int(expected_hours)})")

    # Null counts
    total_nulls = extended_df.null_count().sum_horizontal().to_list()[0]
    print(f"\nTotal null values: {total_nulls}")

    # Date range
    date_start = extended_df['timestamp'].min()
    date_end = extended_df['timestamp'].max()
    print("\nDate range:")
    print(f" Start: {date_start}")
    print(f" End: {date_end}")

    # Validation result
    issues = []
    if len(extended_df) != expected_rows:
        issues.append(f"Row count mismatch: {len(extended_df)} != {expected_rows}")
    if len(duplicates) > 0:
        issues.append(f"Found {len(duplicates)} duplicate timestamps")

    if issues:
        print("\n[WARNING] Validation issues:")
        for issue in issues:
            print(f" - {issue}")
        return False
    else:
        print("\n[OK] All validation checks passed!")
        return True
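# Optional alternative to the row-count continuity check above: an explicit
# hourly-gap scan. A minimal sketch, not called by main(); it assumes a Polars
# version where Datetime Expr.diff() and dt.total_hours() on the resulting
# Duration are available (newer than the version the skip note in
# validate_extended_dataset() refers to).
def find_hourly_gaps(df: pl.DataFrame) -> pl.DataFrame:
    """Return timestamps whose step from the previous row exceeds one hour."""
    return (
        df.sort('timestamp')
        .with_columns(pl.col('timestamp').diff().dt.total_hours().alias('gap_hours'))
        .filter(pl.col('gap_hours') > 1)
        .select(['timestamp', 'gap_hours'])
    )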
def main():
    """Main execution: merge October features and extend the dataset."""
    print("\n" + "=" * 80)
    print("DATASET EXTENSION: October 2025")
    print("Extending 24-month dataset (17,544 -> 17,880 rows)")
    print("=" * 80)

    try:
        # Merge October features
        october_features = merge_october_features()

        # Extend dataset
        extended_df = extend_dataset(october_features)

        # Validate
        validation_passed = validate_extended_dataset(extended_df)

        if validation_passed:
            # Save extended dataset
            output_file = Path("data/processed/features_unified_extended.parquet")
            extended_df.write_parquet(output_file)

            print("\n" + "=" * 80)
            print("SUCCESS: Dataset extension complete!")
            print("=" * 80)
            print("\nExtended dataset saved:")
            print(f" File: {output_file}")
            print(f" Shape: {extended_df.shape}")
            print(f" Size: {output_file.stat().st_size / 1024 / 1024:.1f} MB")
            print("\nNext steps:")
            print(" 1. Upload to HuggingFace Datasets")
            print(" 2. Create inference notebooks")
            print(" 3. Deploy to HF Space")
        else:
            print("\n[ERROR] Validation failed - please review issues")
            sys.exit(1)

    except Exception as e:
        # Sanitize the message so it prints safely on ASCII-only consoles
        error_msg = str(e).encode('ascii', 'replace').decode('ascii')
        print(f"\n[ERROR] Dataset extension failed: {error_msg}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

if __name__ == "__main__":
    main()
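
# Post-run spot check (a hypothetical snippet, not part of this script; the
# expected row count comes from the module docstring):
#
#   import polars as pl
#   df = pl.read_parquet("data/processed/features_unified_extended.parquet")
#   assert df.height == 17_880
#   assert df['timestamp'].is_sorted()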