Spaces:
Sleeping
Sleeping
"""FBMC Flow Forecasting - Unified JAO Data Exploration

Objective: Explore unified 24-month JAO data and engineered features

This notebook explores:
1. Unified JAO dataset (MaxBEX + CNEC + LTA + NetPos)
2. Engineered features (726 features across 5 categories)
3. Feature completeness and validation
4. Key statistics and distributions

Usage:
    marimo edit notebooks/02_unified_jao_exploration.py
"""
import marimo

# Version of marimo that generated this notebook file.
__generated_with = "0.17.2"
# The notebook app object; cells below are registered on it.
app = marimo.App(width="medium")
def _():
    """Import cell: bring shared libraries into the notebook's dependency graph.

    Returns the modules other cells consume (marimo UI, polars, altair,
    pathlib). `numpy` was imported but never returned, making it unusable
    by any other cell — the dead import is removed.
    """
    import marimo as mo
    import polars as pl
    import altair as alt
    from pathlib import Path
    return Path, alt, mo, pl
def _(mo):
    # Static markdown cell: notebook title and an overview of the
    # three-stage JAO data pipeline (raw data -> unification -> features).
    mo.md(
        r"""
        # Unified JAO Data Exploration (24 Months)
        **Date Range**: October 2023 - October 2025 (24 months)
        ## Data Pipeline Overview:
        1. **Raw JAO Data** (4 datasets)
           - MaxBEX: Maximum Bilateral Exchange capacity (TARGET)
           - CNEC/PTDF: Critical constraints with power transfer factors
           - LTA: Long Term Allocations (future covariates)
           - Net Positions: Domain boundaries (min/max per zone)
        2. **Data Unification** → `unified_jao_24month.parquet`
           - Deduplicated NetPos (removed 1,152 duplicate timestamps)
           - Forward-filled LTA gaps (710 missing hours)
           - Broadcast daily CNEC to hourly
           - Sorted timeline (hourly, 17,544 records)
        3. **Feature Engineering** → `features_jao_24month.parquet`
           - 726 features across 5 categories
           - Tier-1 CNEC: 274 features
           - Tier-2 CNEC: 390 features
           - LTA: 40 features
           - Temporal: 12 features
           - Targets: 10 features
        """
    )
    return
def _(Path, pl):
    """Load the three processed JAO parquet files produced by the pipeline.

    Returns the unified dataset and the engineered-features dataset;
    `cnec_hourly` is only loaded for the size report.
    """
    print("Loading unified JAO datasets...")
    processed_dir = Path('data/processed')

    def _load(stem):
        # All pipeline outputs share the `<stem>_24month.parquet` naming.
        return pl.read_parquet(processed_dir / f'{stem}_24month.parquet')

    unified_jao = _load('unified_jao')
    cnec_hourly = _load('cnec_hourly')
    features_jao = _load('features_jao')
    print(f"[OK] Unified JAO: {unified_jao.shape}")
    print(f"[OK] CNEC hourly: {cnec_hourly.shape}")
    print(f"[OK] Features: {features_jao.shape}")
    return features_jao, unified_jao
def _(features_jao, mo, unified_jao):
    """Render a high-level overview of both datasets.

    Shows shapes, date range, sortedness of the timeline, and overall
    null percentages (computed here so the markdown stays readable).
    """
    unified_null_pct = (
        unified_jao.null_count().sum_horizontal()[0]
        / (len(unified_jao) * len(unified_jao.columns)) * 100
    )
    features_null_pct = (
        features_jao.null_count().sum_horizontal()[0]
        / (len(features_jao) * len(features_jao.columns)) * 100
    )
    mo.md(f"""
    ## Dataset Overview
    ### 1. Unified JAO Dataset
    - **Shape**: {unified_jao.shape[0]:,} rows × {unified_jao.shape[1]} columns
    - **Date Range**: {unified_jao['mtu'].min()} to {unified_jao['mtu'].max()}
    - **Timeline Sorted**: {unified_jao['mtu'].is_sorted()}
    - **Null Percentage**: {unified_null_pct:.2f}%
    ### 2. Engineered Features
    - **Shape**: {features_jao.shape[0]:,} rows × {features_jao.shape[1]} columns
    - **Total Features**: {features_jao.shape[1] - 1} (excluding mtu timestamp)
    - **Null Percentage**: {features_null_pct:.2f}%
    - _Note: High nulls expected due to sparse CNEC binding patterns and lag features_
    """)
    return
def _(mo):
    # Section heading for the unified-dataset structure exploration.
    mo.md("""## 1. Unified JAO Dataset Structure""")
    return
def _(mo, unified_jao):
    # Paginated preview of the first rows of the unified dataset.
    mo.md("""### Sample Data (First 20 Rows)""")
    preview = unified_jao.head(20).to_pandas()
    mo.ui.table(preview, page_size=10)
    return
def _(mo, unified_jao):
    """Break the unified columns down by source category.

    Bug fix: the LTA selector previously matched `c.startswith('border_')`
    — the MaxBEX naming — so the MaxBEX and LTA counts were conflated.
    The MaxBEX filter's own `not c.startswith('lta')` shows LTA columns
    carry the `lta` prefix, so that is used here instead.
    """
    maxbex_cols = [c for c in unified_jao.columns if 'border_' in c and not c.startswith('lta')]
    lta_cols = [c for c in unified_jao.columns if c.startswith('lta')]
    netpos_cols = [c for c in unified_jao.columns if c.startswith('netpos_')]
    mo.md(f"""
    ### Column Breakdown
    - **Timestamp**: 1 column (`mtu`)
    - **MaxBEX Borders**: {len(maxbex_cols)} columns
    - **LTA Borders**: {len(lta_cols)} columns
    - **Net Positions**: {len(netpos_cols)} columns (if present)
    - **Total**: {unified_jao.shape[1]} columns
    """)
    return
def _(mo):
    # Heading for the timeline-validation section below.
    mo.md("""### Timeline Validation""")
    return
def _(alt, pl, unified_jao):
    """Validate the hourly timeline and plot the gap distribution.

    Computes consecutive timestamp differences; the modal difference
    should be exactly one hour for a clean hourly series.

    Improvement: the per-element Python lambda (`map_elements`) is replaced
    with polars' native duration arithmetic (`dt.total_seconds()`), which
    produces the same values without a Python-level loop.
    """
    time_diffs = unified_jao['mtu'].diff().drop_nulls()
    # mode()[0] yields a Python timedelta; one hour == 3600 seconds.
    most_common = time_diffs.mode()[0]
    is_hourly = most_common.total_seconds() == 3600
    time_diff_df = pl.DataFrame({
        'time_diff_hours': time_diffs.dt.total_seconds() / 3600
    })
    timeline_chart = alt.Chart(time_diff_df.to_pandas()).mark_bar().encode(
        x=alt.X('time_diff_hours:Q', bin=alt.Bin(maxbins=50), title='Time Difference (hours)'),
        y=alt.Y('count()', title='Count'),
        tooltip=['time_diff_hours:Q', 'count()']
    ).properties(
        title='Timeline Gaps Distribution',
        width=800,
        height=300
    )
    timeline_chart
    return is_hourly, most_common
def _(is_hourly, mo, most_common):
    # Render the timeline check outcome as a pass/warning banner.
    if is_hourly:
        banner = f"""
        ✅ **Timeline Validation: PASS**
        - Most common time diff: {most_common} (1 hour)
        - Timeline is properly sorted and hourly
        """
    else:
        banner = f"""
        ⚠️ **Timeline Validation: WARNING**
        - Most common time diff: {most_common}
        - Expected: 1 hour
        """
    mo.md(banner)
    return
def _(mo):
    # Section heading for the feature-engineering results.
    mo.md("""## 2. Feature Engineering Results""")
    return
def _(features_jao, mo, pl):
    """Summarise engineered features by category with per-category null rates.

    Improvement: the five copy-pasted null-percentage expressions are
    collapsed into a single `_null_pct` helper that produces the exact
    same formatted strings (including the "N/A" fallback for empty
    categories).
    """
    tier1_cols = [c for c in features_jao.columns if c.startswith('cnec_t1_')]
    tier2_cols = [c for c in features_jao.columns if c.startswith('cnec_t2_')]
    lta_feat_cols = [c for c in features_jao.columns if c.startswith('lta_')]
    temporal_cols = [c for c in features_jao.columns if c in ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']]
    target_cols = [c for c in features_jao.columns if c.startswith('target_')]

    def _null_pct(cols):
        # Formatted null percentage over `cols`, or "N/A" for an empty list.
        if not cols:
            return "N/A"
        nulls = features_jao.select(cols).null_count().sum_horizontal()[0]
        return f"{(nulls / (len(features_jao) * len(cols)) * 100):.2f}%"

    feature_summary = pl.DataFrame({
        'Category': ['Tier-1 CNEC', 'Tier-2 CNEC', 'LTA', 'Temporal', 'Targets', 'TOTAL'],
        'Features': [len(tier1_cols), len(tier2_cols), len(lta_feat_cols), len(temporal_cols), len(target_cols), features_jao.shape[1] - 1],
        'Null %': [
            _null_pct(tier1_cols),
            _null_pct(tier2_cols),
            _null_pct(lta_feat_cols),
            _null_pct(temporal_cols),
            _null_pct(target_cols),
            # TOTAL row: null rate over every column, as before.
            _null_pct(features_jao.columns),
        ]
    })
    mo.ui.table(feature_summary.to_pandas())
    return lta_feat_cols, target_cols, temporal_cols, tier1_cols, tier2_cols
def _(mo):
    # Heading for the sample-features table below.
    mo.md("""### Sample Features (First 20 Rows)""")
    return
def _(features_jao, mo):
    # The full frame is too wide to render; show only the first 10 columns.
    preview_cols = features_jao.columns[:10]
    mo.ui.table(features_jao.select(preview_cols).head(20).to_pandas(), page_size=10)
    return
def _(mo):
    # Section heading for the LTA (future-covariate) features.
    mo.md("""## 3. LTA Features (Future Covariates)""")
    return
def _(lta_feat_cols, mo):
    # Explain why LTA features are future covariates and should be complete.
    n_lta = len(lta_feat_cols)
    mo.md(f"""
    **LTA Features**: {n_lta} features
    LTA (Long Term Allocations) are **future covariates** - known years in advance via auctions.
    These should have **0% nulls** since they're available for the entire forecast horizon.
    """)
    return
def _(alt, features_jao):
    # Time series of total allocated LTA capacity across the 24 months.
    chart_source = features_jao.select(['mtu', 'lta_total_allocated']).sort('mtu')
    lta_chart = (
        alt.Chart(chart_source.to_pandas())
        .mark_line()
        .encode(
            x=alt.X('mtu:T', title='Date'),
            y=alt.Y('lta_total_allocated:Q', title='Total LTA Allocated (MW)'),
            tooltip=['mtu:T', 'lta_total_allocated:Q'],
        )
        .properties(
            title='LTA Total Allocated Capacity Over Time',
            width=800,
            height=400,
        )
        .interactive()
    )
    lta_chart
    return
def _(features_jao, lta_feat_cols, mo):
    # Descriptive statistics for a small sample of LTA features.
    mo.md("""### LTA Sample Statistics (First 5 Features)""")
    sample_stats = features_jao.select(lta_feat_cols[:5]).describe()
    mo.ui.table(sample_stats.to_pandas())
    return
def _(mo):
    # Section heading for the temporal features.
    mo.md("""## 4. Temporal Features""")
    return
def _(features_jao, mo, temporal_cols):
    # Show the temporal columns for one full day (24 hourly rows).
    mo.md(f"""
    **Temporal Features**: {len(temporal_cols)} features
    Cyclic encoding for hour, month, and weekday to capture periodicity.
    """)
    one_day = features_jao.select(['mtu'] + temporal_cols).head(24)
    mo.ui.table(one_day.to_pandas())
    return
def _(alt, features_jao, pl):
    # Row counts per hour of day — flat bars indicate a complete timeline.
    hourly_counts = features_jao.group_by('hour').agg(pl.len().alias('count')).sort('hour')
    hour_chart = (
        alt.Chart(hourly_counts.to_pandas())
        .mark_bar()
        .encode(
            x=alt.X('hour:O', title='Hour of Day'),
            y=alt.Y('count:Q', title='Count'),
            tooltip=['hour:O', 'count:Q'],
        )
        .properties(
            title='Distribution by Hour of Day',
            width=800,
            height=300,
        )
    )
    hour_chart
    return
def _(mo):
    # Section heading for the historical CNEC features.
    mo.md("""## 5. CNEC Features (Historical)""")
    return
def _(features_jao, mo, tier1_cols, tier2_cols):
    # CNEC feature counts plus a peek at the first Tier-1 columns.
    mo.md(f"""
    **CNEC Features**: {len(tier1_cols) + len(tier2_cols)} total
    - **Tier-1 CNECs**: {len(tier1_cols)} features (top 58 most critical CNECs)
    - **Tier-2 CNECs**: {len(tier2_cols)} features (next 150 CNECs)
    High null percentage is **expected** due to:
    1. Sparse binding patterns (not all CNECs bind every hour)
    2. Lag features create nulls at timeline start
    3. Pivoting creates sparse constraint matrices
    """)
    tier1_sample = features_jao.select(['mtu'] + tier1_cols[:5]).head(20)
    mo.ui.table(tier1_sample.to_pandas(), page_size=10)
    return
def _(alt, features_jao, pl, tier1_cols):
    # Mean of each sampled Tier-1 binding column over time
    # (presumably a 0/1 indicator, so the mean reads as a binding
    # frequency — confirm against the feature-engineering step).
    binding_cols = [c for c in tier1_cols if 'binding_' in c][:10]
    if not binding_cols:
        return
    binding_freq = pl.DataFrame({
        'cnec': [c.replace('cnec_t1_binding_', '') for c in binding_cols],
        'binding_rate': [features_jao[c].mean() for c in binding_cols],
    })
    binding_chart = (
        alt.Chart(binding_freq.to_pandas())
        .mark_bar()
        .encode(
            x=alt.X('binding_rate:Q', title='Binding Frequency (0-1)'),
            y=alt.Y('cnec:N', sort='-x', title='CNEC'),
            tooltip=['cnec:N', alt.Tooltip('binding_rate:Q', format='.2%')],
        )
        .properties(
            title='Binding Frequency - Sample Tier-1 CNECs',
            width=800,
            height=300,
        )
    )
    binding_chart
    return
def _(mo):
    # Section heading for the target variables.
    mo.md("""## 6. Target Variables""")
    return
def _(features_jao, mo, target_cols):
    # Preview the MaxBEX target columns used for forecasting.
    mo.md(f"""
    **Target Variables**: {len(target_cols)} features
    Sample MaxBEX borders for forecasting (first 10 borders):
    """)
    if target_cols:
        targets_head = features_jao.select(['mtu'] + target_cols).head(20)
        mo.ui.table(targets_head.to_pandas(), page_size=10)
    return
def _(alt, features_jao, target_cols):
    # Line chart of the first target border's capacity over time.
    if not target_cols:
        return
    sample_target = target_cols[0]
    chart_source = features_jao.select(['mtu', sample_target]).sort('mtu')
    target_chart = (
        alt.Chart(chart_source.to_pandas())
        .mark_line()
        .encode(
            x=alt.X('mtu:T', title='Date'),
            y=alt.Y(f'{sample_target}:Q', title='Capacity (MW)'),
            tooltip=['mtu:T', f'{sample_target}:Q'],
        )
        .properties(
            title=f'Target Variable Over Time: {sample_target}',
            width=800,
            height=400,
        )
        .interactive()
    )
    target_chart
    return
def _(mo):
    # Section heading for the final data-quality summary.
    mo.md(
        """
        ## 7. Data Quality Summary
        Final validation checks:
        """
    )
    return
def _(features_jao, is_hourly, lta_feat_cols, mo, pl, unified_jao):
    """Run the final data-quality checks and render them as a table.

    Four checks: hourly timeline, unified-data completeness, LTA
    (future-covariate) completeness, and row-count consistency between
    the unified and features datasets.
    """
    most_common_diff = unified_jao["mtu"].diff().drop_nulls().mode()[0]
    unified_nulls = unified_jao.null_count().sum_horizontal()[0]
    unified_null_pct = unified_nulls / (len(unified_jao) * len(unified_jao.columns)) * 100
    lta_nulls = features_jao.select(lta_feat_cols).null_count().sum_horizontal()[0] if lta_feat_cols else 0
    rows_match = len(unified_jao) == len(features_jao)
    checks = [
        {
            'Check': 'Timeline sorted & hourly',
            'Status': 'PASS' if is_hourly else 'FAIL',
            'Details': f'Most common diff: {most_common_diff}',
        },
        {
            'Check': 'Unified data completeness',
            # Nulls here are a warning, not a hard failure.
            'Status': 'PASS' if unified_nulls == 0 else 'WARNING',
            'Details': f'{unified_nulls} nulls ({unified_null_pct:.2f}%)',
        },
        {
            'Check': 'LTA future covariates complete',
            'Status': 'PASS' if lta_nulls == 0 else 'FAIL',
            'Details': f'{lta_nulls} nulls in {len(lta_feat_cols)} LTA features',
        },
        {
            'Check': 'Data consistency',
            'Status': 'PASS' if rows_match else 'FAIL',
            'Details': f'Unified: {len(unified_jao):,} rows, Features: {len(features_jao):,} rows',
        },
    ]
    checks_df = pl.DataFrame(checks)
    mo.ui.table(checks_df.to_pandas())
    return (checks,)
def _(checks, mo):
    # Roll all individual checks up into one overall status banner.
    if all(c['Status'] == 'PASS' for c in checks):
        mo.md("""
        ✅ **All validation checks PASSED**
        Data is ready for model training and inference!
        """)
        return
    failed = [c['Check'] for c in checks if c['Status'] == 'FAIL']
    warnings = [c['Check'] for c in checks if c['Status'] == 'WARNING']
    status = "⚠️ **Some checks failed or have warnings**\n\n"
    if failed:
        status += f"**Failed**: {', '.join(failed)}\n\n"
    if warnings:
        status += f"**Warnings**: {', '.join(warnings)}"
    mo.md(status)
    return
def _(mo):
    # Static markdown: project status, remaining work, and output files.
    mo.md(
        """
        ## Next Steps
        ✅ **JAO Data Collection & Unification: COMPLETE**
        - 24 months of data (Oct 2023 - Oct 2025)
        - 17,544 hourly records
        - 726 features engineered
        **Remaining Work:**
        1. Collect weather data (OpenMeteo, 52 grid points)
        2. Collect ENTSO-E data (generation, flows, outages)
        3. Complete remaining feature scaffolding (NetPos lags, MaxBEX lags, system aggregates)
        4. Integrate all data sources
        5. Begin zero-shot Chronos 2 inference
        ---
        **Data Files**:
        - `data/processed/unified_jao_24month.parquet` (5.59 MB)
        - `data/processed/cnec_hourly_24month.parquet` (4.57 MB)
        - `data/processed/features_jao_24month.parquet` (0.60 MB)
        """
    )
    return
def _(mo, unified_jao):
    # Render the entire unified dataset as a paginated table.
    mo.md("## Unified JAO Dataset")
    full_frame = unified_jao.to_pandas()
    mo.ui.table(full_frame, page_size=20)
    return
def _(features_jao, mo, unified_jao):
    # Show the unified dataset's structure (timestamp + first data
    # columns), key metadata, then a peek at the features frame.
    mo.md("### Unified JAO Dataset Structure")
    non_mtu_cols = [c for c in unified_jao.columns if c != 'mtu']
    display_df = unified_jao.select(['mtu'] + non_mtu_cols[:10]).head(10)
    mo.ui.table(display_df.to_pandas())
    mo.md(f"""
    **Dataset Info:**
    - **Total columns**: {len(unified_jao.columns)}
    - **Timestamp column**: `mtu` (Market Time Unit)
    - **Date range**: {unified_jao['mtu'].min()} to {unified_jao['mtu'].max()}
    """)
    mo.md("### Features Dataset (726 engineered features)")
    features_preview = features_jao.select(['mtu'] + features_jao.columns[1:11]).head(10)
    mo.ui.table(features_preview.to_pandas())
    return
def _(features_jao, mo, pl, unified_jao):
    """Report column counts for both datasets and break features down by category.

    Bug fixes:
    - The name-listing loop iterated `columns[:]` (all columns) while the
      heading promises only the first 20 — now sliced to `[:20]`.
    - The breakdown's TOTAL row counted every column including the `mtu`
      timestamp (727), inconsistent with the earlier feature-summary cell
      which uses `shape[1] - 1`; now excludes the timestamp.
    """
    mo.md(f"""
    ### Dataset Column Counts
    **unified_jao**: {len(unified_jao.columns)} columns
    - Raw unified data (MaxBEX, LTA, NetPos)
    **features_jao**: {len(features_jao.columns)} columns
    - Engineered features (726 + timestamp)
    """)
    tier1_cols = [c for c in features_jao.columns if c.startswith('cnec_t1_')]
    tier2_cols = [c for c in features_jao.columns if c.startswith('cnec_t2_')]
    lta_feat_cols = [c for c in features_jao.columns if c.startswith('lta_')]
    temporal_cols = [c for c in features_jao.columns if c in ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']]
    target_cols = [c for c in features_jao.columns if c.startswith('target_')]
    feature_breakdown = pl.DataFrame({
        'Category': ['Tier-1 CNEC', 'Tier-2 CNEC', 'LTA', 'Temporal', 'Targets', 'TOTAL'],
        # TOTAL excludes the `mtu` timestamp, matching the feature-summary cell.
        'Count': [len(tier1_cols), len(tier2_cols), len(lta_feat_cols), len(temporal_cols), len(target_cols), len(features_jao.columns) - 1]
    })
    mo.md("### Feature Breakdown in features_jao dataset:")
    mo.ui.table(feature_breakdown.to_pandas())
    mo.md("### First 20 column names in features_jao:")
    for i, col in enumerate(features_jao.columns[:20]):
        print(f"{i+1:3d}. {col}")
    return lta_feat_cols, target_cols, temporal_cols, tier1_cols, tier2_cols
def _(features_jao, mo, pl):
    """Inspect CNEC Tier-1 binding indicator columns.

    Underscore-prefixed locals avoid re-binding the tier1/tier2 names
    defined in other cells.

    Bug fix: the filter previously selected `target_border*` columns
    (MaxBEX targets), while the variable name and both headings describe
    Tier-1 binding indicators. Those columns use the 'cnec_t1_binding_'
    prefix (see the binding-frequency cell), so that prefix is used here.
    """
    _cnec_t1_binding_cols = [c for c in features_jao.columns if c.startswith('cnec_t1_binding_')]
    if _cnec_t1_binding_cols:
        # Sample of binding values for the first five Tier-1 CNECs.
        _sample_bindings = features_jao.select(['mtu'] + _cnec_t1_binding_cols[:5]).head(20)
        mo.md("### Sample CNEC Tier-1 Binding Values (First 5 CNECs)")
        mo.ui.table(_sample_bindings.to_pandas(), page_size=10)
        # Distinct values of the first binding column (expected to be a
        # small set if it is an indicator — confirm in output).
        _first_col = _cnec_t1_binding_cols[0]
        _unique_vals = features_jao[_first_col].unique().sort()
        mo.md(f"### Unique Values in {_first_col}")
        print(f"Unique values: {_unique_vals.to_list()}")
        # Frequency of each value in the first binding column.
        _val_counts = features_jao.group_by(_first_col).agg(pl.len().alias('count')).sort('count', descending=True)
        mo.ui.table(_val_counts.to_pandas())
    return
if __name__ == "__main__":
    # Launch the marimo app when this notebook file is run as a script.
    app.run()