Commit 2d135b5 · 1 parent: dc9b9db
Committed by Evgueni Poloukarov and Claude

fix: implement sub-batching to avoid CUDA OOM on T4 GPU


Problem:
- A batch of 38 borders requires 762 MB of GPU memory
- The T4 GPU has only 534 MB free after model load (14.22 GB used)
- Result: CUDA out-of-memory error

Solution:
- Process borders in sub-batches of 10 (4 sub-batches total; see the sizing check below)
- Clear the GPU cache between sub-batches
- Still far faster than sequential processing (4 batches of 10 vs 38 calls of 1)
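A rough sizing check (illustrative arithmetic only, using the figures above; actual memory use also depends on context length, prediction length, and num_samples):

    # Back-of-the-envelope check that a sub-batch of 10 fits in the free GPU memory.
    # Figures come from the problem description above; this is an estimate, not a measurement.
    full_batch_mb = 762                     # observed for a batch of 38 borders
    free_mb = 534                           # free on the T4 after the model is loaded
    per_border_mb = full_batch_mb / 38      # ~20 MB per border
    sub_batch_mb = 10 * per_border_mb       # ~200 MB for a sub-batch of 10
    assert sub_batch_mb < free_mb           # comfortably within the 534 MB headroom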

Implementation:
- Split the contexts into sub-batches of SUB_BATCH_SIZE=10
- Process each sub-batch independently
- Store all forecasts, then compute the quantiles afterwards
- Expected time: ~8-10 seconds (vs ~60 min sequential)

This balances the GPU memory constraint against the batch-processing speedup; a minimal sketch of the loop follows below.
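For reference, a minimal sketch of the chunking pattern (the helper name and standalone form are illustrative, not the committed code; the predict() call mirrors the one in the diff below, and the real code also carries the per-border names alongside the contexts):

    import torch

    def predict_in_sub_batches(pipeline, contexts, prediction_hours, num_samples, sub_batch_size=10):
        """Run pipeline.predict over `contexts` in chunks to cap peak GPU memory."""
        forecasts = []
        num_sub_batches = (len(contexts) + sub_batch_size - 1) // sub_batch_size  # ceil division
        for batch_idx in range(num_sub_batches):
            chunk = contexts[batch_idx * sub_batch_size:(batch_idx + 1) * sub_batch_size]
            batch_tensor = torch.stack(chunk)  # (chunk_size, context_hours)
            forecasts.append(pipeline.predict(
                inputs=batch_tensor,
                prediction_length=prediction_hours,
                num_samples=num_samples,
            ))
            if torch.cuda.is_available():
                torch.cuda.empty_cache()  # release cached blocks before the next chunk
        return forecasts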

Co-Authored-By: Claude <[email protected]>

Files changed (1):
  src/forecasting/chronos_inference.py (+113, -77)

src/forecasting/chronos_inference.py:
@@ -159,10 +159,13 @@ class ChronosInferencePipeline:
 
         total_start = time.time()
 
-        # BATCH INFERENCE: Collect all contexts first
+        # SUB-BATCH INFERENCE: Process borders in chunks to fit GPU memory
+        # T4 GPU has 14.74 GB total, model uses ~14 GB, so we need small batches
+        SUB_BATCH_SIZE = 10  # Process 10 borders at a time
+
         print(f"\n[BATCH] Preparing contexts for {len(forecast_borders)} borders...")
-        batch_contexts = []
-        border_names = []
+        all_contexts = []
+        all_border_names = []
 
         for i, border in enumerate(forecast_borders, 1):
             print(f" [{i}/{len(forecast_borders)}] Extracting context for {border}...", flush=True)
@@ -178,8 +181,8 @@ class ChronosInferencePipeline:
 
                 # Extract context values and convert to PyTorch tensor
                 context = torch.from_numpy(context_data[target_col].values).float()
-                batch_contexts.append(context)
-                border_names.append(border)
+                all_contexts.append(context)
+                all_border_names.append(border)
 
             except Exception as e:
                 import traceback
@@ -188,83 +191,116 @@ class ChronosInferencePipeline:
                 print(f" [ERROR] {border}: {error_msg}", flush=True)
                 results['borders'][border] = {'error': error_msg, 'traceback': traceback_str}
 
-        # Stack all contexts into a batch
-        if batch_contexts:
-            batch_tensor = torch.stack(batch_contexts)  # Shape: (num_borders, context_hours)
-            print(f"\n[BATCH] Running inference on batch of {batch_tensor.shape[0]} borders...")
-            print(f"[BATCH] Batch shape: {batch_tensor.shape}", flush=True)
+        # Process contexts in sub-batches
+        if all_contexts:
+            num_contexts = len(all_contexts)
+            num_sub_batches = (num_contexts + SUB_BATCH_SIZE - 1) // SUB_BATCH_SIZE
 
-            inference_start = time.time()
+            print(f"\n[BATCH] Running inference in {num_sub_batches} sub-batches of {SUB_BATCH_SIZE} borders...")
 
-            # Run batch inference
-            batch_forecasts = pipeline.predict(
-                inputs=batch_tensor,  # Chronos API uses 'inputs'
-                prediction_length=prediction_hours,
-                num_samples=num_samples
-            )
+            all_forecasts = []
+            total_inference_time = 0
+
+            for batch_idx in range(num_sub_batches):
+                start_idx = batch_idx * SUB_BATCH_SIZE
+                end_idx = min(start_idx + SUB_BATCH_SIZE, num_contexts)
+
+                # Get sub-batch
+                sub_batch_contexts = all_contexts[start_idx:end_idx]
+                sub_batch_names = all_border_names[start_idx:end_idx]
+
+                batch_tensor = torch.stack(sub_batch_contexts)
+                print(f"[BATCH {batch_idx+1}/{num_sub_batches}] Processing {len(sub_batch_names)} borders: {sub_batch_names[0]} ... {sub_batch_names[-1]}", flush=True)
+                print(f"[BATCH {batch_idx+1}/{num_sub_batches}] Batch shape: {batch_tensor.shape}", flush=True)
+
+                inference_start = time.time()
+
+                # Run batch inference
+                batch_forecasts = pipeline.predict(
+                    inputs=batch_tensor,
+                    prediction_length=prediction_hours,
+                    num_samples=num_samples
+                )
+
+                inference_time = time.time() - inference_start
+                total_inference_time += inference_time
+                print(f"[BATCH {batch_idx+1}/{num_sub_batches}] Complete in {inference_time:.1f}s ({inference_time/len(sub_batch_names):.2f}s per border)", flush=True)
+
+                # Store forecasts
+                all_forecasts.append(batch_forecasts)
+
+                # Clear GPU cache between sub-batches
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
 
-            inference_time = time.time() - inference_start
-            print(f"[BATCH] Inference complete in {inference_time:.1f}s ({inference_time/len(border_names):.2f}s per border)")
-            print(f"[BATCH] Forecast shape: {batch_forecasts.shape}", flush=True)
+            print(f"\n[BATCH] All inference complete in {total_inference_time:.1f}s total")
+            print(f"[BATCH] Average: {total_inference_time/num_contexts:.2f}s per border")
 
             # Process each border's forecast
-            for i, border in enumerate(border_names):
-                print(f"\n[{i+1}/{len(border_names)}] Processing forecast for {border}...", flush=True)
-                border_start = time.time()
-
-                try:
-                    # Extract this border's forecast from batch
-                    forecast = batch_forecasts[i]  # Extract from batch dimension
-
-                    # Calculate quantiles
-                    forecast_numpy = forecast.numpy()
-                    print(f"[DEBUG] Raw forecast shape: {forecast_numpy.shape}", flush=True)
-
-                    # Chronos may return (batch, num_samples, time) or (num_samples, time)
-                    # Squeeze any batch dimension (if present)
-                    if forecast_numpy.ndim == 3:
-                        print(f"[DEBUG] 3D forecast detected, squeezing batch dimension", flush=True)
-                        forecast_numpy = forecast_numpy.squeeze(axis=0)  # Remove batch dim
-
-                    print(f"[DEBUG] Forecast shape after squeeze: {forecast_numpy.shape}, Expected: ({num_samples}, {prediction_hours}) or ({prediction_hours}, {num_samples})", flush=True)
-
-                    # Now forecast should be 2D: either (num_samples, time) or (time, num_samples)
-                    # Compute median along samples axis to get (time,) shape
-                    if forecast_numpy.shape[0] == num_samples and forecast_numpy.shape[1] == prediction_hours:
-                        # Shape is (num_samples, time) - use axis=0
-                        print(f"[DEBUG] Using axis=0 for shape (num_samples={num_samples}, time={prediction_hours})", flush=True)
-                        median = np.median(forecast_numpy, axis=0)
-                        q10 = np.quantile(forecast_numpy, 0.1, axis=0)
-                        q90 = np.quantile(forecast_numpy, 0.9, axis=0)
-                    elif forecast_numpy.shape[0] == prediction_hours and forecast_numpy.shape[1] == num_samples:
-                        # Shape is (time, num_samples) - use axis=1
-                        print(f"[DEBUG] Using axis=1 for shape (time={prediction_hours}, num_samples={num_samples})", flush=True)
-                        median = np.median(forecast_numpy, axis=1)
-                        q10 = np.quantile(forecast_numpy, 0.1, axis=1)
-                        q90 = np.quantile(forecast_numpy, 0.9, axis=1)
-                    else:
-                        raise ValueError(f"Unexpected forecast shape: {forecast_numpy.shape}, expected ({num_samples}, {prediction_hours}) or ({prediction_hours}, {num_samples})")
-
-                    print(f"[DEBUG] Final median shape: {median.shape}, Expected: ({prediction_hours},)", flush=True)
-                    assert median.shape == (prediction_hours,), f"Median shape {median.shape} != expected ({prediction_hours},)"
-
-                    # Store results
-                    results['borders'][border] = {
-                        'median': median.tolist(),
-                        'q10': q10.tolist(),
-                        'q90': q90.tolist(),
-                        'inference_time_s': time.time() - border_start
-                    }
-
-                    print(f" [OK] Complete in {time.time() - border_start:.1f}s")
-
-                except Exception as e:
-                    import traceback
-                    error_msg = f"{type(e).__name__}: {str(e)}"
-                    traceback_str = traceback.format_exc()
-                    print(f" [ERROR] {error_msg}", flush=True)
-                    print(f"Traceback:\n{traceback_str}", flush=True)
-                    results['borders'][border] = {'error': error_msg, 'traceback': traceback_str}
+            forecast_idx = 0
+            for batch_idx, batch_forecasts in enumerate(all_forecasts):
+                start_idx = batch_idx * SUB_BATCH_SIZE
+                end_idx = min(start_idx + SUB_BATCH_SIZE, num_contexts)
+                sub_batch_names = all_border_names[start_idx:end_idx]
+
+                for i, border in enumerate(sub_batch_names):
+                    forecast_idx += 1
+                    print(f"\n[{forecast_idx}/{num_contexts}] Processing forecast for {border}...", flush=True)
+                    border_start = time.time()
+
+                    try:
+                        # Extract this border's forecast from batch
+                        forecast = batch_forecasts[i]  # Extract from batch dimension
+
+                        # Calculate quantiles
+                        forecast_numpy = forecast.numpy()
+                        print(f"[DEBUG] Raw forecast shape: {forecast_numpy.shape}", flush=True)
+
+                        # Chronos may return (batch, num_samples, time) or (num_samples, time)
+                        # Squeeze any batch dimension (if present)
+                        if forecast_numpy.ndim == 3:
+                            print(f"[DEBUG] 3D forecast detected, squeezing batch dimension", flush=True)
+                            forecast_numpy = forecast_numpy.squeeze(axis=0)  # Remove batch dim
+
+                        print(f"[DEBUG] Forecast shape after squeeze: {forecast_numpy.shape}, Expected: ({num_samples}, {prediction_hours}) or ({prediction_hours}, {num_samples})", flush=True)
+
+                        # Now forecast should be 2D: either (num_samples, time) or (time, num_samples)
+                        # Compute median along samples axis to get (time,) shape
+                        if forecast_numpy.shape[0] == num_samples and forecast_numpy.shape[1] == prediction_hours:
+                            # Shape is (num_samples, time) - use axis=0
+                            print(f"[DEBUG] Using axis=0 for shape (num_samples={num_samples}, time={prediction_hours})", flush=True)
+                            median = np.median(forecast_numpy, axis=0)
+                            q10 = np.quantile(forecast_numpy, 0.1, axis=0)
+                            q90 = np.quantile(forecast_numpy, 0.9, axis=0)
+                        elif forecast_numpy.shape[0] == prediction_hours and forecast_numpy.shape[1] == num_samples:
+                            # Shape is (time, num_samples) - use axis=1
+                            print(f"[DEBUG] Using axis=1 for shape (time={prediction_hours}, num_samples={num_samples})", flush=True)
+                            median = np.median(forecast_numpy, axis=1)
+                            q10 = np.quantile(forecast_numpy, 0.1, axis=1)
+                            q90 = np.quantile(forecast_numpy, 0.9, axis=1)
+                        else:
+                            raise ValueError(f"Unexpected forecast shape: {forecast_numpy.shape}, expected ({num_samples}, {prediction_hours}) or ({prediction_hours}, {num_samples})")
+
+                        print(f"[DEBUG] Final median shape: {median.shape}, Expected: ({prediction_hours},)", flush=True)
+                        assert median.shape == (prediction_hours,), f"Median shape {median.shape} != expected ({prediction_hours},)"
+
+                        # Store results
+                        results['borders'][border] = {
+                            'median': median.tolist(),
+                            'q10': q10.tolist(),
+                            'q90': q90.tolist(),
+                            'inference_time_s': time.time() - border_start
+                        }
+
+                        print(f" [OK] Complete in {time.time() - border_start:.1f}s")
+
+                    except Exception as e:
+                        import traceback
+                        error_msg = f"{type(e).__name__}: {str(e)}"
+                        traceback_str = traceback.format_exc()
+                        print(f" [ERROR] {error_msg}", flush=True)
+                        print(f"Traceback:\n{traceback_str}", flush=True)
+                        results['borders'][border] = {'error': error_msg, 'traceback': traceback_str}
 
         # Add summary metadata
         results['metadata']['total_time_s'] = time.time() - total_start