Spaces:

Alovestocode
/

ZeroGPU-LLM-Inference

Sleeping

Alikestocode committed on Nov 8

Commit

fc0ab14

1 Parent(s): c454e43

Make GPU duration slider functional with dynamic wrapper creation

- Create GPU wrappers dynamically based on user's gpu_duration slider value
- Cache wrappers to avoid recreating them for the same duration
- Round duration to nearest 60 seconds for efficient caching
- Now the slider value actually controls GPU allocation duration

Files changed (1) hide show

app.py +24 -5

app.py CHANGED Viewed

@@ -387,7 +387,18 @@ def _generate_router_plan_streaming_internal(
         yield "", {}, error_msg, ""
-@spaces.GPU(duration=1800)  # Use maximum duration to allow user flexibility
 def generate_router_plan_streaming(
     user_task: str,
     context: str,
@@ -404,13 +415,21 @@ def generate_router_plan_streaming(
     """
     Generate router plan with streaming output.
-    Note: gpu_duration parameter is for user awareness. The actual GPU allocation
-    uses the decorator's duration (1800s max) to allow flexibility.
     """
-    yield from _generate_router_plan_streaming_internal(
         user_task, context, acceptance, extra_guidance,
         difficulty, tags, model_choice, max_new_tokens,
-        temperature, top_p, gpu_duration
     )

         yield "", {}, error_msg, ""
+def _create_gpu_wrapper(duration: int):
+    """Create a GPU-decorated wrapper function with specific duration."""
+    @spaces.GPU(duration=duration)
+    def wrapper(*args, **kwargs):
+        yield from _generate_router_plan_streaming_internal(*args, **kwargs)
+    return wrapper
+# Cache for GPU wrappers to avoid recreating them
+_gpu_wrapper_cache: Dict[int, Any] = {}
 def generate_router_plan_streaming(
     user_task: str,
     context: str,
     """
     Generate router plan with streaming output.
+    Uses user-specified gpu_duration to create a dynamically decorated function.
     """
+    # Round to nearest 60 seconds for caching efficiency
+    rounded_duration = ((gpu_duration + 30) // 60) * 60
+    rounded_duration = max(60, min(1800, rounded_duration))  # Clamp between 60 and 1800
+    # Get or create wrapper with this duration
+    if rounded_duration not in _gpu_wrapper_cache:
+        _gpu_wrapper_cache[rounded_duration] = _create_gpu_wrapper(rounded_duration)
+    wrapper = _gpu_wrapper_cache[rounded_duration]
+    yield from wrapper(
         user_task, context, acceptance, extra_guidance,
         difficulty, tags, model_choice, max_new_tokens,
+        temperature, top_p, rounded_duration
     )