Commit f5a609d · Parent: 4f65341
Improve streaming with incremental JSON parsing and plan end token
- Add PLAN_END_TOKEN for explicit plan termination
- Enhance system prompt with detailed JSON structure rules
- Implement incremental JSON parsing during streaming
- Update default max_new_tokens to 16000
- Add API names for endpoints
- Improve validation feedback during streaming
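For context, the JSON structure rules added to the system prompt (first hunk below) are enforced by a validate_router_plan helper that this diff calls but does not include. A hypothetical minimal version, reconstructed only from the rules the prompt itself states; the real helper lives elsewhere in app.py and may differ:

# Hypothetical sketch of validate_router_plan; not the Space's actual code.
REQUIRED_KEYS = (
    "route_plan", "route_rationale", "expected_artifacts", "thinking_outline",
    "handoff_plan", "todo_list", "difficulty", "tags",
    "acceptance_criteria", "metrics",
)

def validate_router_plan(plan: dict) -> tuple[bool, list[str]]:
    """Check a parsed plan against the keys the system prompt demands."""
    issues = [f"missing key: {key}" for key in REQUIRED_KEYS if key not in plan]
    if not isinstance(plan.get("route_plan"), list):
        issues.append("route_plan must be an ordered list of tool invocations")
    metrics = plan.get("metrics")
    if isinstance(metrics, dict):
        for arr in ("primary", "secondary"):
            if arr not in metrics:
                issues.append(f"metrics must include a '{arr}' array")
    return (not issues, issues)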
app.py CHANGED (removed lines whose content is collapsed in the diff view appear as "-…" below):
@@ -14,7 +14,9 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
 if not HF_TOKEN:
     raise RuntimeError("HF_TOKEN environment variable must be set for private router checkpoints.")

-…
+PLAN_END_TOKEN = "<|end_of_plan|>"
+
+ROUTER_SYSTEM_PROMPT = """You are the Router Agent coordinating Math, Code, and General-Search specialists.\nEmit EXACTLY ONE strict JSON object with keys route_plan, route_rationale, expected_artifacts,\nthinking_outline, handoff_plan, todo_list, difficulty, tags, acceptance_criteria, metrics.\nRules:\n- No markdown/code fences, no natural-language prologues or epilogues.\n- route_plan must be an ordered list of tool invocations such as /math(...), /code(...), /general-search(...).\n- todo_list must map each checklist item to the responsible tool.\n- metrics must include primary and secondary arrays (add optional *_guidance fields when they exist).\n- After the closing brace of the JSON object, immediately append the sentinel <|end_of_plan|>.\nExample output:\n{\n "route_plan": ["/general-search(...)"],\n "route_rationale": "...",\n ...\n}<|end_of_plan|>\nReturn nothing else."""

 MODELS = {
     "Router-Qwen3-32B-8bit": {
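Note that the streaming loop in the next hunk only trims the sentinel client-side; decoding still runs until max_new_tokens unless something stops it. A minimal sketch, not part of this commit, of how the same sentinel could also end generation early via transformers stopping criteria (the class name and wiring are hypothetical):

from transformers import StoppingCriteria, StoppingCriteriaList

PLAN_END_TOKEN = "<|end_of_plan|>"  # same sentinel as in app.py

class StopOnPlanEnd(StoppingCriteria):
    """Stop once the decoded continuation contains the plan sentinel."""

    def __init__(self, tokenizer, prompt_len: int):
        self.tokenizer = tokenizer
        self.prompt_len = prompt_len  # prompt tokens to skip when decoding

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        # Decode only what was generated after the prompt for sequence 0.
        tail = self.tokenizer.decode(input_ids[0][self.prompt_len:])
        return PLAN_END_TOKEN in tail

# Hypothetical wiring into the existing generate() call:
# model.generate(
#     **inputs,
#     stopping_criteria=StoppingCriteriaList(
#         [StopOnPlanEnd(tokenizer, inputs["input_ids"].shape[1])]
#     ),
# )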
@@ -237,23 +239,48 @@ def generate_router_plan_streaming(

         # Stream tokens
         completion = ""
+        parsed_plan: Dict[str, Any] | None = None
+        validation_msg = "🔄 Generating..."
+
         for new_text in streamer:
             completion += new_text
-…
+            chunk = completion
+            finished = False
+            if PLAN_END_TOKEN in chunk:
+                chunk = chunk.split(PLAN_END_TOKEN, 1)[0]
+                finished = True
+
+            try:
+                json_block = extract_json_from_text(chunk)
+                candidate_plan = json.loads(json_block)
+                ok, issues = validate_router_plan(candidate_plan)
+                validation_msg = format_validation_message(ok, issues)
+                parsed_plan = candidate_plan  # keep the latest parse even when validation flags issues
+            except Exception:
+                # Ignore parse errors until the JSON is complete
+                pass
+
+            yield chunk, parsed_plan or {}, validation_msg, prompt
+
+            if finished:
+                completion = chunk
+                break
+
         # Final processing after streaming completes
         thread.join()
-…
+
+        completion = completion.strip()
+        if parsed_plan is None:
+            try:
+                json_block = extract_json_from_text(completion)
+                parsed_plan = json.loads(json_block)
+                ok, issues = validate_router_plan(parsed_plan)
+                validation_msg = format_validation_message(ok, issues)
+            except Exception as exc:
+                parsed_plan = {}
+                validation_msg = f"❌ JSON parsing failed: {exc}"
+
+        yield completion, parsed_plan, validation_msg, prompt

     except Exception as exc:
         error_msg = f"❌ Generation failed: {str(exc)}"
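The loop above leans on extract_json_from_text, which this diff references but does not show. A hedged sketch of what such a helper plausibly does, consistent with how the loop uses it (it must raise while the streamed JSON is still incomplete, so the except branch keeps skipping partial output); this is a reconstruction, not the Space's actual code:

# Hypothetical sketch of extract_json_from_text; the real implementation
# lives elsewhere in app.py and is not shown in this diff.
def extract_json_from_text(text: str) -> str:
    """Return the first brace-balanced JSON object found in `text`.

    Raises ValueError if no complete object is present yet.
    """
    start = text.find("{")
    if start == -1:
        raise ValueError("no JSON object found")
    depth = 0
    in_string = False
    escaped = False
    for i, ch in enumerate(text[start:], start):
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[start : i + 1]
    raise ValueError("JSON object is incomplete")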
@@ -316,7 +343,7 @@ def build_ui():
             placeholder="Comma-separated e.g. calculus, optimization, python",
             value="calculus, optimization, python",
         )
-        max_new_tokens = gr.Slider(256, 20000, value=…
+        max_new_tokens = gr.Slider(256, 20000, value=16000, step=32, label="Max New Tokens")
         temperature = gr.Slider(0.0, 1.5, value=0.2, step=0.05, label="Temperature")
         top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
@@ -345,9 +372,14 @@ def build_ui():
             ],
             outputs=[raw_output, plan_json, validation_msg, prompt_view],
             show_progress="full",
+            api_name="/generate_router_plan_streaming",
         )

-        clear_btn.click(…
+        clear_btn.click(
+            fn=clear_outputs,
+            outputs=[raw_output, plan_json, validation_msg, prompt_view],
+            api_name="/clear_outputs",
+        )

     return demo
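Because the commit names both endpoints via api_name, they become addressable from gradio_client. A usage sketch, assuming a placeholder Space id and that the streaming endpoint's positional inputs mirror the UI fields in the hunks above (task text, tags, max_new_tokens, temperature, top_p); none of these positions are confirmed by the diff:

from gradio_client import Client

client = Client("your-user/your-router-space")  # placeholder Space id

# Generator endpoints are consumed via submit(); iterating the Job yields each
# (raw_output, plan_json, validation_msg, prompt_view) tuple as it streams.
job = client.submit(
    "Minimize f(x) = x**2 + 3*x and show the work in Python",  # task (assumed)
    "calculus, optimization, python",                          # tags (assumed)
    16000,                                                     # max_new_tokens
    0.2,                                                       # temperature
    0.9,                                                       # top_p
    api_name="/generate_router_plan_streaming",
)
for raw, plan, validation, prompt in job:
    print(validation)

# The clear endpoint takes no inputs and just resets the four output components.
client.predict(api_name="/clear_outputs")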