Commit f5a609d · Parent: 4f65341
Improve streaming with incremental JSON parsing and plan end token
- Add PLAN_END_TOKEN for explicit plan termination
- Enhance system prompt with detailed JSON structure rules
- Implement incremental JSON parsing during streaming
- Update default max_new_tokens to 16000
- Add API names for endpoints
- Improve validation feedback during streaming
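For context, the JSON structure rules added to the system prompt (first hunk below) are enforced by a validate_router_plan helper that this diff calls but does not include. A hypothetical minimal version, reconstructed only from the rules the prompt itself states; the real helper lives elsewhere in app.py and may differ:

# Hypothetical sketch of validate_router_plan; not the Space's actual code.
REQUIRED_KEYS = (
    "route_plan", "route_rationale", "expected_artifacts", "thinking_outline",
    "handoff_plan", "todo_list", "difficulty", "tags",
    "acceptance_criteria", "metrics",
)

def validate_router_plan(plan: dict) -> tuple[bool, list[str]]:
    """Check a parsed plan against the keys the system prompt demands."""
    issues = [f"missing key: {key}" for key in REQUIRED_KEYS if key not in plan]
    if not isinstance(plan.get("route_plan"), list):
        issues.append("route_plan must be an ordered list of tool invocations")
    metrics = plan.get("metrics")
    if isinstance(metrics, dict):
        for arr in ("primary", "secondary"):
            if arr not in metrics:
                issues.append(f"metrics must include a '{arr}' array")
    return (not issues, issues)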
app.py CHANGED (removed lines whose content is collapsed in the diff view appear as "-…" below):
@@ -14,7 +14,9 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
 if not HF_TOKEN:
     raise RuntimeError("HF_TOKEN environment variable must be set for private router checkpoints.")

-…
+PLAN_END_TOKEN = "<|end_of_plan|>"
+
+ROUTER_SYSTEM_PROMPT = """You are the Router Agent coordinating Math, Code, and General-Search specialists.\nEmit EXACTLY ONE strict JSON object with keys route_plan, route_rationale, expected_artifacts,\nthinking_outline, handoff_plan, todo_list, difficulty, tags, acceptance_criteria, metrics.\nRules:\n- No markdown/code fences, no natural-language prologues or epilogues.\n- route_plan must be an ordered list of tool invocations such as /math(...), /code(...), /general-search(...).\n- todo_list must map each checklist item to the responsible tool.\n- metrics must include primary and secondary arrays (add optional *_guidance fields when they exist).\n- After the closing brace of the JSON object, immediately append the sentinel <|end_of_plan|>.\nExample output:\n{\n "route_plan": ["/general-search(...)"],\n "route_rationale": "...",\n ...\n}<|end_of_plan|>\nReturn nothing else."""

 MODELS = {
     "Router-Qwen3-32B-8bit": {
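Note that the streaming loop in the next hunk only trims the sentinel client-side; decoding still runs until max_new_tokens unless something stops it. A minimal sketch, not part of this commit, of how the same sentinel could also end generation early via transformers stopping criteria (the class name and wiring are hypothetical):

from transformers import StoppingCriteria, StoppingCriteriaList

PLAN_END_TOKEN = "<|end_of_plan|>"  # same sentinel as in app.py

class StopOnPlanEnd(StoppingCriteria):
    """Stop once the decoded continuation contains the plan sentinel."""

    def __init__(self, tokenizer, prompt_len: int):
        self.tokenizer = tokenizer
        self.prompt_len = prompt_len  # prompt tokens to skip when decoding

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        # Decode only what was generated after the prompt for sequence 0.
        tail = self.tokenizer.decode(input_ids[0][self.prompt_len:])
        return PLAN_END_TOKEN in tail

# Hypothetical wiring into the existing generate() call:
# model.generate(
#     **inputs,
#     stopping_criteria=StoppingCriteriaList(
#         [StopOnPlanEnd(tokenizer, inputs["input_ids"].shape[1])]
#     ),
# )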
@@ -237,23 +239,48 @@ def generate_router_plan_streaming(

         # Stream tokens
         completion = ""
+        parsed_plan: Dict[str, Any] | None = None
+        validation_msg = "🔄 Generating..."
+
         for new_text in streamer:
             completion += new_text
-…
+            chunk = completion
+            finished = False
+            if PLAN_END_TOKEN in chunk:
+                chunk = chunk.split(PLAN_END_TOKEN, 1)[0]
+                finished = True
+
+            try:
+                json_block = extract_json_from_text(chunk)
+                candidate_plan = json.loads(json_block)
+                ok, issues = validate_router_plan(candidate_plan)
+                validation_msg = format_validation_message(ok, issues)
+                parsed_plan = candidate_plan  # keep the latest parse even when validation flags issues
+            except Exception:
+                # Ignore parse errors until the JSON is complete
+                pass
+
+            yield chunk, parsed_plan or {}, validation_msg, prompt
+
+            if finished:
+                completion = chunk
+                break
+
         # Final processing after streaming completes
         thread.join()
-…
+
+        completion = completion.strip()
+        if parsed_plan is None:
+            try:
+                json_block = extract_json_from_text(completion)
+                parsed_plan = json.loads(json_block)
+                ok, issues = validate_router_plan(parsed_plan)
+                validation_msg = format_validation_message(ok, issues)
+            except Exception as exc:
+                parsed_plan = {}
+                validation_msg = f"❌ JSON parsing failed: {exc}"
+
+        yield completion, parsed_plan, validation_msg, prompt

     except Exception as exc:
         error_msg = f"❌ Generation failed: {str(exc)}"
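The loop above leans on extract_json_from_text, which this diff references but does not show. A hedged sketch of what such a helper plausibly does, consistent with how the loop uses it (it must raise while the streamed JSON is still incomplete, so the except branch keeps skipping partial output); this is a reconstruction, not the Space's actual code:

# Hypothetical sketch of extract_json_from_text; the real implementation
# lives elsewhere in app.py and is not shown in this diff.
def extract_json_from_text(text: str) -> str:
    """Return the first brace-balanced JSON object found in `text`.

    Raises ValueError if no complete object is present yet.
    """
    start = text.find("{")
    if start == -1:
        raise ValueError("no JSON object found")
    depth = 0
    in_string = False
    escaped = False
    for i, ch in enumerate(text[start:], start):
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[start : i + 1]
    raise ValueError("JSON object is incomplete")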
@@ -316,7 +343,7 @@ def build_ui():
             placeholder="Comma-separated e.g. calculus, optimization, python",
             value="calculus, optimization, python",
         )
-        max_new_tokens = gr.Slider(256, 20000, value=…
+        max_new_tokens = gr.Slider(256, 20000, value=16000, step=32, label="Max New Tokens")
         temperature = gr.Slider(0.0, 1.5, value=0.2, step=0.05, label="Temperature")
         top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
@@ -345,9 +372,14 @@ def build_ui():
             ],
             outputs=[raw_output, plan_json, validation_msg, prompt_view],
             show_progress="full",
+            api_name="/generate_router_plan_streaming",
         )

-        clear_btn.click(…
+        clear_btn.click(
+            fn=clear_outputs,
+            outputs=[raw_output, plan_json, validation_msg, prompt_view],
+            api_name="/clear_outputs",
+        )

     return demo
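Because the commit names both endpoints via api_name, they become addressable from gradio_client. A usage sketch, assuming a placeholder Space id and that the streaming endpoint's positional inputs mirror the UI fields in the hunks above (task text, tags, max_new_tokens, temperature, top_p); none of these positions are confirmed by the diff:

from gradio_client import Client

client = Client("your-user/your-router-space")  # placeholder Space id

# Generator endpoints are consumed via submit(); iterating the Job yields each
# (raw_output, plan_json, validation_msg, prompt_view) tuple as it streams.
job = client.submit(
    "Minimize f(x) = x**2 + 3*x and show the work in Python",  # task (assumed)
    "calculus, optimization, python",                          # tags (assumed)
    16000,                                                     # max_new_tokens
    0.2,                                                       # temperature
    0.9,                                                       # top_p
    api_name="/generate_router_plan_streaming",
)
for raw, plan, validation, prompt in job:
    print(validation)

# The clear endpoint takes no inputs and just resets the four output components.
client.predict(api_name="/clear_outputs")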