Alikestocode committed
Commit 1b16b00 · 1 Parent(s): 9592189

Enable API in Gradio launch configuration

Files changed (1): app.py (+141, -13)
app.py CHANGED
@@ -2,12 +2,13 @@ from __future__ import annotations
 
 import json
 import os
+import re
 from typing import Any, Dict, List, Tuple
 
 import gradio as gr
 import spaces
 import torch
-from transformers import AutoTokenizer, pipeline, BitsAndBytesConfig, TextIteratorStreamer
+from transformers import AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer, pipeline
 from threading import Thread
 
 HF_TOKEN = os.environ.get("HF_TOKEN")
@@ -15,6 +16,7 @@ if not HF_TOKEN:
     raise RuntimeError("HF_TOKEN environment variable must be set for private router checkpoints.")
 
 PLAN_END_TOKEN = "<|end_of_plan|>"
+STOP_SEQUENCES = [PLAN_END_TOKEN, "</json>", "</JSON>"]
 
 ROUTER_SYSTEM_PROMPT = """You are the Router Agent coordinating Math, Code, and General-Search specialists.\nEmit EXACTLY ONE strict JSON object with keys route_plan, route_rationale, expected_artifacts,\nthinking_outline, handoff_plan, todo_list, difficulty, tags, acceptance_criteria, metrics.\nRules:\n- No markdown/code fences, no natural-language prologues or epilogues.\n- route_plan must be an ordered list of tool invocations such as /math(...), /code(...), /general-search(...).\n- todo_list must map each checklist item to the responsible tool.\n- metrics must include primary and secondary arrays (add optional *_guidance fields when they exist).\n- After the closing brace of the JSON object, immediately append the sentinel <|end_of_plan|>.\nExample output:\n{\n "route_plan": ["/general-search(...)"],\n "route_rationale": "...",\n ...\n}<|end_of_plan|>\nReturn nothing else."""
 
@@ -45,6 +47,22 @@ REQUIRED_KEYS = [
 ]
 
 PIPELINES: Dict[str, Any] = {}
+TOKENIZER_CACHE: Dict[str, Any] = {}
+WARMED_REMAINING = False
+TOOL_PATTERN = re.compile(r"^/[a-z0-9_-]+\(.*\)$", re.IGNORECASE)
+
+
+def get_tokenizer(repo: str):
+    tok = TOKENIZER_CACHE.get(repo)
+    if tok is not None:
+        return tok
+    tok = AutoTokenizer.from_pretrained(repo, token=HF_TOKEN)
+    tok.padding_side = "left"
+    tok.truncation_side = "left"
+    if tok.pad_token_id is None and tok.eos_token_id is not None:
+        tok.pad_token_id = tok.eos_token_id
+    TOKENIZER_CACHE[repo] = tok
+    return tok
 
 
 def load_pipeline(model_name: str):
@@ -52,21 +70,23 @@ def load_pipeline(model_name: str):
         return PIPELINES[model_name]
 
     repo = MODELS[model_name]["repo_id"]
-    tokenizer = AutoTokenizer.from_pretrained(repo, token=HF_TOKEN)
+    tokenizer = get_tokenizer(repo)
 
     try:
-        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+        quant_config = BitsAndBytesConfig(load_in_8bit=True)
         pipe = pipeline(
             task="text-generation",
            model=repo,
            tokenizer=tokenizer,
            trust_remote_code=True,
            device_map="auto",
-           model_kwargs={"quantization_config": quantization_config},
+           model_kwargs={"quantization_config": quant_config},
            use_cache=True,
            token=HF_TOKEN,
        )
+        pipe.model.eval()
         PIPELINES[model_name] = pipe
+        _schedule_background_warm(model_name)
         return pipe
     except Exception as exc:
         print(f"8-bit load failed for {repo}: {exc}. Falling back to higher precision.")
@@ -83,7 +103,9 @@ def load_pipeline(model_name: str):
                 use_cache=True,
                 token=HF_TOKEN,
             )
+            pipe.model.eval()
             PIPELINES[model_name] = pipe
+            _schedule_background_warm(model_name)
             return pipe
         except Exception:
             continue
@@ -97,10 +119,37 @@ def load_pipeline(model_name: str):
         use_cache=True,
         token=HF_TOKEN,
     )
+    pipe.model.eval()
     PIPELINES[model_name] = pipe
+    _schedule_background_warm(model_name)
    return pipe
 
 
+def _schedule_background_warm(loaded_model: str) -> None:
+    global WARMED_REMAINING
+    if WARMED_REMAINING:
+        return
+    warm_remaining = os.environ.get("ROUTER_WARM_REMAINING", "1")
+    if warm_remaining not in {"1", "true", "True"}:
+        return
+
+    remaining = [name for name in MODELS if name not in PIPELINES]
+    if not remaining:
+        WARMED_REMAINING = True
+        return
+
+    def _warm_all():
+        for name in remaining:
+            try:
+                print(f"Background warm start for {name}")
+                load_pipeline(name)
+            except Exception as exc:  # pragma: no cover
+                print(f"Warm start failed for {name}: {exc}")
+        WARMED_REMAINING = True
+
+    Thread(target=_warm_all, daemon=True).start()
+
+
 def build_router_prompt(
     user_task: str,
     context: str,
@@ -152,20 +201,52 @@ def extract_json_from_text(text: str) -> str:
     raise ValueError("Router output JSON appears truncated.")
 
 
+def is_function_call(text: str) -> bool:
+    return bool(TOOL_PATTERN.match(text.strip()))
+
+
 def validate_router_plan(plan: Dict[str, Any]) -> Tuple[bool, List[str]]:
     issues: List[str] = []
     for key in REQUIRED_KEYS:
         if key not in plan:
             issues.append(f"Missing key: {key}")
+
     route_plan = plan.get("route_plan")
+    if isinstance(route_plan, str) and is_function_call(route_plan):
+        plan["route_plan"] = [route_plan]
+        route_plan = plan["route_plan"]
     if not isinstance(route_plan, list) or not route_plan:
         issues.append("route_plan must be a non-empty list of tool calls")
+    else:
+        cleaned: List[str] = []
+        for entry in route_plan:
+            if isinstance(entry, str) and is_function_call(entry.strip().strip("'\"")):
+                cleaned.append(entry.strip().strip("'\""))
+            else:
+                issues.append(f"route_plan entry is not a tool call: {entry}")
+        if cleaned:
+            plan["route_plan"] = cleaned
+
     metrics = plan.get("metrics")
     if not isinstance(metrics, dict):
         issues.append("metrics must be an object containing primary/secondary entries")
     todo = plan.get("todo_list")
     if not isinstance(todo, list) or not todo:
         issues.append("todo_list must contain at least one checklist item")
+    else:
+        cleaned_todo: List[str] = []
+        for entry in todo:
+            if isinstance(entry, str):
+                text = entry.strip()
+                if not text.startswith("- ["):
+                    text = text.lstrip("- ")
+                    text = f"- [ ] {text}"
+                cleaned_todo.append(text)
+            else:
+                issues.append("todo_list entry must be a string")
+        if cleaned_todo:
+            plan["todo_list"] = cleaned_todo
+
     return len(issues) == 0, issues
 
 
@@ -232,9 +313,15 @@ def generate_router_plan_streaming(
         "top_p": top_p,
         "do_sample": True,
         "streamer": streamer,
+        "eos_token_id": tokenizer.eos_token_id,
+        "pad_token_id": tokenizer.pad_token_id or tokenizer.eos_token_id,
     }
-
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+
+    def _generate():
+        with torch.inference_mode():
+            model.generate(**generation_kwargs)
+
+    thread = Thread(target=_generate)
     thread.start()
 
     # Stream tokens
@@ -246,21 +333,22 @@ def generate_router_plan_streaming(
         completion += new_text
         chunk = completion
        finished = False
-        if PLAN_END_TOKEN in chunk:
-            chunk = chunk.split(PLAN_END_TOKEN, 1)[0]
-            finished = True
+        display_plan = parsed_plan or {}
+
+        chunk, finished = trim_at_stop_sequences(chunk)
 
        try:
            json_block = extract_json_from_text(chunk)
            candidate_plan = json.loads(json_block)
            ok, issues = validate_router_plan(candidate_plan)
            validation_msg = format_validation_message(ok, issues)
-            parsed_plan = candidate_plan if ok else candidate_plan
+            parsed_plan = candidate_plan if ok else parsed_plan
+            display_plan = candidate_plan
        except Exception:
            # Ignore until JSON is complete
            pass
 
-        yield chunk, parsed_plan or {}, validation_msg, prompt
+        yield chunk, display_plan, validation_msg, prompt
 
        if finished:
            completion = chunk
@@ -269,7 +357,7 @@ def generate_router_plan_streaming(
     # Final processing after streaming completes
     thread.join()
 
-    completion = completion.strip()
+    completion = trim_at_stop_sequences(completion.strip())[0]
     if parsed_plan is None:
         try:
             json_block = extract_json_from_text(completion)
@@ -384,7 +472,46 @@ def build_ui():
     return demo
 
 
+def trim_at_stop_sequences(text: str) -> Tuple[str, bool]:
+    earliest = None
+    for stop in STOP_SEQUENCES:
+        idx = text.find(stop)
+        if idx != -1 and (earliest is None or idx < earliest):
+            earliest = idx
+    if earliest is not None:
+        return text[:earliest], True
+    return text, False
+
+
+def _prefetch_from_env() -> None:
+    entries = os.environ.get("ROUTER_PREFETCH_MODELS")
+    if entries:
+        names = [item.strip() for item in entries.split(",") if item.strip()]
+    else:
+        single = os.environ.get("ROUTER_PREFETCH_MODEL")
+        names = [single] if single else []
+
+    if names == ["ALL"] or names == ["all"]:
+        names = list(MODELS.keys())
+
+    for name in names:
+        if name not in MODELS:
+            print(f"Prefetch skipped, unknown model: {name}")
+            continue
+        try:
+            load_pipeline(name)
+            print(f"Prefetched router model: {name}")
+        except Exception as exc:  # pragma: no cover
+            print(f"Prefetch failed for {name}: {exc}")
+
+
+_prefetch_from_env()
+
 demo = build_ui()
 
 if __name__ == "__main__":  # pragma: no cover
-    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=int(os.environ.get("PORT", 7860)),
+        show_api=True,
+    )
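
A quick sanity check of the new validation path. The plan values below are invented for illustration; the snippet assumes REQUIRED_KEYS mirrors the ten keys named in ROUTER_SYSTEM_PROMPT, and that HF_TOKEN is set, since app.py raises at import time without it.

    from app import validate_router_plan  # requires HF_TOKEN in the environment

    sample_plan = {
        "route_plan": ["/general-search(query='gradio streaming api')"],
        "route_rationale": "A single search covers this documentation lookup.",
        "expected_artifacts": ["summary of relevant docs"],
        "thinking_outline": ["identify docs", "extract launch flags"],
        "handoff_plan": "Return findings to the user.",
        "todo_list": ["- [ ] run the search (/general-search)"],
        "difficulty": "easy",
        "tags": ["gradio", "api"],
        "acceptance_criteria": ["launch flags named"],
        "metrics": {"primary": ["answer_correctness"], "secondary": ["latency"]},
    }

    ok, issues = validate_router_plan(sample_plan)
    assert ok and not issues  # a bare string route_plan would also be coerced to a one-item list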
 
67
 
68
  def load_pipeline(model_name: str):
 
70
  return PIPELINES[model_name]
71
 
72
  repo = MODELS[model_name]["repo_id"]
73
+ tokenizer = get_tokenizer(repo)
74
 
75
  try:
76
+ quant_config = BitsAndBytesConfig(load_in_8bit=True)
77
  pipe = pipeline(
78
  task="text-generation",
79
  model=repo,
80
  tokenizer=tokenizer,
81
  trust_remote_code=True,
82
  device_map="auto",
83
+ model_kwargs={"quantization_config": quant_config},
84
  use_cache=True,
85
  token=HF_TOKEN,
86
  )
87
+ pipe.model.eval()
88
  PIPELINES[model_name] = pipe
89
+ _schedule_background_warm(model_name)
90
  return pipe
91
  except Exception as exc:
92
  print(f"8-bit load failed for {repo}: {exc}. Falling back to higher precision.")
 
103
  use_cache=True,
104
  token=HF_TOKEN,
105
  )
106
+ pipe.model.eval()
107
  PIPELINES[model_name] = pipe
108
+ _schedule_background_warm(model_name)
109
  return pipe
110
  except Exception:
111
  continue
 
119
  use_cache=True,
120
  token=HF_TOKEN,
121
  )
122
+ pipe.model.eval()
123
  PIPELINES[model_name] = pipe
124
+ _schedule_background_warm(model_name)
125
  return pipe
126
 
127
 
128
+ def _schedule_background_warm(loaded_model: str) -> None:
129
+ global WARMED_REMAINING
130
+ if WARMED_REMAINING:
131
+ return
132
+ warm_remaining = os.environ.get("ROUTER_WARM_REMAINING", "1")
133
+ if warm_remaining not in {"1", "true", "True"}:
134
+ return
135
+
136
+ remaining = [name for name in MODELS if name not in PIPELINES]
137
+ if not remaining:
138
+ WARMED_REMAINING = True
139
+ return
140
+
141
+ def _warm_all():
142
+ for name in remaining:
143
+ try:
144
+ print(f"Background warm start for {name}")
145
+ load_pipeline(name)
146
+ except Exception as exc: # pragma: no cover
147
+ print(f"Warm start failed for {name}: {exc}")
148
+ WARMED_REMAINING = True
149
+
150
+ Thread(target=_warm_all, daemon=True).start()
151
+
152
+
153
  def build_router_prompt(
154
  user_task: str,
155
  context: str,
 
201
  raise ValueError("Router output JSON appears truncated.")
202
 
203
 
204
+ def is_function_call(text: str) -> bool:
205
+ return bool(TOOL_PATTERN.match(text.strip()))
206
+
207
+
208
  def validate_router_plan(plan: Dict[str, Any]) -> Tuple[bool, List[str]]:
209
  issues: List[str] = []
210
  for key in REQUIRED_KEYS:
211
  if key not in plan:
212
  issues.append(f"Missing key: {key}")
213
+
214
  route_plan = plan.get("route_plan")
215
+ if isinstance(route_plan, str) and is_function_call(route_plan):
216
+ plan["route_plan"] = [route_plan]
217
+ route_plan = plan["route_plan"]
218
  if not isinstance(route_plan, list) or not route_plan:
219
  issues.append("route_plan must be a non-empty list of tool calls")
220
+ else:
221
+ cleaned: List[str] = []
222
+ for entry in route_plan:
223
+ if isinstance(entry, str) and is_function_call(entry.strip().strip("'\"")):
224
+ cleaned.append(entry.strip().strip("'\""))
225
+ else:
226
+ issues.append(f"route_plan entry is not a tool call: {entry}")
227
+ if cleaned:
228
+ plan["route_plan"] = cleaned
229
+
230
  metrics = plan.get("metrics")
231
  if not isinstance(metrics, dict):
232
  issues.append("metrics must be an object containing primary/secondary entries")
233
  todo = plan.get("todo_list")
234
  if not isinstance(todo, list) or not todo:
235
  issues.append("todo_list must contain at least one checklist item")
236
+ else:
237
+ cleaned_todo: List[str] = []
238
+ for entry in todo:
239
+ if isinstance(entry, str):
240
+ text = entry.strip()
241
+ if not text.startswith("- ["):
242
+ text = text.lstrip("- ")
243
+ text = f"- [ ] {text}"
244
+ cleaned_todo.append(text)
245
+ else:
246
+ issues.append("todo_list entry must be a string")
247
+ if cleaned_todo:
248
+ plan["todo_list"] = cleaned_todo
249
+
250
  return len(issues) == 0, issues
251
 
252
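
The prefetch and warm-start paths are driven entirely by environment variables. A minimal local-run sketch (the model names in the comment are placeholders; real names must match keys of the MODELS dict in app.py):

    import os

    os.environ["ROUTER_PREFETCH_MODELS"] = "ALL"  # or a comma-separated list of MODELS keys
    os.environ["ROUTER_WARM_REMAINING"] = "1"     # default; set "0" to skip background warm-up

    import app  # _prefetch_from_env() runs at import time and loads the listed pipelines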
 
 
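
On endpoint naming: api_name is not a demo.launch() argument in Gradio; it is set on the event listener inside build_ui() (e.g. btn.click(..., api_name="generate_router_plan_streaming")), and launch(show_api=True) then exposes it. Assuming the handler is registered under that name, a client call might look like the sketch below; the argument list depends on the handler's actual inputs, which this diff does not show.

    from gradio_client import Client

    client = Client("http://localhost:7860")  # or the Space URL
    job = client.submit(
        "Plan a benchmark for a math-tutoring agent.",  # user_task (assumed first input)
        "",                                             # context (assumed second input)
        api_name="/generate_router_plan_streaming",
    )
    # result() blocks until the generator finishes and returns its final yield
    completion, parsed_plan, validation_msg, prompt = job.result()
    print(validation_msg)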