Alikestocode committed
Commit 06b4cf5 · Parent: cdac920

Migrate to AWQ quantization with FlashAttention-2


- Replace BitsAndBytes 8-bit with AWQ 4-bit quantization (primary path)
- Add FlashAttention-2 support for optimized attention
- Enable TF32 math for Ampere+ GPUs (a runtime check for TF32 and FlashAttention-2 availability is sketched after this list)
- Add CUDA kernel warmup on startup to reduce first-token latency
- Update model names to reflect AWQ optimization
- Add graceful fallback chain: AWQ -> BitsAndBytes -> bf16/fp16/fp32
- Update requirements.txt with flash-attn>=2.5.0
- Update README with performance optimizations documentation
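TF32 and FlashAttention-2 only take effect when the hardware and build support them. A minimal, hypothetical check (not part of this commit) that confirms the running GPU is Ampere or newer and that `flash_attn` is importable could look like this:

# Hypothetical environment check, not part of this commit: confirms the
# assumptions behind the TF32 and FlashAttention-2 optimizations.
import importlib.util

import torch

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    # TF32 matmul kernels require Ampere (compute capability 8.0) or newer.
    print(f"GPU compute capability: {major}.{minor} (TF32 usable: {major >= 8})")
    print(f"TF32 matmul enabled: {torch.backends.cuda.matmul.allow_tf32}")
else:
    print("No CUDA device visible; TF32/FlashAttention-2 will not apply.")

# flash-attn is optional; app.py falls back to the default attention
# implementation when this import is missing.
print(f"flash_attn installed: {importlib.util.find_spec('flash_attn') is not None}")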

Files changed (4)
  1. README.md +1 -0
  2. app.py +149 -24
  3. requirements.txt +1 -0
  4. test_api.py +107 -0
README.md CHANGED
@@ -62,3 +62,4 @@ python app.py
 - The UI enforces single-turn router generations; conversation history and web search are intentionally omitted to match the Milestone 6 deliverable.
 - If you need to re-enable web search or more checkpoints, extend `MODELS` and adjust the prompt builder accordingly.
 - **Benchmarking:** run `python Milestone-6/router-agent/tests/run_router_space_benchmark.py --space Alovestocode/ZeroGPU-LLM-Inference --limit 32` (requires `pip install gradio_client`) to call the Space, dump predictions, and evaluate against the Milestone 5 hard suite + thresholds.
+- Set `ROUTER_PREFETCH_MODEL` (single value) or `ROUTER_PREFETCH_MODELS=Router-Qwen3-32B-8bit,Router-Gemma3-27B-8bit` (comma-separated, `ALL` for every checkpoint) to warm-load weights during startup. Disable background warming by setting `ROUTER_WARM_REMAINING=0`.
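The prefetch variables above are read by app.py at startup. A small illustrative launch helper (hypothetical, not part of this commit; checkpoint names assume the -AWQ keys introduced below, and `HF_TOKEN` must already be set for the private router checkpoints):

# Illustrative local launch helper showing how the prefetch variables are passed
# to the process that runs app.py.
import os
import subprocess

env = dict(os.environ)
env["ROUTER_PREFETCH_MODELS"] = "Router-Qwen3-32B-AWQ,Router-Gemma3-27B-AWQ"  # or "ALL"
env["ROUTER_WARM_REMAINING"] = "0"  # disable background warming of the remaining checkpoints

subprocess.run(["python", "app.py"], env=env, check=True)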
app.py CHANGED
@@ -8,9 +8,37 @@ from typing import Any, Dict, List, Tuple
 import gradio as gr
 import spaces
 import torch
-from transformers import AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer, pipeline
+from transformers import AutoTokenizer, TextIteratorStreamer, pipeline
 from threading import Thread
 
+# Enable optimizations
+torch.backends.cuda.matmul.allow_tf32 = True
+
+# Try to import AWQ, fallback to BitsAndBytes if not available
+try:
+    from awq import AutoAWQForCausalLM
+    AWQ_AVAILABLE = True
+except ImportError:
+    AWQ_AVAILABLE = False
+    print("Warning: AutoAWQ not available, falling back to BitsAndBytes")
+
+# Always import BitsAndBytesConfig for fallback
+try:
+    from transformers import BitsAndBytesConfig
+    BITSANDBYTES_AVAILABLE = True
+except ImportError:
+    BITSANDBYTES_AVAILABLE = False
+    BitsAndBytesConfig = None
+    print("Warning: BitsAndBytes not available")
+
+# Try to import FlashAttention-2
+try:
+    import flash_attn
+    FLASH_ATTN_AVAILABLE = True
+except ImportError:
+    FLASH_ATTN_AVAILABLE = False
+    print("Warning: FlashAttention-2 not available")
+
 HF_TOKEN = os.environ.get("HF_TOKEN")
 if not HF_TOKEN:
     raise RuntimeError("HF_TOKEN environment variable must be set for private router checkpoints.")
@@ -21,14 +49,14 @@ STOP_SEQUENCES = [PLAN_END_TOKEN, "</json>", "</JSON>"]
 ROUTER_SYSTEM_PROMPT = """You are the Router Agent coordinating Math, Code, and General-Search specialists.\nEmit EXACTLY ONE strict JSON object with keys route_plan, route_rationale, expected_artifacts,\nthinking_outline, handoff_plan, todo_list, difficulty, tags, acceptance_criteria, metrics.\nRules:\n- No markdown/code fences, no natural-language prologues or epilogues.\n- route_plan must be an ordered list of tool invocations such as /math(...), /code(...), /general-search(...).\n- todo_list must map each checklist item to the responsible tool.\n- metrics must include primary and secondary arrays (add optional *_guidance fields when they exist).\n- After the closing brace of the JSON object, immediately append the sentinel <|end_of_plan|>.\nExample output:\n{\n "route_plan": ["/general-search(...)"],\n "route_rationale": "...",\n ...\n}<|end_of_plan|>\nReturn nothing else."""
 
 MODELS = {
-    "Router-Qwen3-32B-8bit": {
+    "Router-Qwen3-32B-AWQ": {
         "repo_id": "Alovestocode/router-qwen3-32b-merged",
-        "description": "Router checkpoint on Qwen3 32B merged and quantized for 8-bit ZeroGPU inference.",
+        "description": "Router checkpoint on Qwen3 32B merged, optimized with AWQ quantization and FlashAttention-2.",
         "params_b": 32.0,
     },
-    "Router-Gemma3-27B-8bit": {
+    "Router-Gemma3-27B-AWQ": {
         "repo_id": "Alovestocode/router-gemma3-merged",
-        "description": "Router checkpoint on Gemma3 27B merged and quantized for 8-bit ZeroGPU inference.",
+        "description": "Router checkpoint on Gemma3 27B merged, optimized with AWQ quantization and FlashAttention-2.",
         "params_b": 27.0,
     },
 }
@@ -56,7 +84,12 @@ def get_tokenizer(repo: str):
     tok = TOKENIZER_CACHE.get(repo)
     if tok is not None:
         return tok
-    tok = AutoTokenizer.from_pretrained(repo, token=HF_TOKEN)
+    tok = AutoTokenizer.from_pretrained(
+        repo,
+        token=HF_TOKEN,
+        use_fast=True,
+        trust_remote_code=True
+    )
     tok.padding_side = "left"
     tok.truncation_side = "left"
     if tok.pad_token_id is None and tok.eos_token_id is not None:
@@ -65,6 +98,35 @@ def get_tokenizer(repo: str):
     return tok
 
 
+def load_awq_pipeline(repo: str, tokenizer):
+    """Load AWQ-quantized model with FlashAttention-2."""
+    model = AutoAWQForCausalLM.from_quantized(
+        repo,
+        fuse_layers=True,
+        trust_remote_code=True,
+        device_map="auto",
+        token=HF_TOKEN,
+    )
+
+    # Prepare model kwargs with FlashAttention-2 if available
+    model_kwargs = {}
+    if FLASH_ATTN_AVAILABLE:
+        model_kwargs["attn_implementation"] = "flash_attention_2"
+
+    pipe = pipeline(
+        task="text-generation",
+        model=model,
+        tokenizer=tokenizer,
+        trust_remote_code=True,
+        device_map="auto",
+        model_kwargs=model_kwargs,
+        use_cache=True,
+        torch_dtype=torch.bfloat16,
+    )
+    pipe.model.eval()
+    return pipe
+
+
 def load_pipeline(model_name: str):
     if model_name in PIPELINES:
         return PIPELINES[model_name]
@@ -72,27 +134,52 @@ def load_pipeline(model_name: str):
     repo = MODELS[model_name]["repo_id"]
     tokenizer = get_tokenizer(repo)
 
-    try:
-        quant_config = BitsAndBytesConfig(load_in_8bit=True)
-        pipe = pipeline(
-            task="text-generation",
-            model=repo,
-            tokenizer=tokenizer,
-            trust_remote_code=True,
-            device_map="auto",
-            model_kwargs={"quantization_config": quant_config},
-            use_cache=True,
-            token=HF_TOKEN,
-        )
-        pipe.model.eval()
-        PIPELINES[model_name] = pipe
-        _schedule_background_warm(model_name)
-        return pipe
-    except Exception as exc:
-        print(f"8-bit load failed for {repo}: {exc}. Falling back to higher precision.")
+    # Try AWQ first if available
+    if AWQ_AVAILABLE:
+        try:
+            print(f"Loading {repo} with AWQ quantization...")
+            pipe = load_awq_pipeline(repo, tokenizer)
+            PIPELINES[model_name] = pipe
+            _schedule_background_warm(model_name)
+            # Warm kernels immediately after loading
+            Thread(target=lambda: _warm_kernels(model_name), daemon=True).start()
+            return pipe
+        except Exception as exc:
+            print(f"AWQ load failed for {repo}: {exc}. Falling back to BitsAndBytes.")
 
+    # Fallback to BitsAndBytes 8-bit
+    if BITSANDBYTES_AVAILABLE:
+        try:
+            quant_config = BitsAndBytesConfig(load_in_8bit=True)
+            model_kwargs = {"quantization_config": quant_config}
+            if FLASH_ATTN_AVAILABLE:
+                model_kwargs["attn_implementation"] = "flash_attention_2"
+
+            pipe = pipeline(
+                task="text-generation",
+                model=repo,
+                tokenizer=tokenizer,
+                trust_remote_code=True,
+                device_map="auto",
+                model_kwargs=model_kwargs,
+                use_cache=True,
+                token=HF_TOKEN,
+                torch_dtype=torch.bfloat16,
+            )
+            pipe.model.eval()
+            PIPELINES[model_name] = pipe
+            _schedule_background_warm(model_name)
+            return pipe
+        except Exception as exc:
+            print(f"8-bit load failed for {repo}: {exc}. Falling back to higher precision.")
+
+    # Fallback to bfloat16/fp16/fp32
     for dtype in (torch.bfloat16, torch.float16, torch.float32):
         try:
+            model_kwargs = {}
+            if FLASH_ATTN_AVAILABLE:
+                model_kwargs["attn_implementation"] = "flash_attention_2"
+
             pipe = pipeline(
                 task="text-generation",
                 model=repo,
@@ -100,6 +187,7 @@ def load_pipeline(model_name: str):
                 trust_remote_code=True,
                 device_map="auto",
                 dtype=dtype,
+                model_kwargs=model_kwargs,
                 use_cache=True,
                 token=HF_TOKEN,
             )
@@ -110,12 +198,18 @@ def load_pipeline(model_name: str):
         except Exception:
             continue
 
+    # Final fallback
+    model_kwargs = {}
+    if FLASH_ATTN_AVAILABLE:
+        model_kwargs["attn_implementation"] = "flash_attention_2"
+
     pipe = pipeline(
         task="text-generation",
         model=repo,
        tokenizer=tokenizer,
        trust_remote_code=True,
        device_map="auto",
+       model_kwargs=model_kwargs,
        use_cache=True,
        token=HF_TOKEN,
    )
@@ -125,6 +219,35 @@ def load_pipeline(model_name: str):
     return pipe
 
 
+def _warm_kernels(model_name: str) -> None:
+    """Warm up CUDA kernels with a small dummy generation."""
+    try:
+        pipe = PIPELINES.get(model_name)
+        if pipe is None:
+            return
+
+        tokenizer = pipe.tokenizer
+        # Create a minimal prompt for warmup
+        warmup_text = "test"
+        inputs = tokenizer(warmup_text, return_tensors="pt")
+        if hasattr(pipe.model, 'device'):
+            inputs = {k: v.to(pipe.model.device) for k, v in inputs.items()}
+        elif torch.cuda.is_available():
+            inputs = {k: v.cuda() for k, v in inputs.items()}
+
+        # Run a tiny generation to JIT-fuse kernels
+        with torch.inference_mode():
+            _ = pipe.model.generate(
+                **inputs,
+                max_new_tokens=2,
+                do_sample=False,
+                use_cache=True,
+            )
+        print(f"Kernels warmed for {model_name}")
+    except Exception as exc:
+        print(f"Kernel warmup failed for {model_name}: {exc}")
+
+
 def _schedule_background_warm(loaded_model: str) -> None:
     global WARMED_REMAINING
     if WARMED_REMAINING:
@@ -143,6 +266,8 @@ def _schedule_background_warm(loaded_model: str) -> None:
         try:
             print(f"Background warm start for {name}")
             load_pipeline(name)
+            # Warm kernels after loading
+            _warm_kernels(name)
         except Exception as exc:  # pragma: no cover
            print(f"Warm start failed for {name}: {exc}")
    WARMED_REMAINING = True
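A quick way to exercise the new fallback chain locally is a small smoke test. This is a sketch, not part of the commit: it assumes `HF_TOKEN` is exported, app.py is importable from the working directory, and the model key matches the renamed `MODELS` entry above.

# Minimal local smoke test (illustrative). Importing app runs the module-level
# HF_TOKEN check, so export the token first.
import os

assert os.environ.get("HF_TOKEN"), "export HF_TOKEN before running this sketch"

from app import MODELS, load_pipeline

model_name = "Router-Qwen3-32B-AWQ"   # key from the MODELS dict above
pipe = load_pipeline(model_name)       # resolves AWQ -> BitsAndBytes -> bf16/fp16/fp32
out = pipe(
    "Route this task: summarise a CSV of experiment results.",
    max_new_tokens=64,
    do_sample=False,
)
print(out[0]["generated_text"])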
requirements.txt CHANGED
@@ -8,6 +8,7 @@ spaces
 sentencepiece
 accelerate
 autoawq
+flash-attn>=2.5.0
 timm
 compressed-tensors
 bitsandbytes
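Note that flash-attn typically compiles CUDA extensions at install time and needs a compatible torch build and CUDA toolchain; a quick post-install check (illustrative, not part of this commit) is enough, since app.py degrades gracefully when the import fails:

# Verify the optional flash-attn dependency after installing requirements.txt.
try:
    import flash_attn
    print("flash-attn", flash_attn.__version__)
except ImportError as exc:
    print("flash-attn unavailable, default attention will be used:", exc)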
test_api.py ADDED
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+"""
+Test script for ZeroGPU LLM Inference API
+Usage: python test_api.py
+"""
+
+import requests
+import json
+import sys
+
+API_URL = "https://Alovestocode-ZeroGPU-LLM-Inference.hf.space"
+
+def test_api():
+    """Test the API endpoint"""
+    print("=" * 60)
+    print("Testing ZeroGPU LLM Inference API")
+    print("=" * 60)
+
+    # Test 1: Check if space is accessible
+    print("\n1. Checking if space is accessible...")
+    try:
+        response = requests.get(API_URL, timeout=10)
+        if response.status_code == 200:
+            print("   ✅ Space is accessible")
+        else:
+            print(f"   ⚠️ Space returned status {response.status_code}")
+    except Exception as e:
+        print(f"   ❌ Error: {e}")
+        return False
+
+    # Test 2: Check API info
+    print("\n2. Checking API info...")
+    try:
+        response = requests.get(f"{API_URL}/api/info", timeout=10)
+        print(f"   Status: {response.status_code}")
+        if response.status_code == 200:
+            print("   ✅ API info endpoint accessible")
+    except Exception as e:
+        print(f"   ⚠️ Error: {e}")
+
+    # Test 3: Try the API endpoint
+    print("\n3. Testing API endpoint...")
+    payload = {
+        "data": [
+            "Solve a quadratic equation using Python",
+            "",
+            "- Provide step-by-step solution",
+            "",
+            "intermediate",
+            "math, python",
+            "Router-Qwen3-32B-8bit",
+            256,  # Small token count for quick test
+            0.2,
+            0.9
+        ],
+        "fn_index": 0
+    }
+
+    try:
+        print(f"   Sending request to {API_URL}/api/predict...")
+        response = requests.post(
+            f"{API_URL}/api/predict",
+            json=payload,
+            timeout=120  # Longer timeout for model loading
+        )
+
+        print(f"   Status Code: {response.status_code}")
+
+        if response.status_code == 200:
+            print("   ✅ API is working!")
+            result = response.json()
+            print(f"\n   Response structure:")
+            if isinstance(result, dict):
+                print(f"   Keys: {list(result.keys())}")
+                if "data" in result:
+                    print(f"   Data length: {len(result['data'])}")
+                    if len(result['data']) > 0:
+                        print(f"   First output preview: {str(result['data'][0])[:200]}")
+            else:
+                print(f"   Result: {str(result)[:300]}")
+            return True
+        else:
+            print(f"   ❌ API returned status {response.status_code}")
+            print(f"   Response: {response.text[:500]}")
+            return False
+
+    except requests.exceptions.Timeout:
+        print("   ⚠️ Request timed out (this might be normal for first request due to model loading)")
+        return False
+    except Exception as e:
+        print(f"   ❌ Error: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+if __name__ == "__main__":
+    success = test_api()
+    print("\n" + "=" * 60)
+    if success:
+        print("✅ API test completed successfully!")
+    else:
+        print("⚠️ API test had issues. The space might still be building.")
+        print("   Wait a few minutes and try again, or check the space status at:")
+        print(f"   {API_URL}")
+    print("=" * 60)
+    sys.exit(0 if success else 1)
+
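The raw `/api/predict` endpoint and `fn_index` used above can shift between Gradio versions; the README's `gradio_client` path is usually steadier. A hedged alternative check (the positional inputs simply mirror test_api.py's `data` list, and `fn_index=0` is an assumption; confirm the actual endpoint with `client.view_api()`):

# Alternative check via gradio_client (pip install gradio_client).
from gradio_client import Client

client = Client("Alovestocode/ZeroGPU-LLM-Inference")
print(client.view_api())  # lists endpoints and expected parameters

result = client.predict(
    "Solve a quadratic equation using Python",
    "",
    "- Provide step-by-step solution",
    "",
    "intermediate",
    "math, python",
    "Router-Qwen3-32B-AWQ",
    256,
    0.2,
    0.9,
    fn_index=0,
)
print(str(result)[:300])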