Javedalam committed on
Commit fd3f5ee · verified · 1 Parent(s): c878c9a

Create app.py

Files changed (1)
  1. app.py +91 -0
app.py ADDED
@@ -0,0 +1,91 @@
+ import os
+ import threading
+ import gradio as gr
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+
+ # --- Config ---
+ MODEL_ID = os.getenv("MODEL_ID", "WeiboAI/VibeThinker-1.5B")
+ SYSTEM_PROMPT = os.getenv(
+     "SYSTEM_PROMPT",
+     "You are a concise solver. Return a single short answer. Do not explain."
+ )
+ TEMPERATURE = float(os.getenv("TEMPERATURE", "0.2"))
+ TOP_P = float(os.getenv("TOP_P", "0.9"))
+ MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "256"))
+
+ # --- Load ---
+ print(f"Loading model: {MODEL_ID}")
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+ # Use CPU on ZeroGPU; float32 avoids CPU bf16 issues on some wheels
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_ID,
+     trust_remote_code=True,
+     low_cpu_mem_usage=True,
+     torch_dtype=torch.float32
+ ).to("cpu").eval()
+ print("Model loaded.")
+
+ def build_prompt(message, history):
+     """Use the model's chat template if available."""
+     messages = [{"role": "system", "content": SYSTEM_PROMPT}]
+     if history:
+         for user_msg, assistant_msg in history:
+             if user_msg:
+                 messages.append({"role": "user", "content": str(user_msg)})
+             if assistant_msg:
+                 messages.append({"role": "assistant", "content": str(assistant_msg)})
+     messages.append({"role": "user", "content": str(message or '')})
+
+     try:
+         prompt = tokenizer.apply_chat_template(
+             messages, tokenize=False, add_generation_prompt=True
+         )
+     except Exception:
+         # Fallback (shouldn't hit for Qwen-style models)
+         prompt = f"[SYSTEM]\n{SYSTEM_PROMPT}\n[USER]\n{message}\n[ASSISTANT]\n"
+     return prompt
+
+ def chat_fn(message, history):
+     """Streamed generation compatible with gr.ChatInterface (yields partials)."""
+     prompt = build_prompt(message, history)
+     inputs = tokenizer([prompt], return_tensors="pt")
+     inputs = {k: v.to(model.device) for k, v in inputs.items()}
+
+     streamer = TextIteratorStreamer(
+         tokenizer, skip_prompt=True, skip_special_tokens=True
+     )
+     gen_kwargs = dict(
+         **inputs,
+         streamer=streamer,
+         do_sample=True,
+         temperature=TEMPERATURE,
+         top_p=TOP_P,
+         max_new_tokens=MAX_NEW_TOKENS,
+         repetition_penalty=1.05,
+         eos_token_id=tokenizer.eos_token_id,
+         pad_token_id=tokenizer.eos_token_id,
+     )
+
+     thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
+     thread.start()
+
+     partial = ""
+     for new_text in streamer:
+         partial += new_text
+         # Optional hard stop: if user wants one-liners, cut after first newline.
+         # idx = partial.find("\n")
+         # if idx != -1:
+         #     yield partial[:idx].strip()
+         #     return
+         yield partial.strip()
+
+ demo = gr.ChatInterface(
+     fn=chat_fn,
+     title="VibeThinker-1.5B Chat (CPU)",
+     description="WeiboAI/VibeThinker-1.5B • Simple streaming chat on CPU. "
+                 "Set MODEL_ID/TEMPERATURE/TOP_P/MAX_NEW_TOKENS in Space Variables."
+ )
+
+ if __name__ == "__main__":
+     demo.queue().launch()
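
For reference, a minimal local smoke test of the streaming chat_fn generator above (a sketch only, assuming gradio, torch, and transformers are installed and the model weights can be downloaded; the example prompt, the empty history, and the file name smoke_test.py are illustrative, not part of the commit):

# smoke_test.py (hypothetical, not part of this commit)
from app import chat_fn  # importing app loads the model on CPU

final = ""
# chat_fn yields progressively longer partial strings as tokens stream in.
for partial in chat_fn("What is 17 * 23?", []):
    final = partial

print(final)  # the last yielded value is the complete answer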