import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces

# Model config
MODEL_ID = "WeiboAI/VibeThinker-1.5B"
SYSTEM_PROMPT = "You are a concise solver. Respond briefly."

# Load model
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
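# device_map="auto" needs the accelerate package and places the weights on the
# available GPU (or CPU); float16 halves the memory footprint of the 1.5B model.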
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
)
print("Model loaded!")

# @spaces.GPU asks Hugging Face Spaces (ZeroGPU) for a GPU for the duration of each call
@spaces.GPU
def chat_response(message, history, progress=gr.Progress()):
    """Generate a chat response (the full reply is returned at once, not streamed)."""
    
    # Handle inputs safely
    if message is None:
        message = "Hello"
    if history is None:
        history = []
    
    # Convert to string
    message = str(message)
    
    progress(0.1, desc="Building conversation...")
    
    # Build messages
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    
    # Add history
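    # NOTE: assumes the legacy tuple-style history ([user, assistant] pairs) that
    # gr.ChatInterface passes by default; with type="messages" each entry is a dict.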
    for user_msg, assistant_msg in history:
        if user_msg is not None:
            messages.append({"role": "user", "content": str(user_msg)})
        if assistant_msg is not None:
            messages.append({"role": "assistant", "content": str(assistant_msg)})
    
    progress(0.3, desc="Adding your message...")
    messages.append({"role": "user", "content": message})
    
    progress(0.5, desc="Formatting input...")
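    # apply_chat_template renders the messages with the model's chat template;
    # add_generation_prompt=True appends the assistant turn header so the model
    # continues with its reply.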
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    
    progress(0.6, desc="Tokenizing...")
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    
    progress(0.7, desc="Starting generation...")
    
    # Generate the full response (no token streaming)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1000,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )
    
    progress(0.9, desc="Decoding response...")
    
    # Decode only the newly generated tokens (everything after the prompt),
    # so the system prompt and chat history are not echoed back
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()
    
    progress(1.0, desc="Complete!")
    return response

def create_demo():
    """Create simple demo"""
    demo = gr.ChatInterface(
        fn=chat_response,
        title="VibeThinker Chat",
        description="Simple chat with VibeThinker-1.5B",
        examples=["2+2", "What is AI?", "Write a poem"]
    )
    return demo

if __name__ == "__main__":
    print("Starting...")
    demo = create_demo()
    demo.launch(share=False)