import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import snapshot_download

# --- CONFIGURATION ---
MODEL_ID = "Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4"

print(f"⚙️ Setting up environment for {MODEL_ID}...")

# Global variables (process-level cache for the loaded model/tokenizer)
model = None
tokenizer = None

# --- EXPLICIT DOWNLOAD FUNCTION ---
def download_model_first():
    print("⏳ Starting preventive weight download (this will take time)...")
    try:
        # Downloads the files into the Space cache WITHOUT consuming GPU time
        snapshot_download(repo_id=MODEL_ID)
        print("✅ Download complete! Files are cached.")
    except Exception as e:
        print(f"⚠️ Warning: download failed (or files already exist). Error: {e}")

def load_model():
    global model, tokenizer
    if model is None:
        print("🔥 Loading model into VRAM...")
        try:
            tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
            # Loads the previously downloaded files from the cache.
            # torch_dtype applies to the non-quantized modules; the GPTQ
            # weights themselves are already Int4.
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                device_map="auto",
                trust_remote_code=True,
                torch_dtype=torch.float16
            )
            print("✅ Qwen 72B is ready!")
        except Exception as e:
            print(f"❌ Critical error loading the model: {e}")
            raise
    return model, tokenizer
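
# Note: because load_model() is invoked inside the GPU-decorated generate()
# below, the very first chat request pays the full load cost (hence the
# warning in the UI); later calls reuse the cached model while the Space
# process stays warm.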

# --- GENERATION FUNCTION (ZEROGPU) ---
@spaces.GPU(duration=150)  # request up to 150 s of ZeroGPU time per call
def generate(message, history, system_prompt, temperature, max_tokens):
    model, tokenizer = load_model()

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    # Manual history handling (assumes tuple-style history: [(user, assistant), ...])
    for turn in history:
        if turn[0]:
            messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})
    messages.append({"role": "user", "content": message})

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        repetition_penalty=1.1
    )
    # Decode only the newly generated tokens, skipping the prompt
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return response
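
# A streaming variant (minimal sketch, not wired into the UI): transformers'
# TextIteratorStreamer lets generate() run in a background thread while text
# is yielded as it is produced, and gr.ChatInterface accepts generator
# functions, so this could replace `generate` above. Sampling parameters are
# kept identical; whether the ZeroGPU decorator behaves identically for
# generator functions is an assumption here.
@spaces.GPU(duration=150)
def generate_stream(message, history, system_prompt, temperature, max_tokens):
    from threading import Thread
    from transformers import TextIteratorStreamer

    model, tokenizer = load_model()

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    for turn in history:
        if turn[0]:
            messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})
    messages.append({"role": "user", "content": message})

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # The streamer decodes new tokens as they arrive; skip_prompt drops the input.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    Thread(target=model.generate, kwargs=dict(
        **inputs, streamer=streamer, max_new_tokens=max_tokens,
        temperature=temperature, do_sample=True, top_p=0.95, top_k=40,
        repetition_penalty=1.1,
    )).start()

    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial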

# --- INTERFACE ---
with gr.Blocks() as demo:
    gr.Markdown("# Qwen 72B ZeroGPU Test")
    # Requested warning notice
    gr.Markdown(
        """
        ### ⚠️ WARNING: Large Model Inference Test
        **This model (Qwen 72B) is extremely large.**
        * **Loading time:** Expect a long delay during the first initialization.
        * **Test environment:** This is a stress test for running Qwen 72B inference on a single ZeroGPU Space.
        """
    )

    with gr.Accordion("⚙️ Settings", open=False):
        sys_prompt = gr.Textbox(
            label="System Prompt",
            value="You are an expert AI assistant focused on complex coding solutions and software architecture.",
            lines=2
        )
        temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Temperature")
        tokens = gr.Slider(minimum=256, maximum=8192, value=4096, label="Max Tokens")

    chat = gr.ChatInterface(
        fn=generate,
        additional_inputs=[sys_prompt, temp, tokens]
    )

if __name__ == "__main__":
    download_model_first()
    demo.launch()