import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import snapshot_download

# --- CONFIGURATION ---
MODEL_ID = "Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4"

print(f"⚙️ Setting up environment for {MODEL_ID}...")

# Global variables (cache)
model = None
tokenizer = None

# --- EXPLICIT DOWNLOAD FUNCTION ---
def download_model_first():
    print("⏳ Starting preventive weight download (this will take time)...")
    try:
        # Download the files to the Space cache WITHOUT using GPU time
        snapshot_download(repo_id=MODEL_ID)
        print("✅ Download complete! Files are cached.")
    except Exception as e:
        print(f"⚠️ Warning: download failed (files may already be cached). Error: {e}")

def load_model():
    global model, tokenizer
    if model is None:
        print("🔥 Loading model into VRAM...")
        try:
            tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
            # Load the previously downloaded files from the local cache
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                device_map="auto",
                trust_remote_code=True,
                torch_dtype=torch.float16
            )
            print("✅ Qwen 72B is ready!")
        except Exception as e:
            print(f"❌ Critical error loading the model: {e}")
            raise
    return model, tokenizer

# --- GENERATION FUNCTION (ZEROGPU) ---
@spaces.GPU(duration=150)
def generate(message, history, system_prompt, temperature, max_tokens):
    model, tokenizer = load_model()

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    # Manual history handling. Depending on the Gradio version, each history
    # entry is either a (user, assistant) tuple or an OpenAI-style
    # {"role": ..., "content": ...} dict, so handle both.
    for turn in history:
        if isinstance(turn, dict):
            messages.append({"role": turn["role"], "content": turn["content"]})
        else:
            if turn[0]:
                messages.append({"role": "user", "content": turn[0]})
            if turn[1]:
                messages.append({"role": "assistant", "content": turn[1]})

    messages.append({"role": "user", "content": message})

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=int(max_tokens),  # sliders may return floats; generate expects an int
        temperature=temperature,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        repetition_penalty=1.1
    )

    # Decode only the newly generated tokens, skipping the prompt
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return response

# --- INTERFACE ---
with gr.Blocks() as demo:
    gr.Markdown("# Qwen 72B ZeroGPU Test")

    # Warning notice
    gr.Markdown(
        """
        ### ⚠️ WARNING: Large Model Inference Test
        **This model (Qwen 72B) is extremely large.**
        * **Loading time:** Expect a long delay during the first initialization.
        * **Test environment:** This is a stress test for running Qwen 72B inference on a single ZeroGPU Space.
        """
    )

    with gr.Accordion("⚙️ Settings", open=False):
        sys_prompt = gr.Textbox(
            label="System Prompt",
            value="You are an expert AI assistant focused on complex coding solutions and software architecture.",
            lines=2
        )
        temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Temperature")
        tokens = gr.Slider(minimum=256, maximum=8192, value=4096, label="Max Tokens")

    chat = gr.ChatInterface(
        fn=generate,
        additional_inputs=[sys_prompt, temp, tokens]
    )

if __name__ == "__main__":
    download_model_first()
    demo.launch()
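
# --- NOTES ON DEPENDENCIES (assumed, not part of the original script) ---
# A GPTQ-quantized checkpoint such as Qwen2.5-72B-Instruct-GPTQ-Int4 is loaded
# through transformers' GPTQ integration, which typically requires `optimum`
# plus `auto-gptq` (or its successor, `gptqmodel`) alongside the imports above,
# and `accelerate` for device_map="auto". A plausible requirements.txt sketch
# for this Space:
#
#   gradio
#   spaces
#   torch
#   transformers
#   accelerate        # needed for device_map="auto"
#   optimum
#   auto-gptq         # or: gptqmodel, on newer transformers versions
#   huggingface_hub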