import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import snapshot_download

MODEL_ID = "Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4"

print(f"⚙️ Setting up environment for {MODEL_ID}...")

# Globals filled in lazily by load_model() on the first request.
model = None
tokenizer = None


def download_model_first():
    """Pre-download the model weights so the first GPU call doesn't spend its time window downloading."""
    print("⏳ Starting preventive weight download (this will take time)...")
    try:
        snapshot_download(repo_id=MODEL_ID)
        print("✅ Download complete! Files are cached.")
    except Exception as e:
        print(f"⚠️ Warning: download failed or files are already cached. Error: {e}")


def load_model():
    """Load the tokenizer and quantized model once, then reuse them across requests."""
    global model, tokenizer
    if model is None:
        print("📥 Loading model into VRAM...")
        try:
            tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                device_map="auto",
                trust_remote_code=True,
                torch_dtype=torch.float16,
            )
            print("✅ Qwen 72B is ready!")
        except Exception as e:
            print(f"❌ Critical error loading the model: {e}")
            raise
    return model, tokenizer
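
# device_map="auto" requires `accelerate` and lets transformers place the shards on
# the available GPU. Even at Int4, 72B parameters amount to roughly 40 GB of weights,
# so this is assumed to need a large (80 GB-class) ZeroGPU device.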


# @spaces.GPU borrows a ZeroGPU device for each call; duration=150 requests up to
# 150 seconds of GPU time before the allocation is reclaimed.
@spaces.GPU(duration=150)
def generate(message, history, system_prompt, temperature, max_tokens):
    model, tokenizer = load_model()

    # Rebuild the conversation in the chat-template format expected by the tokenizer.
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    # Assumes Gradio's tuple-style history: each turn is (user_message, assistant_message).
    # Newer Gradio versions may hand ChatInterface history over as "messages" dicts
    # instead, in which case this loop would need adjusting.
    for turn in history:
        if turn[0]:
            messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})

    messages.append({"role": "user", "content": message})

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
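    # `text` is now the serialized prompt in Qwen's ChatML-style chat format, ending
    # with the assistant header that add_generation_prompt=True appends, so the model
    # continues the conversation from there.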

    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        repetition_penalty=1.1,
    )

    # Decode only the newly generated tokens, slicing off the prompt portion of the output.
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return response
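
# Note: with max_tokens up to 8192, a 72B model can easily outlast the 150-second GPU
# window requested above; generation here is non-streaming, so nothing is returned
# until it finishes and long requests may be cut off by ZeroGPU first.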


with gr.Blocks() as demo:
    gr.Markdown("# Qwen 72B ZeroGPU Test")

    gr.Markdown(
        """
### ⚠️ WARNING: Large Model Inference Test

**This model (Qwen 72B) is extremely large.**

* **Loading time:** There may be a massive delay during the first initialization.
* **Test Environment:** This is a stress test for running Qwen 72B inference on a single ZeroGPU Space.
        """
    )

    with gr.Accordion("⚙️ Settings", open=False):
        sys_prompt = gr.Textbox(
            label="System Prompt",
            value="You are an expert AI assistant focused on complex coding solutions and software architecture.",
            lines=2,
        )
        temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Temperature")
        tokens = gr.Slider(minimum=256, maximum=8192, value=4096, label="Max Tokens")
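        # The 8192 cap mirrors the maximum generation length commonly quoted for
        # Qwen2.5 instruct models; treat the exact figure as an assumption to verify
        # against the model card.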

    chat = gr.ChatInterface(
        fn=generate,
        additional_inputs=[sys_prompt, temp, tokens],
    )
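    # ChatInterface passes the additional_inputs values to generate() after
    # (message, history), in the same order as its (system_prompt, temperature,
    # max_tokens) parameters.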
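

# When the Space runs this file as its entry script, the __main__ block below executes,
# so the weights are fetched (or found in the cache) at startup, before any ZeroGPU
# allocation is requested.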
if __name__ == "__main__":
    download_model_first()
    demo.launch()