import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import snapshot_download

# --- CONFIGURATION ---
MODEL_ID = "Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4"

print(f"⚙️ Setting up environment for {MODEL_ID}...")

# Global variables (cache)
model = None
tokenizer = None

# --- EXPLICIT DOWNLOAD FUNCTION ---
def download_model_first():
    print("⏳ Starting preventive weight download (this will take time)...")
    try:
        # Download the files to the Space cache WITHOUT using GPU time
        snapshot_download(repo_id=MODEL_ID)
        print("✅ Download complete! Files are cached.")
    except Exception as e:
        print(f"⚠️ Warning: download failed (files may already be cached). Error: {e}")

def load_model():
    global model, tokenizer
    if model is None:
        print("🔥 Loading model into VRAM...")
        try:
            tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
            # Load the previously downloaded files from the local cache
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                device_map="auto",
                trust_remote_code=True,
                torch_dtype=torch.float16
            )
            print("✅ Qwen 72B is ready!")
        except Exception as e:
            print(f"❌ Critical error loading the model: {e}")
            raise
    return model, tokenizer

# --- GENERATION FUNCTION (ZEROGPU) ---
@spaces.GPU(duration=150)
def generate(message, history, system_prompt, temperature, max_tokens):
    model, tokenizer = load_model()

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    # Manual history handling. Depending on the Gradio version, each history
    # entry is either a (user, assistant) tuple or an OpenAI-style
    # {"role": ..., "content": ...} dict, so handle both.
    for turn in history:
        if isinstance(turn, dict):
            messages.append({"role": turn["role"], "content": turn["content"]})
        else:
            if turn[0]:
                messages.append({"role": "user", "content": turn[0]})
            if turn[1]:
                messages.append({"role": "assistant", "content": turn[1]})

    messages.append({"role": "user", "content": message})

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=int(max_tokens),  # sliders may return floats; generate expects an int
        temperature=temperature,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        repetition_penalty=1.1
    )

    # Decode only the newly generated tokens, skipping the prompt
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return response

# --- INTERFACE ---
with gr.Blocks() as demo:
    gr.Markdown("# Qwen 72B ZeroGPU Test")

    # Warning notice
    gr.Markdown(
        """
        ### ⚠️ WARNING: Large Model Inference Test
        **This model (Qwen 72B) is extremely large.**
        * **Loading time:** Expect a long delay during the first initialization.
        * **Test environment:** This is a stress test for running Qwen 72B inference on a single ZeroGPU Space.
        """
    )

    with gr.Accordion("⚙️ Settings", open=False):
        sys_prompt = gr.Textbox(
            label="System Prompt",
            value="You are an expert AI assistant focused on complex coding solutions and software architecture.",
            lines=2
        )
        temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Temperature")
        tokens = gr.Slider(minimum=256, maximum=8192, value=4096, label="Max Tokens")

    chat = gr.ChatInterface(
        fn=generate,
        additional_inputs=[sys_prompt, temp, tokens]
    )

if __name__ == "__main__":
    download_model_first()
    demo.launch()
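
# --- NOTES ON DEPENDENCIES (assumed, not part of the original script) ---
# A GPTQ-quantized checkpoint such as Qwen2.5-72B-Instruct-GPTQ-Int4 is loaded
# through transformers' GPTQ integration, which typically requires `optimum`
# plus `auto-gptq` (or its successor, `gptqmodel`) alongside the imports above,
# and `accelerate` for device_map="auto". A plausible requirements.txt sketch
# for this Space:
#
#   gradio
#   spaces
#   torch
#   transformers
#   accelerate        # needed for device_map="auto"
#   optimum
#   auto-gptq         # or: gptqmodel, on newer transformers versions
#   huggingface_hub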