import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import snapshot_download

# --- CONFIGURATION ---
MODEL_ID = "Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4"

print(f"⚙️ Setting up environment for {MODEL_ID}...")

# Global variables (process-level cache for the loaded model/tokenizer)
model = None
tokenizer = None

# --- EXPLICIT DOWNLOAD FUNCTION ---
def download_model_first():
    print("⏳ Starting preventive weight download (this will take time)...")
    try:
        # Downloads the files into the Space cache WITHOUT consuming GPU time
        snapshot_download(repo_id=MODEL_ID)
        print("✅ Download complete! Files are cached.")
    except Exception as e:
        print(f"⚠️ Warning: download failed (or files already exist). Error: {e}")

def load_model():
    global model, tokenizer
    if model is None:
        print("🔥 Loading model into VRAM...")
        try:
            tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
            # Loads the previously downloaded files from the cache.
            # torch_dtype applies to the non-quantized modules; the GPTQ
            # weights themselves are already Int4.
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                device_map="auto",
                trust_remote_code=True,
                torch_dtype=torch.float16
            )
            print("✅ Qwen 72B is ready!")
        except Exception as e:
            print(f"❌ Critical error loading the model: {e}")
            raise
    return model, tokenizer
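
# Note: because load_model() is invoked inside the GPU-decorated generate()
# below, the very first chat request pays the full load cost (hence the
# warning in the UI); later calls reuse the cached model while the Space
# process stays warm.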

# --- GENERATION FUNCTION (ZEROGPU) ---
@spaces.GPU(duration=150)  # request up to 150 s of ZeroGPU time per call
def generate(message, history, system_prompt, temperature, max_tokens):
    model, tokenizer = load_model()

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    # Manual history handling (assumes tuple-style history: [(user, assistant), ...])
    for turn in history:
        if turn[0]:
            messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})
    messages.append({"role": "user", "content": message})

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        repetition_penalty=1.1
    )
    # Decode only the newly generated tokens, skipping the prompt
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return response
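
# A streaming variant (minimal sketch, not wired into the UI): transformers'
# TextIteratorStreamer lets generate() run in a background thread while text
# is yielded as it is produced, and gr.ChatInterface accepts generator
# functions, so this could replace `generate` above. Sampling parameters are
# kept identical; whether the ZeroGPU decorator behaves identically for
# generator functions is an assumption here.
@spaces.GPU(duration=150)
def generate_stream(message, history, system_prompt, temperature, max_tokens):
    from threading import Thread
    from transformers import TextIteratorStreamer

    model, tokenizer = load_model()

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    for turn in history:
        if turn[0]:
            messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})
    messages.append({"role": "user", "content": message})

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # The streamer decodes new tokens as they arrive; skip_prompt drops the input.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    Thread(target=model.generate, kwargs=dict(
        **inputs, streamer=streamer, max_new_tokens=max_tokens,
        temperature=temperature, do_sample=True, top_p=0.95, top_k=40,
        repetition_penalty=1.1,
    )).start()

    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial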

# --- INTERFACE ---
with gr.Blocks() as demo:
    gr.Markdown("# Qwen 72B ZeroGPU Test")
    # Requested warning notice
    gr.Markdown(
        """
        ### ⚠️ WARNING: Large Model Inference Test
        **This model (Qwen 72B) is extremely large.**
        * **Loading time:** Expect a long delay during the first initialization.
        * **Test environment:** This is a stress test for running Qwen 72B inference on a single ZeroGPU Space.
        """
    )

    with gr.Accordion("⚙️ Settings", open=False):
        sys_prompt = gr.Textbox(
            label="System Prompt",
            value="You are an expert AI assistant focused on complex coding solutions and software architecture.",
            lines=2
        )
        temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Temperature")
        tokens = gr.Slider(minimum=256, maximum=8192, value=4096, label="Max Tokens")

    chat = gr.ChatInterface(
        fn=generate,
        additional_inputs=[sys_prompt, temp, tokens]
    )

if __name__ == "__main__":
    download_model_first()
    demo.launch()