# Import required libraries
from unsloth import FastLanguageModel
import torch
from dotenv import load_dotenv
import os
import gc

# Load environment variables
load_dotenv()
token = os.getenv("HF_TOKEN")

# Parameter configuration
max_seq_length = 2048
dtype = None  # None for auto detection. Float16 for Tesla T4/V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage
# List of pre-quantized 4-bit models
quantized_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit",
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit",
]
# Load the model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/gemma-7b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=token,
)
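# (Optional sanity check, not part of the original script: report the loaded
# model's memory footprint. This assumes the Unsloth-loaded model exposes
# transformers' standard PreTrainedModel.get_memory_footprint() method.)
print(f"Model memory footprint: {model.get_memory_footprint() / 1024 ** 3:.2f} GB")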
# Add LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",  # "unsloth" checkpointing reduces VRAM usage
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)
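# (Optional, not part of the original script: since get_peft_model() returns a
# PEFT-wrapped model, print_trainable_parameters() can confirm that only the
# LoRA adapter weights are trainable. Assumes the standard PEFT API.)
model.print_trainable_parameters()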
# Memory housekeeping: release cached GPU memory and collect unreferenced
# Python objects before training
torch.cuda.empty_cache()
gc.collect()
# Data preparation
from datasets import load_dataset

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Append EOS so the model learns to stop generating

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for instruction, input_text, output in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instruction, input_text, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

dataset = load_dataset("yahma/alpaca-cleaned", split="train")
dataset = dataset.map(formatting_prompts_func, batched=True)
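# (Optional, not part of the original script: inspect one formatted training
# example to confirm the prompt template and EOS token were applied correctly.)
print(dataset[0]["text"][:400])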
# Model training
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=20,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        learning_rate=8e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    ),
)
# Show current memory statistics
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

# Train the model
trainer_stats = trainer.train()

# Show final memory and time statistics
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
# Inference
FastLanguageModel.for_inference(model)  # Enable Unsloth's faster inference path
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "Continue the fibonacci sequence.",  # instruction
            "1, 1, 2, 3, 5, 8",                  # input
            "",                                  # output - leave blank for generation
        )
    ], return_tensors="pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
print(tokenizer.batch_decode(outputs))
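# (Optional, not part of the original script: decode only the newly generated
# tokens by slicing off the prompt, which makes the answer easier to read.)
prompt_length = inputs["input_ids"].shape[1]
print(tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True))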
# Streaming inference using TextStreamer
from transformers import TextStreamer

text_streamer = TextStreamer(tokenizer)
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "Continue the fibonacci sequence.",
            "1, 1, 2, 3, 5, 8",
            "",
        )
    ], return_tensors="pt").to("cuda")

_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)
# Save and reload the fine-tuned model
model.save_pretrained("lora_model")       # Save the LoRA adapters locally
tokenizer.save_pretrained("lora_model")

if True:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="lora_model",
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    FastLanguageModel.for_inference(model)

inputs = tokenizer(
    [
        alpaca_prompt.format(
            "What is a famous tall tower in Paris?",
            "",
            "",
        )
    ], return_tensors="pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
print(tokenizer.batch_decode(outputs))
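# (Optional, not part of the original script: the saved adapters can also be
# loaded without Unsloth, via Hugging Face PEFT. A minimal sketch, assuming
# peft and transformers are installed.)
if False:
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer
    model = AutoPeftModelForCausalLM.from_pretrained("lora_model", torch_dtype=torch.float16).to("cuda")
    tokenizer = AutoTokenizer.from_pretrained("lora_model")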
# Save merged weights in float16 for vLLM
if True: model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit")
if True: model.push_to_hub_merged("Yjhhh/model", tokenizer, save_method="merged_16bit", token=token)
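# (Optional, not part of the original script: a minimal sketch of loading the
# merged float16 checkpoint with vLLM. Assumes vLLM is installed and that the
# "model" directory above holds the merged weights; flip the flag to run it as
# a separate step.)
if False:
    from vllm import LLM, SamplingParams
    llm = LLM(model="model", dtype="float16")
    sampling_params = SamplingParams(temperature=0.7, max_tokens=64)
    result = llm.generate(["What is a famous tall tower in Paris?"], sampling_params)
    print(result[0].outputs[0].text)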
# Save in GGUF format
if True: model.save_pretrained_gguf("model", tokenizer, quantization_method="q4_0")
if True: model.push_to_hub_gguf("Yjhhh/model", tokenizer, quantization_method="q4_0", token=token)
if True: model.save_pretrained_gguf("model", tokenizer, quantization_method="q4_1")
if True: model.push_to_hub_gguf("Yjhhh/model", tokenizer, quantization_method="q4_1", token=token)
if True: model.save_pretrained_gguf("model", tokenizer, quantization_method="q8_0")
if True: model.push_to_hub_gguf("Yjhhh/model", tokenizer, quantization_method="q8_0", token=token)
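# (Optional, not part of the original script: a minimal sketch of running one of
# the exported GGUF files with llama-cpp-python. Assumes llama-cpp-python is
# installed; the exact GGUF filename written into the "model" directory depends
# on the Unsloth version, so the path below is a placeholder to adjust.)
if False:
    from llama_cpp import Llama
    llm = Llama(model_path="model/unsloth.Q4_0.gguf", n_ctx=2048)  # placeholder path
    out = llm("Continue the fibonacci sequence: 1, 1, 2, 3, 5, 8,", max_tokens=64)
    print(out["choices"][0]["text"])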