import os
import time

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW

import matplotlib.pyplot as plt
import matplotlib.animation as animation

from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from diffusers import DiffusionPipeline
from huggingface_hub import login, HfApi, Repository
from dotenv import load_dotenv
import gradio as gr
# Load environment variables (expects HUGGINGFACE_TOKEN in a .env file)
load_dotenv()
class UnifiedModel(nn.Module):
    def __init__(self, models):
        super(UnifiedModel, self).__init__()
        # Only transformers causal LMs end up here (see load_model); diffusion pipelines
        # are not nn.Modules and cannot be stored in an nn.ModuleList.
        self.models = nn.ModuleList(models)
        hidden_size = sum(model.config.hidden_size for model in models if hasattr(model, "config"))
        self.classifier = nn.Linear(hidden_size, 2)

    def forward(self, inputs_per_model):
        # `inputs_per_model` is a list with one tokenized dict per sub-model,
        # in the same order as the sub-models.
        hidden_states = []
        for model, inputs in zip(self.models, inputs_per_model):
            # Causal-LM outputs do not expose `last_hidden_state`; request hidden states
            # explicitly and take the first token of the last layer.
            outputs = model(**inputs, output_hidden_states=True)
            hidden_states.append(outputs.hidden_states[-1][:, 0, :])
        concatenated_hidden_states = torch.cat(hidden_states, dim=-1)
        logits = self.classifier(concatenated_hidden_states)
        return logits
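
# Usage sketch (comment only, not executed): UnifiedModel expects one tokenized dict per
# sub-model, in the same order in which the sub-models were added, e.g.
#
#   logits = unified_model([
#       {"input_ids": ids_a, "attention_mask": mask_a},
#       {"input_ids": ids_b, "attention_mask": mask_b},
#   ])
#
# ids_a/mask_a and ids_b/mask_b are placeholder tensors, not variables defined in this script.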
class SyntheticDataset(Dataset):
    def __init__(self, tokenizers, size=100):
        self.tokenizers = tokenizers
        self.size = size
        self.data = self._generate_data()

    def _generate_data(self):
        data = []
        for _ in range(self.size):
            text = "This is a sample sentence for testing purposes."
            label = torch.tensor(0)  # Sample label
            item = {"text": text, "label": label}
            for name, tokenizer in self.tokenizers.items():
                tokenized = tokenizer(text, padding="max_length", truncation=True, max_length=128)
                item[f"input_ids_{name}"] = torch.tensor(tokenized["input_ids"])
                item[f"attention_mask_{name}"] = torch.tensor(tokenized["attention_mask"])
            data.append(item)
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
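
# Sketch of a single dataset item (comment only; the exact keys depend on the tokenizers passed in):
#
#   {
#       "text": "This is a sample sentence for testing purposes.",
#       "label": tensor(0),
#       "input_ids_openai-community/gpt2-xl": tensor of shape (128,),
#       "attention_mask_openai-community/gpt2-xl": tensor of shape (128,),
#       ... one input_ids/attention_mask pair per tokenizer ...
#   }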
def push_to_hub(local_dir, repo_name):
    try:
        # Create (or reuse) the repo and clone it into the local directory;
        # Repository handles git init and remote setup itself.
        repo_url = HfApi().create_repo(repo_name, exist_ok=True)
        repo = Repository(local_dir, clone_from=repo_url)
        # Stage everything (weights tracked via LFS) plus the tokenizer/config JSON files.
        repo.git_add(auto_lfs_track=True)
        json_files = ["config.json", "generation_config.json", "special_tokens_map.json",
                      "tokenizer.json", "tokenizer.model", "tokenizer_config.json"]
        for json_file in json_files:
            json_file_path = os.path.join(local_dir, json_file)
            if os.path.exists(json_file_path):
                repo.git_add(json_file_path)
        repo.git_commit("Add model and tokenizer files")
        repo.git_push()
        print(f"Pushed model and tokenizer to {repo_url}")
    except Exception as e:
        print(f"Error pushing to Hugging Face Hub: {e}")
def load_model(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    # Many causal-LM tokenizers (e.g. GPT-2) ship without a pad token; padding would fail otherwise.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_name)
    return tokenizer, model
def train(model, train_loader, eval_loader, args, tokenizers, optimizer, train_losses, eval_losses):
    device = torch.device("cpu")
    loss_fn = nn.CrossEntropyLoss()
    total_steps = len(train_loader)

    for epoch in range(int(args.num_train_epochs)):
        model.train()
        epoch_start = time.time()
        for step, batch in enumerate(train_loader):
            # Build one tokenized dict per sub-model, in the same order as `tokenizers`.
            inputs_per_model = [
                {
                    "input_ids": batch[f"input_ids_{name}"].to(device),
                    "attention_mask": batch[f"attention_mask_{name}"].to(device),
                }
                for name in tokenizers.keys()
            ]
            labels = batch["label"].to(device)

            optimizer.zero_grad()
            outputs = model(inputs_per_model)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            # Rough ETA based on the average time per step so far.
            elapsed_time = time.time() - epoch_start
            estimated_total_time = total_steps * (elapsed_time / (step + 1))
            estimated_remaining_time = estimated_total_time - elapsed_time
            if step % args.logging_steps == 0:
                train_losses.append(loss.item())
                print(f"Step {step}/{total_steps}, Loss: {loss.item()}, "
                      f"Estimated remaining time: {estimated_remaining_time:.2f} seconds")

        # Evaluate at the end of every epoch.
        model.eval()
        eval_loss = 0.0
        with torch.no_grad():
            for batch in eval_loader:
                inputs_per_model = [
                    {
                        "input_ids": batch[f"input_ids_{name}"].to(device),
                        "attention_mask": batch[f"attention_mask_{name}"].to(device),
                    }
                    for name in tokenizers.keys()
                ]
                labels = batch["label"].to(device)
                outputs = model(inputs_per_model)
                eval_loss += loss_fn(outputs, labels).item()
        eval_loss /= len(eval_loader)
        eval_losses.append(eval_loss)
        print(f"Epoch {epoch + 1}/{args.num_train_epochs}, Evaluation Loss: {eval_loss}")
def gradio_interface(input_text, unified_model, tokenizers):
    # Tokenize the prompt once per sub-model and return the raw classifier logits as text.
    inputs_per_model = []
    for tokenizer in tokenizers.values():
        enc = tokenizer(input_text, return_tensors="pt", padding="max_length",
                        truncation=True, max_length=128)
        inputs_per_model.append({"input_ids": enc["input_ids"],
                                 "attention_mask": enc["attention_mask"]})
    with torch.no_grad():
        logits = unified_model(inputs_per_model)
    return str(logits.tolist())
def main():
    while True:
        try:
            os.system("git config --global credential.helper store")
            login(token=os.getenv("HUGGINGFACE_TOKEN"), add_to_git_credential=True)

            # Define the models to use
            models_to_train = [
                "openai-community/gpt2-xl",
                "google/gemma-2-9b-it",
                "google/gemma-2-9b",
                "meta-llama/Meta-Llama-3.1-8B-Instruct",
                "meta-llama/Meta-Llama-3.1-8B",
                "openbmb/MiniCPM-V-2_6",
                "bigcode/starcoder",
                "WizardLMTeam/WizardCoder-Python-34B-V1.0",
                "Qwen/Qwen2-72B-Instruct",
                "google/gemma-2-2b-it",
                "facebook/bart-large-cnn",
                "Falconsai/text_summarization",
                "microsoft/speecht5_tts",
                "Groq/Llama-3-Groq-70B-Tool-Use",
                "Groq/Llama-3-Groq-8B-Tool-Use",
                "facebook/musicgen-large",
                "facebook/musicgen-melody",
                "black-forest-labs/FLUX.1-schnell",
                "facebook/musicgen-small",
                "stabilityai/stable-video-diffusion-img2vid-xt-1-1",
                "openai/whisper-small",
                "black-forest-labs/FLUX.1-dev",
                "stabilityai/stable-diffusion-2-1",
            ]
            # Initialize the models and tokenizers
            tokenizers = {}
            models = []
            for model_name in models_to_train:
                try:
                    tokenizer, model = load_model(model_name)
                    tokenizers[model_name] = tokenizer
                    models.append(model)
                except Exception as load_error:
                    # Diffusion, audio and vision checkpoints cannot be loaded with
                    # AutoModelForCausalLM (or may not fit in memory), so skip anything
                    # that fails to load instead of crashing the Space.
                    print(f"Skipping {model_name}: {load_error}")

            # Create a synthetic dataset for training and evaluation
            synthetic_dataset = SyntheticDataset(tokenizers, size=100)

            # Split the dataset into training and evaluation sets
            train_size = int(0.8 * len(synthetic_dataset))
            val_size = len(synthetic_dataset) - train_size
            train_dataset, val_dataset = torch.utils.data.random_split(synthetic_dataset, [train_size, val_size])

            # Create DataLoaders for training and evaluation
            train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
            eval_loader = DataLoader(val_dataset, batch_size=16)

            # Unify the models into a single model
            unified_model = UnifiedModel(models)
            unified_model.to(torch.device("cpu"))

            # Show the total number of parameters to train
            total_params = sum(p.numel() for p in unified_model.parameters())
            print(f"Total parameters to train: {total_params}")

            # Define the training arguments
            training_args = TrainingArguments(
                output_dir="./outputs",
                per_device_train_batch_size=2,
                per_device_eval_batch_size=16,
                num_train_epochs=1,
                logging_steps=10,
                save_steps=10,
                evaluation_strategy="steps"
            )

            # Define the optimizer
            optimizer = AdamW(unified_model.parameters(), lr=5e-5)

            # Lists to store the losses
            train_losses = []
            eval_losses = []

            # Train the model
            train(unified_model, train_loader, eval_loader, training_args,
                  tokenizers, optimizer, train_losses, eval_losses)
            # Plot the losses
            fig, ax = plt.subplots()
            ax.set_xlabel("Epochs")
            ax.set_ylabel("Loss")
            ax.plot(train_losses, label="Training Loss")
            ax.plot(eval_losses, label="Evaluation Loss")
            ax.legend()

            def animate(i):
                ax.clear()
                ax.plot(train_losses, label="Training Loss")
                ax.plot(eval_losses, label="Evaluation Loss")
                ax.set_xlabel("Epochs")
                ax.set_ylabel("Loss")
                ax.legend()

            ani = animation.FuncAnimation(fig, animate, interval=1000)
            plt.show()

            # Save the unified model and the tokenizers to a local directory
            local_dir = "./outputs/unified_model"
            if not os.path.exists(local_dir):
                os.makedirs(local_dir)
            torch.save(unified_model.state_dict(), os.path.join(local_dir, "pytorch_model.bin"))
            # Each save writes into the same directory, so later tokenizers overwrite earlier ones.
            for tokenizer in tokenizers.values():
                tokenizer.save_pretrained(local_dir)

            # Push the model and tokenizer files to Hugging Face
            push_to_hub(local_dir, repo_name="Ffftdtd5dtft/my_model")

            # Set up and launch the Gradio interface
            interface = gr.Interface(
                fn=lambda text: gradio_interface(text, unified_model, tokenizers),
                inputs="text",
                outputs="text",
            )
            interface.launch()
            break
        except Exception as e:
            print(f"Error: {e}")
            time.sleep(2)


if __name__ == "__main__":
    main()