# -*- coding: utf-8 -*- """ Script to pre-train and save K-Means models for the Gradio app. Run this once to generate models/ folder with trained models. """ import sys sys.path.insert(0, '../src') from utils.data_loader import DataLoader from utils.clustering_models import ClusteringModels import os def main(): """Train and save models.""" print("=" * 70) print("TRAINING K-MEANS MODELS FOR GRADIO APP") print("=" * 70) # Load data print("\n[1/4] Loading data...") data_loader = DataLoader("./data/processed") scaled_features = data_loader.scaled_features original_features = data_loader.original_features print(f" Scaled features shape: {scaled_features.shape}") print(f" Original features shape: {original_features.shape}") # Initialize clustering models print("\n[2/4] Initializing clustering models...") models_dir = "./models" os.makedirs(models_dir, exist_ok=True) cm = ClusteringModels(scaled_features, original_features, models_dir) # Train models print("\n[3/4] Training K-Means models (k=2 to k=10)...") cm.train_models(k_range=range(2, 11)) # Apply PCA print("\n[4/4] Applying PCA for visualization...") cm.apply_pca(n_components=None) # Keep all components # Save everything print("\n[5/5] Saving models to disk...") cm.save_models() print("\n" + "=" * 70) print("TRAINING COMPLETED SUCCESSFULLY!") print("=" * 70) # Print summary print("\nSummary:") print(f" Models saved: {len(cm.kmeans_models)} (k=2 to k={max(cm.kmeans_models.keys())})") print(f" PCA components: {cm.pca_features.shape[1]}") print("\n✓ Checking models...") print("\nSilhouette Scores by K:") for k, score in zip(range(2, 11), cm.silhouette_scores): print(f" k={k}: {score:.4f}") best_k = range(2, 11)[cm.silhouette_scores.index(max(cm.silhouette_scores))] print(f"\nBest K (by Silhouette Score): {best_k}") if __name__ == "__main__": main()