# Page-extraction residue (not part of the script): "xxnithicxx's picture" / "Init project" / 63255af
# -*- coding: utf-8 -*-
"""
Script to pre-train and save K-Means models for the Gradio app.
Run this once to generate models/ folder with trained models.
"""
import sys
sys.path.insert(0, '../src')
from utils.data_loader import DataLoader
from utils.clustering_models import ClusteringModels
import os
def main():
    """Train K-Means models for a range of k and save them for the Gradio app.

    Steps, in order:
      1. Load features through ``DataLoader("./data/processed")``.
      2. Create ``./models`` if needed and build a ``ClusteringModels``
         instance from the scaled and original features.
      3. Call ``train_models`` for k = 2..10.
      4. Call ``apply_pca(n_components=None)``.
      5. Call ``save_models()``.

    Afterwards a summary is printed: the number of trained models, the
    number of PCA components, the silhouette score for each k and the k
    with the highest silhouette score.

    All paths are relative to the current working directory.
    """
    # Single source of truth for the candidate cluster counts; used for
    # training, for the progress message and for the score report below.
    k_values = range(2, 11)
    banner = "=" * 70

    print(banner)
    print("TRAINING K-MEANS MODELS FOR GRADIO APP")
    print(banner)

    # Load data
    print("\n[1/5] Loading data...")
    data_loader = DataLoader("./data/processed")
    scaled_features = data_loader.scaled_features
    original_features = data_loader.original_features
    print(f" Scaled features shape: {scaled_features.shape}")
    print(f" Original features shape: {original_features.shape}")

    # Initialize clustering models
    print("\n[2/5] Initializing clustering models...")
    models_dir = "./models"
    os.makedirs(models_dir, exist_ok=True)
    cm = ClusteringModels(scaled_features, original_features, models_dir)

    # Train models
    print(
        f"\n[3/5] Training K-Means models "
        f"(k={k_values[0]} to k={k_values[-1]})..."
    )
    cm.train_models(k_range=k_values)

    # Apply PCA
    print("\n[4/5] Applying PCA for visualization...")
    cm.apply_pca(n_components=None)  # Keep all components

    # Save everything
    print("\n[5/5] Saving models to disk...")
    cm.save_models()

    print("\n" + banner)
    print("TRAINING COMPLETED SUCCESSFULLY!")
    print(banner)

    # Print summary
    print("\nSummary:")
    print(f" Models saved: {len(cm.kmeans_models)} (k=2 to k={max(cm.kmeans_models.keys())})")
    print(f" PCA components: {cm.pca_features.shape[1]}")
    print("\n✓ Checking models...")
    print("\nSilhouette Scores by K:")

    # Pair each k with its score once; assumes cm.silhouette_scores is
    # ordered like k_values (one score per trained k) -- TODO confirm
    # against ClusteringModels.train_models.
    scored = list(zip(k_values, cm.silhouette_scores))
    for k, score in scored:
        print(f" k={k}: {score:.4f}")

    # max() returns the first maximal item, so ties resolve to the
    # smallest k, matching a list.index(max(...)) lookup.
    best_k, _ = max(scored, key=lambda pair: pair[1])
    print(f"\nBest K (by Silhouette Score): {best_k}")
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()