"""Train a TF-IDF + logistic-regression text classifier and persist it with joblib."""

import os
from pathlib import Path

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

DATA_PATH = Path('/mnt/data/text_data.csv')  # optional external data
FALLBACK_PATH = Path('data/fallback_text_data.csv')
MODEL_PATH = Path('model/textclf_pipeline.joblib')
LABELS_PATH = Path('model/labels.joblib')

# Columns every input CSV must provide.
_REQUIRED_COLUMNS = frozenset({'text', 'label'})


def load_data() -> pd.DataFrame:
    """Load the training data as a DataFrame.

    Reads ``DATA_PATH`` when that file exists, otherwise ``FALLBACK_PATH``.

    Returns:
        The parsed CSV, guaranteed to contain 'text' and 'label' columns.

    Raises:
        ValueError: If the chosen CSV lacks a 'text' or 'label' column.
    """
    source = DATA_PATH if DATA_PATH.exists() else FALLBACK_PATH
    df = pd.read_csv(source)

    # Explicit raise instead of `assert`: asserts are removed under `python -O`,
    # which would let a malformed file through. The check also covers the
    # fallback file so both sources fail the same way.
    missing = _REQUIRED_COLUMNS - set(df.columns)
    if missing:
        raise ValueError(
            "CSV must have 'text' and 'label' columns "
            f"(missing {sorted(missing)} in {source})"
        )
    return df


def train_and_save() -> None:
    """Fit the text-classification pipeline and write it to disk.

    Rows with a missing 'text' or 'label' are dropped, labels are encoded with
    a ``LabelEncoder``, and the data is split 80/20 (stratified by label,
    ``random_state=42``). A classification report for the held-out split is
    printed, then the fitted pipeline is dumped to ``MODEL_PATH`` and the
    label encoder to ``LABELS_PATH``.
    """
    df = load_data().dropna(subset=['text', 'label'])
    X = df['text'].astype(str).values
    y_text = df['label'].astype(str).values

    le = LabelEncoder()
    y = le.fit_transform(y_text)

    pipe = Pipeline([
        ('tfidf', TfidfVectorizer(ngram_range=(1, 2), min_df=1, max_features=20000)),
        ('clf', LogisticRegression(max_iter=1000)),
    ])

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_val)

    # Evaluation is best-effort: a reporting failure must not prevent the
    # fitted model from being saved, so the error is printed, not raised.
    try:
        print(classification_report(
            y_val,
            y_pred,
            # Pin the label set so `target_names` stays aligned even when a
            # class is absent from the validation split.
            labels=list(range(len(le.classes_))),
            target_names=le.classes_,
        ))
    except Exception as e:
        print("Eval issue:", e)

    # Create both output directories; the two paths need not share a parent.
    MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
    LABELS_PATH.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(pipe, MODEL_PATH)
    joblib.dump(le, LABELS_PATH)
    print("Saved model to", MODEL_PATH, "and labels to", LABELS_PATH)


if __name__ == "__main__":
    train_and_save()