6100.Model.Soares / train.py
sjsoares's picture
Upload 3 files
14a4f85 verified
import os
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import joblib
# Preferred training CSV; load_data() reads it only when the file exists.
DATA_PATH = Path('/mnt/data/text_data.csv') # optional external data
# CSV that load_data() reads when DATA_PATH is absent.
FALLBACK_PATH = Path('data/fallback_text_data.csv')
# Where train_and_save() writes the fitted TF-IDF + LogisticRegression pipeline.
MODEL_PATH = Path('model/textclf_pipeline.joblib')
# Where train_and_save() writes the fitted LabelEncoder (label id <-> label text).
LABELS_PATH = Path('model/labels.joblib')
def load_data(data_path=None, fallback_path=None) -> pd.DataFrame:
    """Load the labelled text dataset used for training.

    Reads ``data_path`` when that file exists, otherwise ``fallback_path``.
    Whichever file is read must provide ``text`` and ``label`` columns.

    Args:
        data_path: Preferred CSV location (str or Path). Defaults to the
            module-level ``DATA_PATH``.
        fallback_path: CSV read when ``data_path`` does not exist (str or
            Path). Defaults to the module-level ``FALLBACK_PATH``.

    Returns:
        The parsed CSV as a DataFrame.

    Raises:
        ValueError: If the CSV that was read lacks a ``text`` or ``label``
            column.
    """
    # Defaults are resolved at call time so the module constants remain the
    # single source of truth for callers that pass nothing.
    primary = Path(DATA_PATH if data_path is None else data_path)
    if primary.exists():
        source = primary
    else:
        source = Path(FALLBACK_PATH if fallback_path is None else fallback_path)

    df = pd.read_csv(source)

    # Explicit raise instead of ``assert``: assertions vanish under
    # ``python -O``, and the fallback file deserves the same check as the
    # external one so a bad schema fails here rather than deep in training.
    missing = {'text', 'label'} - set(df.columns)
    if missing:
        raise ValueError(
            f"CSV must have 'text' and 'label' columns; "
            f"{source} is missing {sorted(missing)}"
        )
    return df
def train_and_save():
    """Train the TF-IDF + logistic-regression text classifier and persist it.

    Steps: load the dataset with ``load_data()``, drop rows whose ``text`` or
    ``label`` is missing, encode labels with a ``LabelEncoder``, hold out 20%
    of the rows for validation, fit the pipeline on the remaining 80%, print
    a classification report, then write the pipeline to ``MODEL_PATH`` and
    the encoder to ``LABELS_PATH``.

    Raises:
        ValueError: If no usable rows remain after dropping missing values,
            or if the data contains fewer than two distinct labels.
    """
    df = load_data().dropna(subset=['text', 'label'])
    if df.empty:
        raise ValueError("No usable rows: every row is missing 'text' or 'label'")

    X = df['text'].astype(str).values
    y_text = df['label'].astype(str).values

    le = LabelEncoder()
    y = le.fit_transform(y_text)
    if len(le.classes_) < 2:
        raise ValueError(
            "Need at least two distinct labels to train a classifier; "
            f"found {len(le.classes_)}"
        )

    pipe = Pipeline([
        ('tfidf', TfidfVectorizer(ngram_range=(1, 2), min_df=1, max_features=20000)),
        ('clf', LogisticRegression(max_iter=1000)),
    ])

    try:
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
    except ValueError as e:
        # Stratification needs >= 2 rows per class and a validation set at
        # least as large as the number of classes; small datasets fail that.
        print("Stratified split not possible, using a random split:", e)
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

    # The saved model is the one fitted on the training split only.
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_val)

    # Evaluation is deliberately best-effort: a reporting problem must not
    # prevent the trained model from being saved.
    try:
        print(classification_report(
            y_val,
            y_pred,
            # Pin the label set so target_names always lines up, even when
            # the validation split happens to miss a class.
            labels=list(range(len(le.classes_))),
            target_names=le.classes_,
            zero_division=0,
        ))
    except Exception as e:
        print("Eval issue:", e)

    # Both outputs may live in different directories; create each one.
    for target in (MODEL_PATH, LABELS_PATH):
        target.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(pipe, MODEL_PATH)
    joblib.dump(le, LABELS_PATH)
    print("Saved model to", MODEL_PATH, "and labels to", LABELS_PATH)
# Script entry point: train and save only when run directly, not on import.
if __name__ == "__main__":
    train_and_save()