|
|
|
|
|
import os |
|
|
import pandas as pd |
|
|
from pathlib import Path |
|
|
from sklearn.model_selection import train_test_split |
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
from sklearn.linear_model import LogisticRegression |
|
|
from sklearn.pipeline import Pipeline |
|
|
from sklearn.preprocessing import LabelEncoder |
|
|
from sklearn.metrics import classification_report |
|
|
import joblib |
|
|
|
|
|
# Primary dataset; load_data() reads this CSV when the file exists.
DATA_PATH = Path('/mnt/data/text_data.csv')
# Dataset that load_data() reads instead when DATA_PATH is absent.
FALLBACK_PATH = Path('data/fallback_text_data.csv')
# Where train_and_save() writes the fitted TF-IDF + classifier pipeline.
MODEL_PATH = Path('model/textclf_pipeline.joblib')
# Where train_and_save() writes the fitted LabelEncoder.
LABELS_PATH = Path('model/labels.joblib')
|
|
|
|
|
def load_data(data_path=None, fallback_path=None):
    """Load the labelled text dataset as a DataFrame.

    Reads the primary CSV when that file exists, otherwise the fallback CSV,
    and checks that the result has both a ``text`` and a ``label`` column.

    Args:
        data_path: Primary CSV location. Defaults to ``DATA_PATH``.
        fallback_path: CSV to read when the primary file does not exist.
            Defaults to ``FALLBACK_PATH``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for the chosen file.

    Raises:
        ValueError: If the chosen CSV lacks a ``text`` or ``label`` column.
        FileNotFoundError: Propagated from ``pd.read_csv`` when neither the
            primary nor the fallback file exists.
    """
    primary = DATA_PATH if data_path is None else Path(data_path)
    fallback = FALLBACK_PATH if fallback_path is None else Path(fallback_path)

    source = primary if primary.exists() else fallback
    df = pd.read_csv(source)

    # Raise instead of assert: assertions are stripped under ``python -O``,
    # and the fallback file needs the same schema check as the primary one.
    missing = {'text', 'label'} - set(df.columns)
    if missing:
        raise ValueError(
            f"CSV must have 'text' and 'label' columns "
            f"(missing {sorted(missing)} in {source})"
        )
    return df
|
|
|
|
|
def train_and_save():
    """Train a TF-IDF + logistic-regression text classifier and persist it.

    Loads the dataset with ``load_data()``, drops rows whose ``text`` or
    ``label`` is missing, integer-encodes the labels, fits the pipeline on the
    training part of a stratified 80/20 split, prints a classification report
    for the held-out part, then writes the fitted pipeline to ``MODEL_PATH``
    and the fitted label encoder to ``LABELS_PATH``.

    Returns:
        None. Results are printed and written to disk.

    Raises:
        ValueError: If no rows remain after dropping missing values. Errors
            raised by scikit-learn while splitting or fitting propagate
            unchanged.
    """
    df = load_data().dropna(subset=['text', 'label'])
    if df.empty:
        # Fail with a clear message instead of an opaque splitter error.
        raise ValueError("No rows with both 'text' and 'label' available for training")

    X = df['text'].astype(str).values
    y_text = df['label'].astype(str).values

    # The classifier is trained on integer ids; the encoder is saved alongside
    # the model so predictions can be mapped back to the original labels.
    le = LabelEncoder()
    y = le.fit_transform(y_text)

    pipe = Pipeline([
        ('tfidf', TfidfVectorizer(ngram_range=(1, 2), min_df=1, max_features=20000)),
        ('clf', LogisticRegression(max_iter=1000)),
    ])

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_val)
    try:
        # Pass ``labels`` explicitly so the report lines up with
        # ``target_names`` even when a class is absent from the validation
        # split; without it the name/label counts can mismatch and raise.
        print(classification_report(
            y_val,
            y_pred,
            labels=list(range(len(le.classes_))),
            target_names=le.classes_,
        ))
    except Exception as e:
        # Evaluation is diagnostic only; a reporting failure must not prevent
        # the trained model from being saved.
        print("Eval issue:", e)

    # Create the parent directory of each output, not just the model's.
    for out_path in (MODEL_PATH, LABELS_PATH):
        out_path.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(pipe, MODEL_PATH)
    joblib.dump(le, LABELS_PATH)
    print("Saved model to", MODEL_PATH, "and labels to", LABELS_PATH)
|
|
|
|
|
# Run training only when executed as a script, not when imported.
if __name__ == "__main__":
    train_and_save()
|
|
|