Spaces:
Sleeping
Sleeping
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Script: fit a random-forest regressor that predicts a student's average exam
# score (mean of the math, reading and writing scores) from the one-hot encoded
# categorical columns of the CSV, then print hold-out metrics and the ten
# largest feature importances.

# Location of the input CSV.
DATA_PATH = '/kaggle/input/students-performance-in-exams/StudentsPerformance.csv'

# Original header -> snake_case name used by the rest of the script.
RENAMED_COLUMNS = {
    'parental level of education': 'parental_level_of_education',
    'test preparation course': 'test_preparation_course',
    'math score': 'math_score',
    'reading score': 'reading_score',
    'writing score': 'writing_score',
    'race/ethnicity': 'race_ethnicity',
}

# The three raw scores that are averaged into the regression target.
SCORE_COLUMNS = ['math_score', 'reading_score', 'writing_score']

# Columns expanded into dummy indicators before training.
CATEGORICAL_COLUMNS = [
    'gender',
    'race_ethnicity',
    'parental_level_of_education',
    'lunch',
    'test_preparation_course',
]

# Read the data and normalise the column names in one step.
df = pd.read_csv(DATA_PATH).rename(columns=RENAMED_COLUMNS)

# Target variable: row-wise mean of the three scores.
df['average_score'] = df[SCORE_COLUMNS].mean(axis=1)

print("--- Building Predictive Model ---")

# Target (y) and features (X). The raw scores and the target itself are
# removed from the features so the model cannot read the answer directly.
y = df['average_score']
X = df.drop(columns=SCORE_COLUMNS + ['average_score'])

# One-hot encode the categorical features, dropping the first level of each.
X = pd.get_dummies(X, columns=CATEGORICAL_COLUMNS, drop_first=True)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Training features shape: {X_train.shape}")
print(f"Test features shape: {X_test.shape}")

# Build and fit the forest; n_jobs=-1 uses all available cores.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)
print("\nTraining RandomForestRegressor...")
model.fit(X_train, y_train)
print("Model training complete.")

# Predict on the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions against the held-out targets.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\n--- Model Evaluation ---")
for label, value in (
    ("Mean Absolute Error (MAE)", mae),
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
    ("R-squared (R2)", r2),
):
    print(f"{label}: {value:.3f}")

# Report the ten encoded features with the largest importances.
print("\n--- Feature Importances ---")
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
top_features = feature_importances.sort_values(ascending=False).head(10)
print(top_features)

print("\nPredictive model building complete.")