Spaces:

eagle0504
/

eda_on_exam_data

Sleeping

App Files Files Community

eda_on_exam_data / build_prediction_model.py

eagle0504

Upload folder using huggingface_hub

3e50e1d verified 3 months ago

raw

history blame contribute delete

2.38 kB

	import pandas as pd
	from sklearn.model_selection import train_test_split
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
	import numpy as np

	# ---------------------------------------------------------------------------
	# Script settings: every literal the pipeline relies on is named here.
	# ---------------------------------------------------------------------------
	# Path of the exam-results CSV (a Kaggle input path).
	DATA_PATH = '/kaggle/input/students-performance-in-exams/StudentsPerformance.csv'

	# Raw CSV headers -> snake_case names used throughout the rest of the script.
	COLUMN_RENAMES = {
	    'parental level of education': 'parental_level_of_education',
	    'test preparation course': 'test_preparation_course',
	    'math score': 'math_score',
	    'reading score': 'reading_score',
	    'writing score': 'writing_score',
	    'race/ethnicity': 'race_ethnicity',
	}

	# The three per-subject scores that are averaged into the regression target.
	SCORE_COLUMNS = ['math_score', 'reading_score', 'writing_score']
	TARGET_COLUMN = 'average_score'

	# Columns passed to pd.get_dummies for one-hot encoding.
	CATEGORICAL_COLUMNS = [
	    'gender',
	    'race_ethnicity',
	    'parental_level_of_education',
	    'lunch',
	    'test_preparation_course',
	]

	# Shared seed for the train/test split and the forest, so runs are repeatable.
	RANDOM_STATE = 42

	# ---------------------------------------------------------------------------
	# Data preparation
	# ---------------------------------------------------------------------------
	df = pd.read_csv(DATA_PATH)
	df = df.rename(columns=COLUMN_RENAMES)

	# Target: row-wise mean of the three subject scores.
	df[TARGET_COLUMN] = df[SCORE_COLUMNS].mean(axis=1)

	print("--- Building Predictive Model ---")

	# The target and the scores it is derived from are removed from the features;
	# leaving them in would let the model read the answer directly.
	y = df[TARGET_COLUMN]
	X = df.drop(columns=SCORE_COLUMNS + [TARGET_COLUMN])

	# drop_first=True omits one indicator column per encoded feature.
	X = pd.get_dummies(X, columns=CATEGORICAL_COLUMNS, drop_first=True)

	# Hold out 20% of the rows for evaluation.
	X_train, X_test, y_train, y_test = train_test_split(
	    X,
	    y,
	    test_size=0.2,
	    random_state=RANDOM_STATE,
	)

	print(f"Training features shape: {X_train.shape}")
	print(f"Test features shape: {X_test.shape}")

	# ---------------------------------------------------------------------------
	# Model fitting
	# ---------------------------------------------------------------------------
	# 100 trees; n_jobs=-1 asks scikit-learn to use all available cores.
	model = RandomForestRegressor(
	    n_estimators=100,
	    random_state=RANDOM_STATE,
	    n_jobs=-1,
	)
	print("\nTraining RandomForestRegressor...")
	model.fit(X_train, y_train)
	print("Model training complete.")

	# ---------------------------------------------------------------------------
	# Evaluation on the held-out rows
	# ---------------------------------------------------------------------------
	y_pred = model.predict(X_test)

	mae = mean_absolute_error(y_test, y_pred)
	mse = mean_squared_error(y_test, y_pred)
	rmse = np.sqrt(mse)  # RMSE is reported in the same units as the scores
	r2 = r2_score(y_test, y_pred)

	print("\n--- Model Evaluation ---")
	print(f"Mean Absolute Error (MAE): {mae:.3f}")
	print(f"Mean Squared Error (MSE): {mse:.3f}")
	print(f"Root Mean Squared Error (RMSE): {rmse:.3f}")
	print(f"R-squared (R2): {r2:.3f}")

	# ---------------------------------------------------------------------------
	# Feature importances, labelled with the encoded column names
	# ---------------------------------------------------------------------------
	print("\n--- Feature Importances ---")
	feature_importances = pd.Series(model.feature_importances_, index=X.columns)
	top_importances = feature_importances.sort_values(ascending=False).head(10)
	print(top_importances)

	print("\nPredictive model building complete.")