import json
import math
import os
import warnings

import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.sparse import csr_matrix, hstack
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeCV, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from tqdm import tqdm_notebook

warnings.filterwarnings("ignore")

# ---------------------------------------------------------------------------
# Load the data and derive the per-student average of the three scores.
# ---------------------------------------------------------------------------
df = pd.read_csv('Downloads/StudentsPerformance.csv')
df.head()

df['avg_score'] = (df['math score'] + df['reading score'] + df['writing score']) / 3
df.head()

# Notebook-style inspection: these bare expressions only display something
# when evaluated as the last expression of a notebook cell.
df.shape
df.info()
df.describe()
pd.unique(df['test preparation course'])
pd.unique(df['parental level of education'])
df.isnull().sum()

# ---------------------------------------------------------------------------
# Exploratory plots.
# ---------------------------------------------------------------------------
plt.figure(figsize=(20, 10))
# Correlate the numeric columns only: the frame still holds the categorical
# string columns, and DataFrame.corr() rejects those on recent pandas.
sns.heatmap(df.select_dtypes(include='number').corr())

plt.figure(figsize=(20, 10))
sns.boxplot(x='gender', y='avg_score', data=df)

plt.figure(figsize=(20, 10))
sns.boxplot(y='avg_score', x='test preparation course', hue='gender', data=df)

plt.figure(figsize=(20, 10))
sns.countplot(x='race/ethnicity', data=df)

plt.figure(figsize=(20, 10))
sns.countplot(x='parental level of education', data=df)

sns.pairplot(df)

pd.unique(df['parental level of education'])
# Put the education level on the y axis so every level gets its own bar;
# `hue` without a categorical axis does not reliably produce a grouped plot.
sns.barplot(x='avg_score', y='parental level of education', data=df)


def summary_statistics(col, df):
    """Print summary statistics for one score column.

    Prints the mean, max, min and median of ``df[col]``, followed by the
    number of rows whose value equals the maximum, the minimum,
    floor(mean) or ceil(mean), the median, and the 25th / 75th percentile.

    Args:
        col: Name of the column to summarise.
        df: DataFrame that contains ``col``.

    Note:
        The median and percentile counts use exact equality. When pandas
        interpolates between two observations the resulting value may not
        occur in the column, in which case the count printed is 0.
    """
    scores = df[col]

    # Compute each statistic once instead of on every use.
    mean_score = scores.mean()
    max_score = scores.max()
    min_score = scores.min()
    median_score = scores.median()

    print('Mean: {}'.format(mean_score))
    print('Max: {}'.format(max_score))
    print('Min: {}'.format(min_score))
    print('Median: {}'.format(median_score))
    print()

    # Number of students with maximum and minimum score
    print("Number of students who scored maximum score: {}".format(
        (scores == max_score).sum()))
    print("Number of students who scored minimum score: {}".format(
        (scores == min_score).sum()))
    print()

    # Students close to the mean, i.e. whose score equals floor(mean) or
    # ceil(mean). A single combined mask is used so that rows are not
    # counted twice when the mean is a whole number (floor == ceil).
    near_mean_floor = math.floor(mean_score)
    near_mean_ceil = math.ceil(mean_score)
    near_mean_tot = ((scores == near_mean_floor) | (scores == near_mean_ceil)).sum()
    print("Number of students close to mean score: {}".format(near_mean_tot))
    print()

    # Students exactly at the 50th percentile
    print("Number of students at median score: {}".format(
        (scores == median_score).sum()))

    # Students exactly at the 25th and 75th percentile scores
    print("Number of students at 25th percentile: {}".format(
        (scores == scores.quantile(0.25)).sum()))
    print("Number of students at 75th percentile: {}".format(
        (scores == scores.quantile(0.75)).sum()))


summary_statistics("math score", df)
summary_statistics("reading score", df)
summary_statistics("writing score", df)

# Students that have more than median marks in
# Maths
print("Maths")
df_top_math = df[df["math score"] > df["math score"].median()]
print(df_top_math["test preparation course"].value_counts())
print()

# Reading
print("Reading")
df_top_read = df[df["reading score"] > df["reading score"].median()]
print(df_top_read["test preparation course"].value_counts())
print()

# Writing
print("Writing")
df_top_writ = df[df["writing score"] > df["writing score"].median()]
print(df_top_writ["test preparation course"].value_counts())
print()

# Average score (kept in its own variable so the writing subset above is
# not overwritten)
print("Average score")
df_top_avg = df[df["avg_score"] > df["avg_score"].median()]
print(df_top_avg["test preparation course"].value_counts())
print()

# Students that have less than or equal to median marks in
# Maths
print("Maths")
df_bot_math = df[df["math score"] <= df["math score"].median()]
print(df_bot_math["test preparation course"].value_counts())
print()

# Reading
print("Reading")
df_bot_read = df[df["reading score"] <= df["reading score"].median()]
print(df_bot_read["test preparation course"].value_counts())
print()

# Writing
print("Writing")
df_bot_writ = df[df["writing score"] <= df["writing score"].median()]
print(df_bot_writ["test preparation course"].value_counts())
print()

# Average score (bottom half; stored under its own name rather than
# re-using a "top" variable)
print("Average score")
df_bot_avg = df[df["avg_score"] <= df["avg_score"].median()]
print(df_bot_avg["test preparation course"].value_counts())
print()


def _capitalize_legend(ax):
    """Rebuild the legend of ``ax`` with every label capitalised."""
    ax.legend([text.get_text().capitalize() for text in ax.legend().get_texts()])


def graphs(score_type, suptitle, groupbyterm, kind, data=None):
    """Draw a 2x3 overview of one score broken down by a categorical column.

    Top row: pie charts of the ``groupbyterm`` value counts for all
    students, for female students and for male students.
    Bottom row: a bar chart of the mean ``score_type`` per group for each
    gender, then one ``kind`` plot of ``score_type`` per group for female
    and for male students.

    Args:
        score_type: Name of the score column to plot.
        suptitle: Title placed above the whole figure.
        groupbyterm: Name of the categorical column to group by.
        kind: Plot kind forwarded to pandas ``.plot`` for the two
            per-gender panels (the calls in this file use ``"hist"``).
        data: DataFrame to plot. Defaults to the module-level ``df`` so
            existing four-argument calls behave as before.

    Returns:
        Tuple ``(fig, axes)`` as returned by ``plt.subplots``.
    """
    frame = df if data is None else data
    nrows, ncols, inches = 2, 3, 5

    by_gender = {
        "Female": frame[frame['gender'] == 'female'],
        "Male": frame[frame['gender'] == 'male'],
    }

    fig, axes = plt.subplots(nrows=nrows, ncols=ncols,
                             figsize=(ncols * inches, nrows * inches))
    fig.suptitle(suptitle)

    # Top row: share of each group overall and per gender.
    pie_sources = [("Students Overall", frame)] + list(by_gender.items())
    for ax, (title, subset) in zip(axes[0], pie_sources):
        counts = subset[groupbyterm].value_counts().rename("")
        counts.plot.pie(ax=ax, title=title, autopct="%.2f", legend=False)

    # Bottom-left: mean score per group, one bar series per gender.
    mean_ax = axes[1, 0]
    group_means = pd.concat(
        [subset.groupby(groupbyterm)[score_type].mean().rename(label)
         for label, subset in by_gender.items()],
        axis=1,
    )
    group_means.plot(kind="bar", ax=mean_ax, legend=True)
    mean_ax.set_xlabel("")
    _capitalize_legend(mean_ax)
    mean_ax.set_xticklabels(
        [tick.get_text().capitalize() for tick in mean_ax.get_xticklabels()])

    # `histtype` is only understood by histograms, so forward it only then.
    extra_kwargs = {"histtype": "step"} if kind == "hist" else {}
    tick_positions = np.arange(0, 101, step=20)

    # Bottom-middle / bottom-right: per-group plot for each gender.
    for ax, (label, subset) in zip(axes[1, 1:], by_gender.items()):
        subset.groupby(groupbyterm)[score_type].plot(
            kind=kind, ax=ax, legend=True, alpha=0.8, **extra_kwargs)
        ax.set_xlabel("{} Scores".format(label))
        ax.set_ylabel("")
        ax.set_xticks(tick_positions)
        ax.set_yticks(tick_positions)
        _capitalize_legend(ax)

    return fig, axes


fig, axes = graphs("avg_score", "Avg Scores by Parental Level of Education", "parental level of education", "hist")
fig, axes = graphs("avg_score", "Avg Scores by Race/Ethnicity", "race/ethnicity", "hist")
fig, axes = graphs("avg_score", "Avg Scores by Lunch", "lunch", "hist")
fig, axes = graphs("avg_score", "Avg Scores by Test Preparation Course", "test preparation course", "hist")

score_cols = ['math score', 'reading score', 'writing score', 'avg_score']

from scipy.stats import norm


def Plot_Dist(df, col):
    """Show the distribution of ``df[col]`` next to its normal Q-Q plot.

    Left panel: histogram of the column with a fitted normal curve.
    Right panel: Q-Q plot of the same column against a standardised line.

    Args:
        df: DataFrame that contains ``col``.
        col: Name of the column to plot.
    """
    # Imported here, as in the original, so statsmodels is only needed
    # when this function is actually called.
    from statsmodels.graphics.gofplots import qqplot

    fig, axarr = plt.subplots(1, 2, figsize=(12, 4))

    # Distribution with fitted normal.
    # NOTE: sns.distplot is deprecated in recent seaborn releases; it is
    # kept because histplot offers no `fit=` equivalent.
    sns.distplot(df[col], fit=norm, kde=False, ax=axarr[0])

    # Q-Q plot of the requested column (not a fixed one).
    qqplot(df[col], line='s', ax=axarr[1])

    fig.suptitle(col + ' distribution', fontsize=14)
    plt.show()


for score_col in score_cols:
    Plot_Dist(df, col=score_col)

ax1 = sns.jointplot(x="math score", y="reading score", data=df)
plt.show()

ax2 = sns.jointplot(x="math score", y="writing score", data=df)
plt.show()


def Plot_Set(df, xcol, ycols):
    """Draw one box plot per score column, grouped by a categorical column.

    Each panel shows ``ycol`` grouped by ``xcol`` and is annotated with the
    median of every group just above the median line.

    Args:
        df: DataFrame that contains ``xcol`` and every column in ``ycols``.
        xcol: Name of the categorical column used for grouping.
        ycols: Sequence of score column names; one panel is drawn per entry.
    """
    if not ycols:
        # Nothing to plot.
        return

    # Sorting makes the box order follow the sorted group keys, which is
    # also the order in which groupby returns the medians below.
    df = df.sort_values(by=xcol)

    # One panel per requested column (3 inches wide each, i.e. 12 inches
    # for four columns). squeeze=False keeps a 2-D array even when only
    # one column is requested.
    fig, axarr = plt.subplots(1, len(ycols), figsize=(3 * len(ycols), 5),
                              squeeze=False)

    for ax, ycol in zip(axarr[0], ycols):
        medians = df.groupby(xcol)[ycol].median().values

        sns.boxplot(x=xcol, y=ycol, data=df, width=0.5, palette='Set3',
                    ax=ax, linewidth=0.5)

        # Write each group's median slightly above its median line.
        for position, median in enumerate(medians):
            ax.text(position, median + 0.5, str(np.round(median, 2)),
                    horizontalalignment='center', size='medium',
                    color='k', weight='semibold')

        ax.set_ylim([0, 105])
        plt.setp(ax.get_xticklabels(), rotation=25, ha='right')

    plt.tight_layout()
    plt.show()


Plot_Set(df, xcol='gender', ycols=score_cols)
Plot_Set(df, xcol='race/ethnicity', ycols=score_cols)
Plot_Set(df, xcol='parental level of education', ycols=score_cols)
Plot_Set(df, xcol='lunch', ycols=score_cols)

df.head()

# ---------------------------------------------------------------------------
# One-hot encode the categorical columns.
# The prefixes (including their trailing underscore) are unchanged, so the
# resulting column names and their order are the same as before; the
# original categorical columns are replaced by their indicator columns.
# ---------------------------------------------------------------------------
categorical_prefixes = {
    'gender': 'gender_',
    'race/ethnicity': 'race_',
    'parental level of education': 'edu_',
    'lunch': 'lunch_',
    'test preparation course': 'course_',
}
df = pd.get_dummies(df, columns=list(categorical_prefixes),
                    prefix=categorical_prefixes)

# ---------------------------------------------------------------------------
# Predict the average score from the encoded categorical features.
# ---------------------------------------------------------------------------
target = df.avg_score
# Remove every score column so the target cannot leak into the features.
df.drop(columns=['math score', 'reading score', 'writing score', 'avg_score'], inplace=True)

X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.33, random_state=17)

dtr = DecisionTreeRegressor(max_depth=4, random_state=17)
cv_scores = cross_val_score(dtr, df, target, cv=10, scoring='neg_mean_squared_error')
print("10-fold CV neg_mean_squared_error: {}".format(cv_scores))

dtr = DecisionTreeRegressor(max_depth=4, random_state=17)
dtr.fit(X_train, y_train)
y_pred = dtr.predict(X_test)
test_mae = mean_absolute_error(y_test, y_pred)
print("Hold-out mean absolute error: {}".format(test_mae))