import os
import json
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import RidgeCV, Ridge
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
import scipy
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeRegressor
import math
import warnings
warnings.filterwarnings("ignore")

# Load the student performance data; the path is relative to the working directory.
df = pd.read_csv('Downloads/StudentsPerformance.csv')
df.head()

# Per-student average of the three subject scores.
df['avg_score'] = df['math score'].add(df['reading score']).add(df['writing score']).div(3)
df.head()

# Dimensions, column dtypes and numeric summary of the frame.
df.shape

df.info()

df.describe()

# Distinct values of two of the categorical columns.
df['test preparation course'].unique()

df['parental level of education'].unique()

# Number of missing values in each column.
df.isna().sum()

# Correlation heatmap. Only the numeric columns are correlated: the frame still
# holds string-valued categorical columns (e.g. 'gender'), and recent pandas
# versions raise instead of silently dropping them in DataFrame.corr().
plt.figure(figsize=(20, 10))
sns.heatmap(df.select_dtypes(include='number').corr())

# Average score by gender.
plt.figure(figsize=(20, 10))
sns.boxplot(x='gender', y='avg_score', data=df)

# Average score by test preparation course, split by gender.
plt.figure(figsize=(20, 10))
sns.boxplot(y='avg_score', x='test preparation course', hue='gender', data=df)

# Number of students per race/ethnicity group.
plt.figure(figsize=(20, 10))
sns.countplot(x='race/ethnicity', data=df)

# Number of students per parental level of education.
plt.figure(figsize=(20, 10))
sns.countplot(x='parental level of education', data=df)

# Pairwise relationships between the columns.
sns.pairplot(df)

pd.unique(df['parental level of education'])

# NOTE(review): `hue` is given without a `y` variable here; presumably the
# intent was avg_score per parental level of education -- confirm before
# changing the call.
sns.barplot(x='avg_score', hue='parental level of education', data=df)

# Function that prints summary statistics of column given in parameters
def summary_statistics(col, df):
    """Print summary statistics and a few head-counts for one column.

    Args:
        col: Name of the column in ``df`` to summarise.
        df: DataFrame that holds the column.

    Prints the mean, max, min and median of the column, followed by the
    number of rows whose value equals the maximum, the minimum, the floor or
    ceiling of the mean, the median, and the 25th and 75th percentiles.
    Nothing is returned.
    """
    scores = df[col]
    mean_score = scores.mean()
    max_score = scores.max()
    min_score = scores.min()
    median_score = scores.median()

    def count_equal(value):
        # Number of non-null entries that are exactly equal to ``value``.
        return scores[scores == value].count()

    print('Mean: {}'.format(mean_score))
    print('Max: {}'.format(max_score))
    print('Min: {}'.format(min_score))
    print('Median: {}'.format(median_score))
    print()

    # Number of students with maximum and minimum score
    print("Number of students who scored maximum score: {}".format(count_equal(max_score)))
    print("Number of students who scored minimum score: {}".format(count_equal(min_score)))
    print()

    # Students close to mean i.e. Students that have scores equal to
    # floor(mean score) or ceiling(mean score)
    if math.isnan(mean_score):
        # No usable values: math.floor/ceil would raise on NaN.
        near_mean_tot = 0
    else:
        near_mean_floor = math.floor(mean_score)
        near_mean_ceil = math.ceil(mean_score)
        near_mean_tot = count_equal(near_mean_floor)
        # When the mean is a whole number floor == ceil; adding the second
        # count would count the same students twice.
        if near_mean_ceil != near_mean_floor:
            near_mean_tot += count_equal(near_mean_ceil)
    print("Number of students close to mean score: {}".format(near_mean_tot))
    print()

    # Students that have 50th percentile
    print("Number of students at median score: {}".format(count_equal(median_score)))

    # Students with 25th percentile and 75th percentile scores
    print("Number of students at 25th percentile: {}".format(count_equal(scores.quantile(0.25))))
    print("Number of students at 75th percentile: {}".format(count_equal(scores.quantile(0.75))))

# Print the same statistics report for each of the three subject scores.
for score_column in ("math score", "reading score", "writing score"):
    summary_statistics(score_column, df)

# Students that scored strictly above the median of a score column: print how
# their 'test preparation course' values are distributed, per score column.

# Maths
print("Maths")
df_top_math = df[df["math score"] > df["math score"].median()]
print(df_top_math["test preparation course"].value_counts())
print()

# Reading
print("Reading")
df_top_read = df[df["reading score"] > df["reading score"].median()]
print(df_top_read["test preparation course"].value_counts())
print()

# Writing
print("Writing")
df_top_writ = df[df["writing score"] > df["writing score"].median()]
print(df_top_writ["test preparation course"].value_counts())
print()

# Average score -- stored under its own name so the writing subset above is
# not overwritten.
print("Average score")
df_top_avg = df[df["avg_score"] > df["avg_score"].median()]
print(df_top_avg["test preparation course"].value_counts())
print()

# Students that scored at or below the median of a score column: print how
# their 'test preparation course' values are distributed, per score column.

# Maths
print("Maths")
df_bot_math = df[df["math score"] <= df["math score"].median()]
print(df_bot_math["test preparation course"].value_counts())
print()

# Reading
print("Reading")
df_bot_read = df[df["reading score"] <= df["reading score"].median()]
print(df_bot_read["test preparation course"].value_counts())
print()

# Writing
print("Writing")
df_bot_writ = df[df["writing score"] <= df["writing score"].median()]
print(df_bot_writ["test preparation course"].value_counts())
print()

# Average score -- this is a bottom-half subset, so it gets a "bot" name
# instead of reusing a "top" variable.
print("Average score")
df_bot_avg = df[df["avg_score"] <= df["avg_score"].median()]
print(df_bot_avg["test preparation course"].value_counts())
print()

def graphs(score_type, suptitle, groupbyterm, kind):
    """Draw a 2x3 overview figure of one score column split by a category.

    Reads the module-level ``df``. The top row holds pie charts of the
    ``groupbyterm`` value counts for all students, for female students and
    for male students. The bottom row holds a bar chart of the mean
    ``score_type`` per group for both genders, followed by one ``kind`` plot
    of ``score_type`` per group for female and for male students.

    Args:
        score_type: Column whose values are plotted in the bottom row.
        suptitle: Title placed above the whole figure.
        groupbyterm: Categorical column used to split the students.
        kind: Plot kind forwarded to the per-group pandas plot call.

    Returns:
        The ``(fig, axes)`` pair created by ``plt.subplots``.
    """
    n_rows, n_cols, cell_inches = 2, 3, 5
    females = df[df['gender'] == 'female']
    males = df[df['gender'] == 'male']

    fig, axes = plt.subplots(
        nrows=n_rows,
        ncols=n_cols,
        figsize=(n_cols * cell_inches, n_rows * cell_inches),
    )
    fig.suptitle(suptitle)

    def capitalize_legend(ax):
        # Rebuild the legend of ``ax`` with capitalised entry labels.
        labels = [entry.get_text().capitalize() for entry in ax.legend().get_texts()]
        ax.legend(labels)

    # Top row: share of each group overall and per gender.
    pie_panels = (("Students Overall", df), ("Female", females), ("Male", males))
    for column_index, (panel_title, subset) in enumerate(pie_panels):
        shares = subset[groupbyterm].value_counts().rename("")
        shares.plot.pie(ax=axes[0, column_index], title=panel_title, autopct="%.2f", legend=False)

    # Bottom left: mean score per group, female and male side by side.
    bar_ax = axes[1, 0]
    group_means = pd.concat(
        [
            females.groupby(groupbyterm)[score_type].mean().rename("Female"),
            males.groupby(groupbyterm)[score_type].mean().rename("Male"),
        ],
        axis=1,
    )
    group_means.plot(kind="bar", ax=bar_ax, legend=True)
    bar_ax.set_xlabel("")
    capitalize_legend(bar_ax)
    bar_ax.set_xticklabels([tick.get_text().capitalize() for tick in bar_ax.get_xticklabels()])

    # Bottom middle and right: per-group score plots for each gender.
    score_panels = ((axes[1, 1], "Female Scores", females), (axes[1, 2], "Male Scores", males))
    for score_ax, x_label, subset in score_panels:
        subset.groupby(groupbyterm)[score_type].plot(
            kind=kind, ax=score_ax, legend=True, alpha=0.8, histtype="step"
        )
        score_ax.set_xlabel(x_label)
        score_ax.set_ylabel("")
        score_ax.set_xticks(np.arange(0, 101, step=20))
        score_ax.set_yticks(np.arange(0, 101, step=20))
        capitalize_legend(score_ax)

    return fig, axes

# Draw the average-score overview figure for each categorical column in turn;
# ``fig`` and ``axes`` end up referring to the last figure drawn.
for figure_title, category_column in (
    ("Avg Scores by Parental Level of Education", "parental level of education"),
    ("Avg Scores by Race/Ethnicity", "race/ethnicity"),
    ("Avg Scores by Lunch", "lunch"),
    ("Avg Scores by Test Preparation Course", "test preparation course"),
):
    fig, axes = graphs("avg_score", figure_title, category_column, "hist")

# Names of all score columns, including the derived average.
score_cols = ['math score', 'reading score','writing score', 'avg_score']
from scipy.stats import norm


def Plot_Dist(df, col):
    """Show the distribution of ``df[col]`` next to a Q-Q plot of the same column.

    Args:
        df: DataFrame that holds the column.
        col: Name of the column to plot; also used in the figure title.

    Draws a two-panel figure and displays it with ``plt.show()``. Nothing is
    returned.
    """
    # Imported here, so statsmodels is only needed when this function runs.
    from statsmodels.graphics.gofplots import qqplot

    fig, axarr = plt.subplots(1, 2, figsize=(12, 4))
    # Left panel: histogram of the column with a fitted normal curve.
    sns.distplot(df[col], fit=norm, kde=False, ax=axarr[0])
    # Right panel: Q-Q plot of the requested column (previously always
    # 'math score', regardless of ``col``).
    qqplot(df[col], line='s', ax=axarr[1])
    fig.suptitle(col + ' distribution', fontsize=14)
    plt.show()

# Distribution and Q-Q figure for every score column.
for score_column in score_cols:
    Plot_Dist(df, col=score_column)


# Joint plots of the math score against the reading and writing scores.
ax1 = sns.jointplot(data=df, x="math score", y="reading score")
plt.show()

ax2 = sns.jointplot(data=df, x="math score", y="writing score")
plt.show()

def Plot_Set(df, xcol, ycols):
    """Draw one box plot per column in ``ycols``, grouped by ``xcol``.

    Each box is annotated with the median of its group. The figure gets one
    panel per entry of ``ycols`` (the panel count used to be fixed at four).

    Args:
        df: DataFrame that holds ``xcol`` and every column in ``ycols``.
        xcol: Categorical column that defines the boxes of each panel.
        ycols: Iterable of column names; one panel is drawn for each.

    Displays the figure with ``plt.show()``; nothing is returned. When
    ``ycols`` is empty no figure is created.
    """
    ycols = list(ycols)
    if not ycols:
        # plt.subplots cannot create a grid with zero columns.
        return

    # Presumably sorted so the order in which categories first appear matches
    # the sorted group order of the groupby medians below -- TODO confirm.
    df = df.sort_values(by=xcol)

    # squeeze=False keeps the axes in a 2-D array even for a single panel.
    fig, axarr = plt.subplots(1, len(ycols), figsize=(12, 5), squeeze=False)
    for ax, ycol in zip(axarr[0], ycols):
        medians = df.groupby(xcol)[ycol].median().values
        sns.boxplot(x=xcol, y=ycol, data=df, width=0.5, palette='Set3', ax=ax, linewidth=0.5)
        # Write each group's median slightly above its median line.
        for position, median in enumerate(medians):
            ax.text(
                position,
                median + 0.5,
                str(np.round(median, 2)),
                horizontalalignment='center',
                size='medium',
                color='k',
                weight='semibold',
            )
        ax.set_ylim([0, 105])
        plt.setp(ax.get_xticklabels(), rotation=25, ha='right')
    plt.tight_layout()
    plt.show()

# Box plots of every score column, split by each categorical column in turn.
for category_column in ('gender', 'race/ethnicity', 'parental level of education', 'lunch'):
    Plot_Set(
        df,
        xcol=category_column,
        ycols=['math score', 'reading score', 'writing score', 'avg_score'],
    )

df.head()

# One-hot encode each categorical column and append the indicator columns.
# NOTE(review): every prefix already ends in '_'; if get_dummies inserts its
# own separator too, the generated names contain a double underscore -- confirm.
for category_column, dummy_prefix in (
    ('gender', 'gender_'),
    ('race/ethnicity', 'race_'),
    ('parental level of education', 'edu_'),
    ('lunch', 'lunch_'),
    ('test preparation course', 'course_'),
):
    df = pd.concat([df, pd.get_dummies(df[category_column], prefix=dummy_prefix)], axis=1)

# The raw categorical columns are dropped once their indicators exist.
df = df.drop(columns=[
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
])

# The average score is the regression target; every score column is removed
# from the feature frame so only the indicator columns remain.
target = df['avg_score']
df = df.drop(columns=['math score', 'reading score', 'writing score', 'avg_score'])

# Hold out 33% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    df, target, test_size=0.33, random_state=17
)

# Depth-limited regression tree with a fixed seed. It is built once and used
# for both the cross-validation and the final fit.
dtr = DecisionTreeRegressor(random_state=17, max_depth=4)

# 10-fold cross-validated negative mean squared error on the full feature frame.
cross_val_score(dtr, df, target, scoring='neg_mean_squared_error', cv=10)

# Fit on the training split, predict the held-out rows and compute their
# mean absolute error.
y_pred = dtr.fit(X_train, y_train).predict(X_test)
mean_absolute_error(y_test, y_pred)