import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Data loading
df = pd.read_csv('grav.csv')
df.head()
height | seconds squared | |
---|---|---|
0 | 100 | 8.967 |
1 | 120 | 11.989 |
2 | 150 | 15.852 |
3 | 180 | 16.991 |
4 | 200 | 20.284 |
#selection de la première colonne de notre dataset (height)
X = df.iloc[0:len(df),0]
#selection de deuxième colonnes de notre dataset (seconds squared)
Y = df.iloc[0:len(df),1]
axes = plt.axes()
#dessiner une grille
axes.grid()
#X et Y sont les variables qu'on a extraite dans le partie précédente
plt.scatter(X,Y)
plt.show()
from scipy import stats
#linregress() renvoie plusieurs variables de retour. On s'interesse particulierement au slope et intercept
slope, intercept, r_value, p_value, std_err = stats.linregress(X, Y)
def predict(x):
return slope * x + intercept
#la variable fitLine sera un tableau de valeurs prédites depuis la tableau de variables X
fitLine = predict(X)
plt.plot(X, fitLine, c='r')
[<matplotlib.lines.Line2D at 0x240b5820940>]
print (slope)
0.10280912511759172
print (intercept)
-0.6801232361241745
!pip install pyreadstat
Requirement already satisfied: pyreadstat in c:\users\sirine\anaconda3\lib\site-packages (1.0.5) Requirement already satisfied: pandas>0.24.0 in c:\users\sirine\anaconda3\lib\site-packages (from pyreadstat) (1.0.5) Requirement already satisfied: pytz>=2017.2 in c:\users\sirine\anaconda3\lib\site-packages (from pandas>0.24.0->pyreadstat) (2020.1) Requirement already satisfied: python-dateutil>=2.6.1 in c:\users\sirine\anaconda3\lib\site-packages (from pandas>0.24.0->pyreadstat) (2.8.1) Requirement already satisfied: numpy>=1.13.3 in c:\users\sirine\anaconda3\lib\site-packages (from pandas>0.24.0->pyreadstat) (1.18.5) Requirement already satisfied: six>=1.5 in c:\users\sirine\anaconda3\lib\site-packages (from python-dateutil>=2.6.1->pandas>0.24.0->pyreadstat) (1.15.0)
import pyreadstat
df, metadata = pyreadstat.read_sav("test_scores.sav")
df.head(10)
school | school_setting | school_type | classroom | teaching_method | n_student | student_id | gender | lunch | pretest | posttest | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | ANKYI | 1.0 | 2.0 | 6OL | 0.0 | 20.0 | 2FHT3 | 1.0 | 2.0 | 62.0 | 72.0 |
1 | ANKYI | 1.0 | 2.0 | 6OL | 0.0 | 20.0 | 3JIVH | 1.0 | 2.0 | 66.0 | 79.0 |
2 | ANKYI | 1.0 | 2.0 | 6OL | 0.0 | 20.0 | 3XOWE | 0.0 | 2.0 | 64.0 | 76.0 |
3 | ANKYI | 1.0 | 2.0 | 6OL | 0.0 | 20.0 | 556O0 | 1.0 | 2.0 | 61.0 | 77.0 |
4 | ANKYI | 1.0 | 2.0 | 6OL | 0.0 | 20.0 | 74LOE | 0.0 | 2.0 | 64.0 | 76.0 |
5 | ANKYI | 1.0 | 2.0 | 6OL | 0.0 | 20.0 | 7YZO8 | 1.0 | 2.0 | 66.0 | 74.0 |
6 | ANKYI | 1.0 | 2.0 | 6OL | 0.0 | 20.0 | 9KMZD | 0.0 | 2.0 | 63.0 | 75.0 |
7 | ANKYI | 1.0 | 2.0 | 6OL | 0.0 | 20.0 | 9USQK | 1.0 | 2.0 | 63.0 | 72.0 |
8 | ANKYI | 1.0 | 2.0 | 6OL | 0.0 | 20.0 | CS5QP | 0.0 | 2.0 | 64.0 | 77.0 |
9 | ANKYI | 1.0 | 2.0 | 6OL | 0.0 | 20.0 | D6HT8 | 1.0 | 2.0 | 61.0 | 72.0 |
# Shape of our dataset
df.shape
# Info our dataset
df.info()
# Describe our dataset
df.describe()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2133 entries, 0 to 2132 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 school 2133 non-null object 1 school_setting 2133 non-null float64 2 school_type 2133 non-null float64 3 classroom 2133 non-null object 4 teaching_method 2133 non-null float64 5 n_student 2133 non-null float64 6 student_id 2133 non-null object 7 gender 2133 non-null float64 8 lunch 2133 non-null float64 9 pretest 2133 non-null float64 10 posttest 2133 non-null float64 dtypes: float64(8), object(3) memory usage: 183.4+ KB
school_setting | school_type | teaching_method | n_student | gender | lunch | pretest | posttest | |
---|---|---|---|---|---|---|---|---|
count | 2133.000000 | 2133.000000 | 2133.000000 | 2133.000000 | 2133.000000 | 2133.000000 | 2133.000000 | 2133.000000 |
mean | 1.814346 | 1.258322 | 0.356306 | 22.796531 | 0.495077 | 1.569620 | 54.955931 | 67.102203 |
std | 0.793525 | 0.437814 | 0.479019 | 4.228893 | 0.500093 | 0.495245 | 13.563101 | 13.986789 |
min | 1.000000 | 1.000000 | 0.000000 | 14.000000 | 0.000000 | 1.000000 | 22.000000 | 32.000000 |
25% | 1.000000 | 1.000000 | 0.000000 | 20.000000 | 0.000000 | 1.000000 | 44.000000 | 56.000000 |
50% | 2.000000 | 1.000000 | 0.000000 | 22.000000 | 0.000000 | 2.000000 | 56.000000 | 68.000000 |
75% | 2.000000 | 2.000000 | 1.000000 | 27.000000 | 1.000000 | 2.000000 | 65.000000 | 77.000000 |
max | 3.000000 | 2.000000 | 1.000000 | 31.000000 | 1.000000 | 2.000000 | 93.000000 | 100.000000 |
# Using pairplot we'll visualize the data for correlation
sns.pairplot(df, x_vars=['school','school_setting','school_type','classroom','teaching_method','n_student','student_id','gender', 'lunch','pretest'],
y_vars='posttest', size=4, aspect=1, kind='scatter')
plt.show()
C:\Users\sirine\anaconda3\lib\site-packages\seaborn\axisgrid.py:2071: UserWarning: The `size` parameter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning)
# Visualizing the data using heatmap
sns.heatmap(df.corr(), cmap="YlGnBu", annot = True)
plt.show()
# Creating X and y
X = df['pretest']
y = df['posttest']
plt.scatter(X,y)
plt.xlabel("pretest")
plt.ylabel("posttest")
Text(0, 0.5, 'posttest')
# Splitting the varaibles as training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7,
test_size = 0.3, random_state = 100)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-9-34727fdb23d8> in <module> 1 # Splitting the varaibles as training and testing 2 from sklearn.model_selection import train_test_split ----> 3 X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, 4 test_size = 0.3, random_state = 100) NameError: name 'y' is not defined
# Take a look at the train dataset
X_train
y_train
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-10-d84a877a88e6> in <module> 1 # Take a look at the train dataset ----> 2 X_train 3 y_train NameError: name 'X_train' is not defined
# Importing Statsmodels.api library from Stamodel package
import statsmodels.api as sm
# Adding a constant to get an intercept
X_train_sm = sm.add_constant(X_train)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-11-d3fe73289f10> in <module> 3 4 # Adding a constant to get an intercept ----> 5 X_train_sm = sm.add_constant(X_train) NameError: name 'X_train' is not defined
# Fitting the resgression line using 'OLS'
lr = sm.OLS(y_train, X_train_sm).fit()
# Printing the parameters
lr.params
# Performing a summary to list out all the different parameters of the regression line fitted
lr.summary()
# Visualizing the regression line
plt.scatter(X_train, y_train)
plt.plot(X_train, 13.3750 + 0.9783*X_train, 'r')
plt.show()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-12-8dda201fd718> in <module> 1 # Visualizing the regression line ----> 2 plt.scatter(X_train, y_train) 3 plt.plot(X_train, 13.3750 + 0.9783*X_train, 'r') 4 plt.show() NameError: name 'X_train' is not defined
# Predicting y_value using traingn data of X
y_train_pred = lr.predict(X_train_sm)
# Creating residuals from the y_train data and predicted y_data
res = (y_train - y_train_pred)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-13-a2ae6739e1e8> in <module> 1 # Predicting y_value using traingn data of X ----> 2 y_train_pred = lr.predict(X_train_sm) 3 4 # Creating residuals from the y_train data and predicted y_data 5 res = (y_train - y_train_pred) NameError: name 'lr' is not defined
# Plotting the histogram using the residual values
fig = plt.figure()
sns.distplot(res, bins = 15)
plt.title('Error Terms', fontsize = 15)
plt.xlabel('y_train - y_train_pred', fontsize = 15)
plt.show()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-14-6e074e4b2041> in <module> 1 # Plotting the histogram using the residual values 2 fig = plt.figure() ----> 3 sns.distplot(res, bins = 15) 4 plt.title('Error Terms', fontsize = 15) 5 plt.xlabel('y_train - y_train_pred', fontsize = 15) NameError: name 'res' is not defined
<Figure size 432x288 with 0 Axes>
# Looking for any patterns in the residuals
plt.scatter(X_train,res)
plt.show()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-15-c6fac56354f2> in <module> 1 # Looking for any patterns in the residuals ----> 2 plt.scatter(X_train,res) 3 plt.show() NameError: name 'X_train' is not defined
# Adding a constant to X_test
X_test_sm = sm.add_constant(X_test)
# Predicting the y values corresponding to X_test_sm
y_test_pred = lr.predict(X_test_sm)
# Printing the first 15 predicted values
y_test_pred
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-16-ddc9bfb50c79> in <module> 1 # Adding a constant to X_test ----> 2 X_test_sm = sm.add_constant(X_test) 3 4 # Predicting the y values corresponding to X_test_sm 5 y_test_pred = lr.predict(X_test_sm) NameError: name 'X_test' is not defined
# Importing r2_square
from sklearn.metrics import r2_score
# Checking the R-squared value
r_squared = r2_score(y_test, y_test_pred)
r_squared
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-17-d5000fb5184a> in <module> 3 4 # Checking the R-squared value ----> 5 r_squared = r2_score(y_test, y_test_pred) 6 r_squared NameError: name 'y_test' is not defined
# Visualize the line on the test set
plt.scatter(X_test, y_test)
plt.plot(X_test, y_test_pred, 'r')
plt.show()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-18-6c4d17cd0214> in <module> 1 # Visualize the line on the test set ----> 2 plt.scatter(X_test, y_test) 3 plt.plot(X_test, y_test_pred, 'r') 4 plt.show() NameError: name 'X_test' is not defined
# Splitting the data into train and test
from sklearn.model_selection import train_test_split
X_train_lm, X_test_lm, y_train_lm, y_test_lm = train_test_split(X, y, train_size = 0.7,
test_size = 0.3, random_state = 100)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-19-6d75773ddd9f> in <module> 1 # Splitting the data into train and test 2 from sklearn.model_selection import train_test_split ----> 3 X_train_lm, X_test_lm, y_train_lm, y_test_lm = train_test_split(X, y, train_size = 0.7, 4 test_size = 0.3, random_state = 100) NameError: name 'y' is not defined
# Shape of the train set without adding column
X_train_lm.shape
# Adding additional column to the train and test data
X_train_lm = X_train_lm.values.reshape(-1,1)
X_test_lm = X_test_lm.values.reshape(-1,1)
print(X_train_lm.shape)
print(X_test_lm.shape)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-20-85af6eca8381> in <module> 1 # Shape of the train set without adding column ----> 2 X_train_lm.shape 3 4 # Adding additional column to the train and test data 5 X_train_lm = X_train_lm.values.reshape(-1,1) NameError: name 'X_train_lm' is not defined
from sklearn.linear_model import LinearRegression
# Creating an object of Linear Regression
lm = LinearRegression()
# Fit the model using .fit() method
lm.fit(X_train_lm, y_train_lm)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-21-94aa0ebafdeb> in <module> 5 6 # Fit the model using .fit() method ----> 7 lm.fit(X_train_lm, y_train_lm) NameError: name 'X_train_lm' is not defined
# Intercept value
print("Intercept :",lm.intercept_)
# Slope value
print('Slope :',lm.coef_)
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-22-76c377604146> in <module> 1 # Intercept value ----> 2 print("Intercept :",lm.intercept_) 3 4 # Slope value 5 print('Slope :',lm.coef_) AttributeError: 'LinearRegression' object has no attribute 'intercept_'
# Making Predictions of y_value
y_train_pred = lm.predict(X_train_lm)
y_test_pred = lm.predict(X_test_lm)
# Comparing the r2 value of both train and test data
print(r2_score(y_train,y_train_pred))
print(r2_score(y_test,y_test_pred))
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-23-729f77bced32> in <module> 1 # Making Predictions of y_value ----> 2 y_train_pred = lm.predict(X_train_lm) 3 y_test_pred = lm.predict(X_test_lm) 4 5 # Comparing the r2 value of both train and test data NameError: name 'X_train_lm' is not defined
#loading the data
boston = pd.read_csv('housing.data.txt' , sep=",")
print("housing's shape : ", boston.shape)
#print the first 10 rows of the data
boston.head(10)
housing's shape : (506, 14)
CIRM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00632 | 18.0 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296.0 | 15.3 | 396.90 | 4.98 | 24.0 |
1 | 0.02731 | 0.0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242.0 | 17.8 | 396.90 | 9.14 | 21.6 |
2 | 0.02729 | 0.0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242.0 | 17.8 | 392.83 | 4.03 | 34.7 |
3 | 0.03237 | 0.0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222.0 | 18.7 | 394.63 | 2.94 | 33.4 |
4 | 0.06905 | 0.0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222.0 | 18.7 | 396.90 | 5.33 | 36.2 |
5 | 0.02985 | 0.0 | 2.18 | 0 | 0.458 | 6.430 | 58.7 | 6.0622 | 3 | 222.0 | 18.7 | 394.12 | 5.21 | 28.7 |
6 | 0.08829 | 12.5 | 7.87 | 0 | 0.524 | 6.012 | 66.6 | 5.5605 | 5 | 311.0 | 15.2 | 395.60 | 12.43 | 22.9 |
7 | 0.14455 | 12.5 | 7.87 | 0 | 0.524 | 6.172 | 96.1 | 5.9505 | 5 | 311.0 | 15.2 | 396.90 | 19.15 | 27.1 |
8 | 0.21124 | 12.5 | 7.87 | 0 | 0.524 | 5.631 | 100.0 | 6.0821 | 5 | 311.0 | 15.2 | 386.63 | 29.93 | 16.5 |
9 | 0.17004 | 12.5 | 7.87 | 0 | 0.524 | 6.004 | 85.9 | 6.5921 | 5 | 311.0 | 15.2 | 386.71 | 17.10 | 18.9 |
After loading the data, we can check if there are any missing values in the data. We count the number of missing values for each feature using isnull().
boston.isnull().sum()
CIRM 0 ZN 0 INDUS 0 CHAS 0 NOX 0 RM 0 AGE 0 DIS 0 RAD 0 TAX 0 PTRATIO 0 B 0 LSTAT 0 MEDV 0 dtype: int64
Exploratory Data Analysis is a very important step before training the model. In this section, we will use some visualizations to understand the relationship of the target variable with other features.
#plot the distribution of the target variable MEDV
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.distplot(boston['MEDV'], bins=30)
plt.show()
We see that the values of MEDV are distributed normally with few outliers.
Next, we create a correlation matrix that measures the linear relationships between the variables. The correlation matrix can be formed by using the corr function from the pandas dataframe library. We will use the heatmap function from the seaborn library to plot the correlation matrix.
correlation_matrix = boston.corr().round(2)
#annot = True to print the values inside the square
sns.heatmap(data=correlation_matrix, annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x240b5846370>
The correlation coefficient ranges from -1 to 1. If the value is close to 1, it means that there is a strong positive correlation between the two variables. When it is close to -1, the variables have a strong negative correlation.
To fit a linear regression model, we select those features which have a high correlation with our target variable MEDV. By looking at the correlation matrix we can see that RM has a strong positive correlation with MEDV (0.7) where as LSTAT has a high negative correlation with MEDV(-0.74).
An important point in selecting features for a linear regression model is to check for multi-co-linearity. The features RAD, TAX have a correlation of 0.91. These feature pairs are strongly correlated to each other. We should not select both these features together for training the model. Same goes for the features DIS and AGE which have a correlation of -0.75.
Based on the above observations we will RM and LSTAT as our features. Using a scatter plot let’s see how these features vary with MEDV.
plt.figure(figsize=(20, 5))
features = ['LSTAT', 'RM']
target = boston['MEDV']
for i, col in enumerate(features):
plt.subplot(1, len(features) , i+1)
x = boston[col]
y = target
plt.scatter(x, y, marker='o')
plt.title(col)
plt.xlabel(col)
plt.ylabel('MEDV')
The prices increase as the value of RM increases linearly. There are few outliers and the data seems to be capped at 50.
The prices tend to decrease with an increase in LSTAT. Though it doesn’t look to be following exactly a linear line.
We concatenate the LSTAT and RM columns using np.c_ provided by the numpy library.
X = pd.DataFrame(np.c_[boston['LSTAT'], boston['RM']], columns = ['LSTAT','RM'])
Y = boston['MEDV']
Next, we split the data into training and testing sets. We train the model with 70% of the samples and test with the remaining 30%. We do this to assess the model’s performance on unseen data. To split the data we use train_test_split function provided by scikit-learn library. We finally print the sizes of our training and test set to verify if the splitting has occurred properly.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state=5)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)
(354, 2) (152, 2) (354,) (152,)
We use scikit-learn’s LinearRegression to train our model on both the training and test sets.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
lin_model = LinearRegression()
lin_model.fit(X_train, Y_train)
LinearRegression()
We will evaluate our model using RMSE and R2-score.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# model evaluation for training set
y_train_predict = lin_model.predict(X_train)
rmse = (np.sqrt(mean_squared_error(Y_train, y_train_predict)))
r2 = r2_score(Y_train, y_train_predict)
print("The model performance for training set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse))
print('R2 score is {}'.format(r2))
print("\n")
# model evaluation for testing set
y_test_predict = lin_model.predict(X_test)
rmse = (np.sqrt(mean_squared_error(Y_test, y_test_predict)))
r2 = r2_score(Y_test, y_test_predict)
print("The model performance for testing set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse))
print('R2 score is {}'.format(r2))
The model performance for training set -------------------------------------- RMSE is 5.291267334085863 R2 score is 0.6493177656003104 The model performance for testing set -------------------------------------- RMSE is 6.041044703455955 R2 score is 0.6162026947666575