# Check how many values are missing in the category_desc column
print(volunteer['category_desc'].isnull().sum())
import missingno as msno
msno.heatmap(df)  # visualizes correlation of missingness between columns
msno.dendrogram(df)  # shows a tree diagram of missingness correlation
# Generate dummy values so missingness can be compared between missing and non-missing data
from numpy.random import rand

def fill_dummy_values(df, scaling_factor=1):
    df_dummy = df.copy(deep=True)
    for col_name in df_dummy:
        col = df_dummy[col_name]
        col_null = col.isnull()
        # Number of missing values in the column
        num_nulls = col_null.sum()
        # Range of the column's observed values
        col_range = col.max() - col.min()
        # Place dummy values below the column minimum, scaled by scaling_factor
        dummy_values = (rand(num_nulls) - 2) * scaling_factor * col_range + col.min()
        col[col_null] = dummy_values
    return df_dummy
#can visualize the results with a scatterplot
# Fill dummy values in diabetes_dummy
diabetes_dummy = fill_dummy_values(diabetes)
# Sum the nullity of the two plotted columns
nullity = diabetes['Skin_Fold'].isnull() + diabetes['BMI'].isnull()
# Create a scatter plot of Skin Fold and BMI, coloring points by nullity
diabetes_dummy.plot(x='Skin_Fold', y='BMI', kind='scatter', alpha=0.5, c=nullity, cmap='rainbow')
#visualizing missing data
import missingno as msno
msno.bar(df)     # visualizes missing data as a bar chart (remember to plt.show())
msno.matrix(df)  # shows missing data as a matrix; the dataframe can be sliced (e.g. by date range) to inspect patterns
pairwise deletion - skips missing values calculation by calculation (pandas does this automatically); see the short example below
listwise deletion - use df.dropna() to remove data by row or column; only use when missing data is MCAR (missing completely at random)
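A minimal sketch of pairwise handling, using a small hypothetical DataFrame: pandas statistics simply skip missing entries on a per-calculation basis.
import pandas as pd
import numpy as np
df_small = pd.DataFrame({'A': [1.0, 2.0, np.nan, 4.0],
                         'B': [np.nan, 1.0, 2.0, 3.0]})
print(df_small['A'].mean())  # the NaN in A is skipped
print(df_small.corr())       # correlations use pairwise-complete observations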
# Drop all rows where Gender is missing
no_gender = so_survey_df.dropna(subset=['Gender'])
interpolation - replacing missing values with values estimated from the surrounding data
preferred method for time-series data
example:
df.interpolate(method='linear', inplace=True)
#Simple Imputer
from sklearn.impute import SimpleImputer
df_copy = df.copy(deep=True) #makes copy for comparison to original
si = SimpleImputer(strategy='mean')  # strategies: 'mean', 'median', 'most_frequent' (mode), or 'constant' (then set fill_value=...)
df_copy.iloc[:,:] = si.fit_transform(df_copy)
from fancyimpute import KNN, IterativeImputer
#KNN uses K nearest neighbors to replace missing values
#IterativeImputer uses multiple regressions to replace missing values (most robust)
example:
ki = KNN()
df_copy = df.copy(deep=True) #make copy for comparison to original
df_copy.iloc[:,:] = ki.fit_transform(df_copy)
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
# import models to use for imputation
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge
imputer = IterativeImputer(BayesianRidge()) #insert model to use and arguments
impute_data = pd.DataFrame(imputer.fit_transform(full_data))
convert, then impute: if the data are strings, encode them to numbers first, then fill the NaNs (e.g. with the most frequent value or KNN), then convert back
# IterativeImputer for categorical data
imputer = IterativeImputer(ExtraTreesRegressor()) #use model and arguments
# impute data and convert
encode_data = pd.DataFrame(np.round(imputer.fit_transform(impute_data)),columns = impute_data.columns)
#function that loops through each column, encodes strings to integers, imputes missing
#values with KNN, and writes the decoded columns back to the original dataframe
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from fancyimpute import KNN

# Create an empty dictionary ordinal_enc_dict
ordinal_enc_dict = {}

def cat_data_imputer(df):
    for col_name in df:
        # Create an ordinal encoder for this column
        ordinal_enc_dict[col_name] = OrdinalEncoder()
        col = df[col_name]
        # Select the non-null values of the column
        col_not_null = col[col.notnull()]
        reshaped_vals = col_not_null.values.reshape(-1, 1)
        encoded_vals = ordinal_enc_dict[col_name].fit_transform(reshaped_vals)
        # Store the encoded values back in the column
        df.loc[col.notnull(), col_name] = np.squeeze(encoded_vals)
    # Create the KNN imputer
    KNN_imputer = KNN()
    # Impute and round the DataFrame
    df.iloc[:, :] = np.round(KNN_imputer.fit_transform(df))
    # Loop over the column names
    for col_name in df:
        # Reshape the data
        reshaped = df[col_name].values.reshape(-1, 1)
        # Perform the inverse transform of the ordinally encoded columns
        df[col_name] = ordinal_enc_dict[col_name].inverse_transform(reshaped)
    return df
# Convert the hits column to type int
volunteer["hits"] = volunteer["hits"].astype(int)
stratified sampling takes the class distribution into account when splitting into train and test sets
# Use stratified sampling to split up the dataset according to the volunteer_y class distribution
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(volunteer_X, volunteer_y, stratify=volunteer_y)
# Print out the category_desc counts on the training y labels
print(y_train['category_desc'].value_counts())
As categorical variables need to be treated in a particular manner, as you'll see later on, you need to make sure to identify which variables are categorical. In some cases this is easy (e.g. when they are stored as strings); in other cases they are numeric and the fact that they are categorical is not immediately apparent, so this may not be trivial. A first thing you can do is use the .info() and .describe() methods to get a better sense of the data: .info() will give you the data types (strings, integers, etc.), but even then continuous variables might have been imported as strings, so it's very important to really have a look at your data.
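A quick sketch of that first inspection, assuming a DataFrame df (the 'origin' column name is just illustrative):
print(df.info())               # data types and non-null counts per column
print(df.describe())           # summary statistics for the numeric columns
print(df['origin'].nunique())  # few unique values hints that a numeric column is really categorical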
When you want to use categorical variables in regression models, they need to be transformed. There are two approaches to this: label encoding (each category becomes an integer) and dummy/one-hot encoding (each category becomes its own binary column).
from sklearn.preprocessing import LabelEncoder
lb_make = LabelEncoder()
origin_encoded = lb_make.fit_transform(cat_origin)
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html
#pandas
pd.get_dummies(cat_origin)
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html
#sklearn
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
origin_dummies = lb.fit_transform(cat_origin)
# you need to convert this back to a dataframe
origin_dum_df = pd.DataFrame(origin_dummies,columns=lb.classes_)
creating a new column that is used as an either/or (binary) indicator
# Create the Paid_Job column filled with zeros
so_survey_df['Paid_Job'] = 0
# Replace all the Paid_Job values where ConvertedSalary is > 0
so_survey_df.loc[so_survey_df['ConvertedSalary'] > 0, 'Paid_Job'] = 1
# Print the first five rows of the columns
print(so_survey_df[['Paid_Job', 'ConvertedSalary']].head())
creating bins that group specific numeric ranges together
# Bin the continuous variable ConvertedSalary into 5 bins
so_survey_df['equal_binned'] = pd.cut(so_survey_df['ConvertedSalary'], bins = 5)
# Print the first 5 rows of the equal_binned column
print(so_survey_df[['equal_binned', 'ConvertedSalary']].head())
# Import numpy
import numpy as np
# Specify the boundaries of the bins
bins = [-np.inf, 10000, 50000, 100000, 150000, np.inf]
# Bin labels
labels = ['Very low', 'Low', 'Medium', 'High', 'Very high']
# Bin the continuous variable ConvertedSalary using these boundaries
so_survey_df['boundary_binned'] = pd.cut(so_survey_df['ConvertedSalary'],
labels=labels, bins=bins)
# Print the first 5 rows of the boundary_binned column
print(so_survey_df[['boundary_binned', 'ConvertedSalary']].head())
#instantiate the encoder to use
encoder = OrdinalEncoder()
# create a list of categorical columns to iterate over
cat_cols = ['embarked','class1','deck1','who','embark_town','sex','adult_male','alive','alone']
def encode(data):
    '''function to encode non-null data and replace it in the original data'''
    # retain only non-null values
    nonulls = np.array(data.dropna())
    # reshape the data for encoding
    impute_reshape = nonulls.reshape(-1, 1)
    # encode the data
    impute_ordinal = encoder.fit_transform(impute_reshape)
    # assign the encoded values back to the non-null positions
    data.loc[data.notnull()] = np.squeeze(impute_ordinal)
    return data

# iterate through each categorical column in the data
for columns in cat_cols:
    encode(data[columns])
Because the idea behind regression is that you can change one variable while keeping the others constant, correlation between predictors (multicollinearity) is a problem: it indicates that changes in one predictor are associated with changes in another one as well. Because of this, the coefficient estimates can fluctuate heavily as a result of small changes in the model, and you may not be able to trust the p-values associated with correlated predictors.
pd.plotting.scatter_matrix(data,figsize = [11, 11]);
data.corr()
import seaborn as sns
sns.heatmap(data_pred.corr(), center=0);
The idea behind homoscedasticity is that, around every point of the regression line, the data is assumed to be spread in a "homogeneous" way, with more points close to the regression line and fewer points further away.
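A minimal way to eyeball this assumption is a residual plot (a sketch, assuming a fitted scikit-learn model reg_model and training data X, y, all hypothetical names):
import matplotlib.pyplot as plt
fitted = reg_model.predict(X)
residuals = y - fitted
plt.scatter(fitted, residuals, alpha=0.5)
plt.axhline(0, color='red')
plt.xlabel('fitted values')
plt.ylabel('residuals')  # a roughly even band around 0 suggests homoscedasticity
plt.show()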
Often, your dataset will contain features that vary widely in magnitude. If we leave these magnitudes unchanged, coefficient sizes will fluctuate widely in magnitude as well, which can give the false impression that some variables are less important than others.
Even though this is not always a formal issue when estimating linear regression models, it can be an issue in more advanced machine learning models. This is because most machine learning algorithms use the Euclidean distance between data points in their computations, so making sure that features have similar scales is formally required there. Some algorithms even require features to be zero-centered.
A good rule of thumb, however, is to check your features for normality and, while you're at it, scale your features so they have similar magnitudes, even for a "simple" model like linear regression; a quick check is sketched below.
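A hedged sketch of that check, assuming a numeric DataFrame called data:
import matplotlib.pyplot as plt
data.hist(figsize=(10, 8))  # visual check for roughly normal shapes
plt.show()
print(data.skew())          # skewness far from 0 suggests a transformation (e.g. log) may help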
Log transformation is a very useful tool when you have data that clearly does not follow a normal distribution. It can help reduce skewness when you have skewed data, and it can also reduce the variability of the data.
import numpy as np
data_log= pd.DataFrame([])
data_log["column"] = np.log(data["column"])
When performing min-max scaling, you can transform x to get the transformed $x'$ by using the formula: $$x' = \dfrac{x - \min(x)}{\max(x)-\min(x)}$$ This way of scaling brings values between 0 and 1
features_final["CRIM"] = (logcrim-min(logcrim))/(max(logcrim)-min(logcrim))
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(data[['column']])  # note the double brackets: the scaler expects a 2D array
When standardizing, $$x' = \dfrac{x - \bar x}{\sigma}$$ $x'$ will have mean $\mu = 0$ and standard deviation $\sigma = 1$. Note that standardization does not make data $more$ normal, it will just change the mean and the standard deviation!
features_final["DIS"] = (logdis-np.mean(logdis))/np.sqrt(np.var(logdis))
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(data[['column']])  # again, a 2D selection
When performing mean normalization, you use the following formula: $$x' = \dfrac{x - \text{mean}(x)}{\max(x)-\min(x)}$$ The distribution will have values between -1 and 1, and a mean of 0.
features_final["LSTAT"] = (loglstat-np.mean(loglstat))/(max(loglstat)-min(loglstat))
When performing unit vector transformations, you can create a new variable x' with a range [0,1] (for non-negative x): $$x'= \dfrac{x}{{||x||}}$$ Recall that the norm of x is $||x||= \sqrt{(x_1^2+x_2^2+...+x_n^2)}$
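A small sketch of unit-vector scaling on a single example row; scikit-learn's Normalizer does the same thing row-wise:
import numpy as np
from sklearn.preprocessing import Normalizer
x = np.array([[1.0, 3.0, 4.0]])
print(x / np.linalg.norm(x))                   # manual: divide by the L2 norm
print(Normalizer(norm='l2').fit_transform(x))  # same result with scikit-learn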
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer, PowerTransformer
ss = StandardScaler()
rs = RobustScaler()  # better for data with outliers
qt = QuantileTransformer(output_distribution='normal', n_quantiles=1000)  # best with uniform or bimodal distributions
yj = PowerTransformer(method='yeo-johnson')  # works with zero and negative values
bc = PowerTransformer(method='box-cox')  # only works with positive values
outliers can be removed using the mean and standard deviation: drop values more than 3 standard deviations from the mean
# Find the mean and standard dev
std = so_numeric_df['ConvertedSalary'].std()
mean = so_numeric_df['ConvertedSalary'].mean()
# Calculate the cutoff
cut_off = std * 3
lower, upper = mean - cut_off, mean + cut_off
# Trim the outliers
trimmed_df = so_numeric_df[(so_numeric_df['ConvertedSalary'] < upper) \
& (so_numeric_df['ConvertedSalary'] > lower)]
# The trimmed box plot
trimmed_df[['ConvertedSalary']].boxplot()
plt.show()
vectorizing text (tf-idf) to turn strings into numeric features in a dataset
# Take the title text
title_text = volunteer['title']
# Create the vectorizer method
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer()
# Transform the text into tf-idf vectors
text_tfidf = tfidf_vec.fit_transform(title_text)
# Split the dataset according to the class distribution of category_desc
y = volunteer["category_desc"]
X_train, X_test, y_train, y_test = train_test_split(text_tfidf.toarray(), y, stratify=y)
# Fit the model to the training data (nb assumed to be a Naive Bayes classifier, e.g. GaussianNB)
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train, y_train)
# Print out the model's accuracy
print(nb.score(X_test, y_test))
using regex to extract certain characters (here, numeric values) from strings
import re

# Write a pattern to extract numbers and decimals
def return_mileage(length):
    pattern = re.compile(r"\d+\.\d+")
    # Search the text for matches
    mile = re.match(pattern, length)
    # If a value is returned, use group(0) to return the found value
    if mile is not None:
        return float(mile.group(0))
# Apply the function to the Length column and take a look at both columns
hiking["Length_num"] = hiking["Length"].apply(lambda row: return_mileage(row))
print(hiking[["Length", "Length_num"]].head())
converting date formats and extracting the relevant components
# First, convert string column to date column
volunteer["start_date_converted"] = pd.to_datetime(volunteer['start_date_date'])
# Extract just the month from the converted column
volunteer["start_date_month"] = volunteer['start_date_converted'].apply(lambda row: row.month)
# Take a look at the converted and new month columns
print(volunteer[['start_date_converted', 'start_date_month']].head())
aggregating multiple columns into a single column
# Create a list of the columns to average
run_columns = ['run1', 'run2', 'run3', 'run4', 'run5']
# Use apply to create a mean column
running_times_5k["mean"] = running_times_5k.apply(lambda row: row[run_columns].mean(), axis=1)
# Take a look at the results
print(running_times_5k)
'''Distance equation for long,lat data used via stackoverflow from user Michael0x2a.
Updated to a function that converts to mileage'''
# constant values, if need to change end lat, long points, change the lat2, lon2 information
lat2 = np.array(clean.Latitude)
lon2 = np.array(clean.Longitude)
latr = np.array(list(map(lambda x: np.radians(x), lat2)))
lonr = np.array(list(map(lambda x: np.radians(x), lon2)))
def distance(lat1, lon1):
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    dlon = np.array(list(map(lambda x: (x - lon1), lonr)))
    dlat = np.array(list(map(lambda x: (x - lat1), latr)))
    # note: use the radian latitudes (latr) in the haversine formula
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(latr) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    # 6373.0 represents the earth's radius in kilometers
    kilo = 6373.0 * c
    miles = kilo * 0.62137119
    return miles
In statistics, an interaction is a particular property of three or more variables, where two or more variables interact in a non-additive manner when affecting a third variable. In other words, the two variables interact to have an effect that is more (or less) than the sum of their parts. Not accounting for them might lead to results that are wrong. You'll also notice that including them when they're needed will increase your $R^2$ value!
from itertools import combinations
from sklearn.model_selection import cross_val_score

# assumes df (the predictors), y (the target), regression (the model), crossvalidation (a CV splitter)
# and baseline (the r2 score to beat) are already defined
feature_combinations = list(combinations(df.columns, 2))
interactions = []
data = df.copy()
for comb in feature_combinations:
    data["interaction"] = data[comb[0]] * data[comb[1]]
    score = np.mean(cross_val_score(regression, data, y, scoring="r2", cv=crossvalidation))
    if score > baseline:
        interactions.append((comb[0], comb[1], round(score, 3)))
print("Top 3 interactions: %s" % sorted(interactions, key=lambda inter: inter[2], reverse=True)[:3])
df_inter = df.copy()#make a copy of dataframe so original is not affected
df_inter["RM_LSTAT"] = df["RM"] * df["LSTAT"] #combines the two features
df_inter["RM_TAX"] = df["RM"] * df["TAX"]
df_inter["RM_RAD"] = df["RM"] * df["RAD"]
When relationships between predictors and the outcome are not linear and show some sort of curvature, polynomials can be used to generate better approximations. The idea is that you can transform your input variable by, e.g., squaring it.
$\hat y = \hat \beta_0 + \hat \beta_1x + \hat \beta_2 x^2$
The use of polynomials is not restricted to quadratic relationships; you can explore cubic relationships and beyond as well! Imagine you want to go up to the power of 10: it would be quite annoying to transform your variable 9 times. Of course, scikit-learn has a built-in PolynomialFeatures option in its preprocessing module!
# assumes X, y (training data) and X_plot (points at which to plot predictions) are defined, plus matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

colors = ['yellow', 'lightgreen', 'blue']  # one color per degree
for index, degree in enumerate([2, 3, 4]):
    poly = PolynomialFeatures(degree)
    X_poly = poly.fit_transform(X)
    X_plot_poly = poly.fit_transform(X_plot)
    reg_poly = LinearRegression().fit(X_poly, y)
    y_plot = reg_poly.predict(X_plot_poly)
    plt.plot(X_plot, y_plot, color=colors[index], linewidth=2,
             label="degree %d" % degree)
    print("degree %d" % degree, r2_score(y, reg_poly.predict(X_poly)))
plt.legend(loc='lower left')
plt.show()