1. Resampling Imbalanced Data

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
In [30]:
df = pd.read_csv('creditcard_small.csv')
# df = df.loc[df.Class==1, :].append(df.loc[df.Class==0, :].sample(5000))
X = df.iloc[:,:-1]
y = df.iloc[:,-1].map({1:'Fraud', 0:'No Fraud'})
In [8]:
sns.scatterplot(x=X.V4, y=X.V6, hue=y, alpha=.5, legend=False)
# plt.savefig("distribution.png", dpi=300)
In [4]:
from imblearn.over_sampling import SMOTE, ADASYN, SMOTENC
# Either oversample the minority class until both classes are balanced ...
X_resampled, y_resampled = SMOTE(sampling_strategy='minority').fit_resample(X, y)
# ... or oversample to an explicit target count per class (this call overwrites the result above)
X_resampled, y_resampled = SMOTE(sampling_strategy={"Fraud": 1000}).fit_resample(X, y)
X_resampled = pd.DataFrame(X_resampled, columns=X.columns)
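A quick sanity check is to compare the class counts before and after resampling:
In [ ]:
# The resampled target should now contain many more 'Fraud' rows
print(y.value_counts())
print(pd.Series(y_resampled).value_counts())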
In [9]:
sns.scatterplot(x=X_resampled.V4, y=X_resampled.V6, hue=y_resampled, alpha=.5, legend=False)
# plt.savefig("resampled.png", dpi=300)

2. Creating New Features

In [39]:
turnover_df = pd.read_csv('turnover.csv')
turnover_df.head()
Out[39]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years sales salary
0 0.38 0.53 2 157 3 0 1 0 sales low
1 0.80 0.86 5 262 6 0 1 0 sales medium
2 0.11 0.88 7 272 4 0 1 0 sales medium
3 0.72 0.87 5 223 5 0 1 0 sales low
4 0.37 0.52 2 159 3 0 1 0 sales low
In [53]:
import featuretools as ft
import pandas as pd

# Create Entity
turnover_df = pd.read_csv('turnover.csv')
es = ft.EntitySet(id = 'Turnover')
es.entity_from_dataframe(entity_id = 'hr', dataframe = turnover_df, index = 'index')

# Run deep feature synthesis with transformation primitives
feature_matrix, feature_defs = ft.dfs(entityset = es, target_entity = 'hr',
                                      trans_primitives = ['add_numeric', 'multiply_numeric'], verbose=True)
2019-08-09 10:23:23,030 featuretools.entityset - WARNING    index index not found in dataframe, creating new integer column
Built 668 features
Elapsed: 00:12 | Remaining: 00:00 | Progress: 100%|██████████████████████████████████████████| Calculated: 11/11 chunks
In [58]:
feature_matrix[feature_matrix.columns[10:20]].head()
Out[58]:
average_montly_hours + last_evaluation number_project + promotion_last_5years left + promotion_last_5years average_montly_hours + Work_accident last_evaluation + Work_accident number_project + Work_accident left + Work_accident average_montly_hours + promotion_last_5years satisfaction_level + Work_accident promotion_last_5years + Work_accident
index
0 157.53 2 1 157 0.53 2 1 157 0.38 0
1 262.86 5 1 262 0.86 5 1 262 0.80 0
2 272.88 7 1 272 0.88 7 1 272 0.11 0
3 223.87 5 1 223 0.87 5 1 223 0.72 0
4 159.52 2 1 159 0.52 2 1 159 0.37 0
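add_numeric and multiply_numeric are only two of the built-in primitives; ft.list_primitives() returns the full catalogue of transform and aggregation primitives as a DataFrame:
In [ ]:
# Browse the catalogue of built-in feature primitives (name, type, description)
primitives = ft.list_primitives()
primitives.head(10)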

3. Handling Missing Values

In [ ]:
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

# Load data
titanic = pd.read_csv("titanic.csv")
titanic = titanic.loc[:, ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]

# Run imputer with a Random Forest estimator
imp = IterativeImputer(RandomForestRegressor(), max_iter=10, random_state=0)
titanic = pd.DataFrame(imp.fit_transform(titanic), columns=titanic.columns)
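
After fitting, isnull().sum() should report zero missing values in every column:
In [ ]:
# Every column should be complete once the imputer has filled the gaps
titanic.isnull().sum()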

4. Outlier Detection

In [22]:
from sklearn.ensemble import IsolationForest
import pandas as pd
import seaborn as sns

credit_card = pd.read_csv('creditcard_small.csv').drop(columns="Class")
clf = IsolationForest(contamination=0.01)  # flag roughly 1% of the points as outliers
outliers = clf.fit_predict(credit_card)
sns.scatterplot(x=credit_card.V4, y=credit_card.V2, hue=outliers, palette='Set1', legend=False)
# plt.savefig('outliers.png', dpi=300)
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x23a6f9b92e8>
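
fit_predict labels inliers as 1 and outliers as -1, so with contamination=0.01 roughly 1% of the rows should be flagged; a quick count confirms this:
In [ ]:
# -1 marks the points flagged as outliers, 1 the inliers
pd.Series(outliers).value_counts()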