!pip install -I numpy==1.19.2
!pip install snowflake-connector-python
import warnings
warnings.filterwarnings("ignore")
!pip install -I pyarrow==5.0.0
# import basic data science libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
import snowflake.connector
import getpass
# using a simpler way to use your login info without embedding it in the notebook
# other enterprise connection patterns (e.g., SSO) are in the Snowflake docs: https://docs.snowflake.com/en/user-guide/python-connector-example.html
snowflake_username = getpass.getpass("Enter Snowflake Username")
snowflake_pwd = getpass.getpass("Enter Snowflake Password")
snowflake_acct = 'nna57244.us-east-1'
print(snowflake_username)
print(snowflake_acct)
# Gets the version
ctx = snowflake.connector.connect(
user=snowflake_username,
password=snowflake_pwd,
account=snowflake_acct
)
cs = ctx.cursor()
try:
cs.execute("SELECT current_version()")
one_row = cs.fetchone()
print(one_row[0])
#cs.cursor().execute("USE WAREHOUSE tiny_warehouse_mg")
cs.execute("USE DATABASE PREDICTIVE_MAINTENANCE")
#cs.execute("SELECT count(TYPE) from RAW_DEVICE_DATA where MACHINE_FAILURE = 1;")
#one_row = cs.fetchone()
#print("Records with Failures: " + str(one_row[0]))
query_output = cs.execute( "select TYPE, AIR_TEMPERATURE, PROCESS_TEMPERATURE, ROTATIONAL_SPEED, TORQUE, TOOL_WEAR, MACHINE_FAILURE from SUMMARY_SENSOR_DATA;" )
df = query_output.fetch_pandas_all() #.to_csv("/path/to/write/table.csv")
#df.to_csv("./data/full_snowflake_dataset.csv", index=False)
print( df )
finally:
cs.close()
ctx.close()
print( df.info() )
# map categorical variable 'diagnosis' into numeric
df["TYPE"] = df["TYPE"].map({'H': 2, 'M': 1, 'L': 0})
df.head()
# drop redundant columns / unneeded columns
# df.drop('id', axis=1, inplace=True)
'''
df.drop('UDI', axis=1, inplace=True)
df.drop('TWF', axis=1, inplace=True)
df.drop('HDF', axis=1, inplace=True)
df.drop('PWF', axis=1, inplace=True)
df.drop('OSF', axis=1, inplace=True)
df.drop('RNF', axis=1, inplace=True)
'''
# however, we took care of dropping these columns by simply not selecting them from snowflake
df.info()
df_num = df.select_dtypes(include = ['float64', 'int64', 'int8'])
print( df_num.info() )
# check for NA values
df_num.isna().any()
# look at the mean of different features for both the classes to see which features capture the variability
df_num.groupby('MACHINE_FAILURE').mean()
df_normal = df_num[df_num['MACHINE_FAILURE'] == 0]
df_failure = df_num[df_num['MACHINE_FAILURE'] == 1]
total_records = len(df_num.index)
total_normal_records = len(df_normal.index)
total_failure_records = len(df_failure.index)
percent_failure = total_failure_records / total_records
print("Total records in dataset: " + str(total_records))
print("Total normal records in dataset: " + str(total_normal_records))
print("Total failure records in dataset: " + str(total_failure_records))
print("Percent failure records in dataset: " + str(percent_failure))
# Plot frequency distributions of the features to visualize which features capture the most variability
fig = plt.figure(figsize=(10,20))
for i,col_name in enumerate(list(df_num.columns[0:30])):
i +=1
ax = fig.add_subplot(10,3,i)
sns.distplot(df_normal[col_name], label = 'Normal', kde = True, color= 'b')
sns.distplot(df_failure[col_name], label= 'Failure', kde = True, color= 'r')
ax.set_title(col_name)
sns.set_style("whitegrid")
plt.tight_layout()
handles, labels = ax.get_legend_handles_labels()
fig.legend(handles, labels, loc='upper right')
plt.show()
# calculate correlation matrix
corr = df_num.corr()# plot the heatmap
# , cmap=sns.diverging_palette(220, 20, as_cmap=True)
corr_heatmap = sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, fmt='.2f', annot_kws={'size': 12})
numerical = [
'AIR_TEMPERATURE', 'PROCESS_TEMPERATURE', 'ROTATIONAL_SPEED', 'TORQUE', 'TOOL_WEAR'
]
categorical = [
'TYPE'
]
#housing = housing[numerical + categorical]
#housing.shape
fig, ax = plt.subplots(2, 3, figsize=(15, 10))
#for var, subplot in zip(categorical, ax.flatten()):
# sns.boxplot(x='MACHINE_FAILURE', y='MACHINE_FAILURE', data=housing, ax=subplot)
# sns.boxplot(x='MACHINE_FAILURE', y='AIR_TEMPERATURE', data=df_num, ax=subplot)
for var, subplot in zip(numerical, ax.flatten()):
sns.boxplot(x='TYPE', y=var, data=df_num, ax=subplot, notch=True)
fig2, ax2 = plt.subplots()
sns.boxplot(x="AIR_TEMPERATURE", y="MACHINE_FAILURE", data=df_num, orient="h", palette={1:"red", 0:"blue"}, ax=ax2, notch=True)
ax2.get_yaxis().set_visible(False)
fig2.suptitle("Machine Failure vs Air Temperature")
color_patches = [
Patch(facecolor="blue", label="no failure"),
Patch(facecolor="red", label="failure")
]
ax2.legend(handles=color_patches);
df_normal = df_num[df_num['MACHINE_FAILURE'] == 0]
df_failure = df_num[df_num['MACHINE_FAILURE'] == 1]
type_count_normal_series = df_normal['TYPE'].value_counts()
print( type_count_normal_series )
type_count_fail_series = df_failure['TYPE'].value_counts()
print( type_count_fail_series )
type_0_ratio = type_count_fail_series.get(0) / type_count_normal_series.get(0)
type_1_ratio = type_count_fail_series.get(1) / type_count_normal_series.get(1)
type_2_ratio = type_count_fail_series.get(2) / type_count_normal_series.get(2)
print( type_0_ratio )
print( type_1_ratio )
print( type_2_ratio )
# need to account for the [ Type, Machine failure ] comparison
fig3, ax3 = plt.subplots()
sns.catplot(x="TYPE", hue="MACHINE_FAILURE", data=df_num, kind="count", palette={0:"blue", 1:"red"}, ax=ax3)
plt.close(1) # catplot creates an extra figure we don't need
#ax.legend(title="Machine Performance")
#ax.set_xticklabels(["Failure", "Normal"])
#ax.set_xlabel("Machine Type")
fig3.suptitle("Normal Operations vs Failure: Predictive Maintenance");
df_num.head(200)
def reduce_C_function_ratio(C):
all_count = len(C)
one_count = np.sum(C)
return one_count / float(all_count);
y = df_num['TORQUE']
x = df_num['ROTATIONAL_SPEED']
xmin = x.min()
xmax = x.max()
ymin = y.min()
ymax = y.max()
fig, axs = plt.subplots(ncols=1, figsize=(12, 10)) #sharey=True,
fig.subplots_adjust(hspace=0.5, left=0.07, right=0.93)
ax = axs
hb = ax.hexbin(x, y, C=df_num['MACHINE_FAILURE'], gridsize=50, cmap='inferno', reduce_C_function=reduce_C_function_ratio) #np.sum)
ax.set(xlim=(xmin, xmax), ylim=(ymin, ymax))
ax.set_title("Rotational Speed vs Torque - Ratios")
cb = fig.colorbar(hb, ax=ax)
cb.set_label('Ratio')
from itertools import chain, combinations
numerical = [
'AIR_TEMPERATURE', 'PROCESS_TEMPERATURE', 'ROTATIONAL_SPEED', 'TORQUE', 'TOOL_WEAR'
]
categorical = [
'TYPE'
]
fig, ax = plt.subplots(3, 3, figsize=(25, 20))
var_combinations = [(comb) for comb in combinations(numerical, 2)]
for var_c, subplot in zip(var_combinations, ax.flatten()):
var_a, var_b = var_c
#print( var_a)
y = df_num[var_a]
x = df_num[var_b]
xmin = x.min()
xmax = x.max()
ymin = y.min()
ymax = y.max()
#fig, axs = plt.subplots(ncols=1, figsize=(12, 10)) #sharey=True,
#fig.subplots_adjust(hspace=0.5, left=0.07, right=0.93)
ax = subplot
hb = ax.hexbin(x, y, C=df_num['MACHINE_FAILURE'], gridsize=50, cmap='OrRd', reduce_C_function=reduce_C_function_ratio) #np.sum)
ax.set(xlim=(xmin, xmax), ylim=(ymin, ymax))
ax.set_title(var_a + " vs " + var_b + " - Ratios")
cb = fig.colorbar(hb, ax=ax)
cb.set_label('Ratio')
#sns.boxplot(x='TYPE', y=var, data=df_num, ax=subplot, notch=True)