In [1]:
!pip install -I numpy==1.19.2
!pip install snowflake-connector-python
import warnings
warnings.filterwarnings("ignore")
Collecting numpy==1.19.2
  Downloading numpy-1.19.2-cp37-cp37m-manylinux2010_x86_64.whl (14.5 MB)
     |████████████████████████████████| 14.5 MB 5.1 MB/s 
Installing collected packages: numpy
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.
albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.
Successfully installed numpy-1.19.2
Collecting snowflake-connector-python
  Downloading snowflake_connector_python-2.7.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.9 MB)
     |████████████████████████████████| 14.9 MB 4.1 MB/s 
Collecting oscrypto<2.0.0
  Downloading oscrypto-1.2.1-py2.py3-none-any.whl (192 kB)
     |████████████████████████████████| 192 kB 43.5 MB/s 
Collecting pyjwt<3.0.0
  Downloading PyJWT-2.3.0-py3-none-any.whl (16 kB)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.7/dist-packages (from snowflake-connector-python) (2.10)
Collecting pycryptodomex!=3.5.0,<4.0.0,>=3.2
  Downloading pycryptodomex-3.11.0-cp35-abi3-manylinux2010_x86_64.whl (1.9 MB)
     |████████████████████████████████| 1.9 MB 57.6 MB/s 
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from snowflake-connector-python) (2021.10.8)
Collecting asn1crypto<2.0.0,>0.24.0
  Downloading asn1crypto-1.4.0-py2.py3-none-any.whl (104 kB)
     |████████████████████████████████| 104 kB 59.6 MB/s 
Requirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from snowflake-connector-python) (2018.9)
Requirement already satisfied: cffi<2.0.0,>=1.9 in /usr/local/lib/python3.7/dist-packages (from snowflake-connector-python) (1.15.0)
Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.7/dist-packages (from snowflake-connector-python) (2.0.7)
Requirement already satisfied: requests<3.0.0 in /usr/local/lib/python3.7/dist-packages (from snowflake-connector-python) (2.23.0)
Requirement already satisfied: setuptools>34.0.0 in /usr/local/lib/python3.7/dist-packages (from snowflake-connector-python) (57.4.0)
Collecting pyOpenSSL<21.0.0,>=16.2.0
  Downloading pyOpenSSL-20.0.1-py2.py3-none-any.whl (54 kB)
     |████████████████████████████████| 54 kB 2.0 MB/s 
Collecting cryptography<4.0.0,>=3.1.0
  Downloading cryptography-3.4.8-cp36-abi3-manylinux_2_24_x86_64.whl (3.0 MB)
     |████████████████████████████████| 3.0 MB 39.2 MB/s 
Requirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi<2.0.0,>=1.9->snowflake-connector-python) (2.21)
Requirement already satisfied: six>=1.5.2 in /usr/local/lib/python3.7/dist-packages (from pyOpenSSL<21.0.0,>=16.2.0->snowflake-connector-python) (1.15.0)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0->snowflake-connector-python) (3.0.4)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0->snowflake-connector-python) (1.24.3)
Installing collected packages: cryptography, asn1crypto, pyOpenSSL, pyjwt, pycryptodomex, oscrypto, snowflake-connector-python
Successfully installed asn1crypto-1.4.0 cryptography-3.4.8 oscrypto-1.2.1 pyOpenSSL-20.0.1 pycryptodomex-3.11.0 pyjwt-2.3.0 snowflake-connector-python-2.7.1
In [2]:
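# fetch_pandas_all() in the Snowflake connector requires pyarrow; the 5.0.0 pin below
# is the version this notebook was tested against (assumed compatible with connector 2.7.x)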
!pip install -I pyarrow==5.0.0
Collecting pyarrow==5.0.0
  Downloading pyarrow-5.0.0-cp37-cp37m-manylinux2014_x86_64.whl (23.6 MB)
     |████████████████████████████████| 23.6 MB 1.5 MB/s 
Collecting numpy>=1.16.6
  Downloading numpy-1.21.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
     |████████████████████████████████| 15.7 MB 58 kB/s 
Installing collected packages: numpy, pyarrow
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
yellowbrick 1.3.post1 requires numpy<1.20,>=1.16.0, but you have numpy 1.21.4 which is incompatible.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.
albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.
Successfully installed numpy-1.21.4 pyarrow-5.0.0
In [3]:
# import basic data science libraries

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
import snowflake.connector
In [4]:
import getpass

# prompt for credentials with getpass so they are never embedded in the notebook
# other enterprise connection patterns (e.g., SSO) are covered in the Snowflake docs:
# https://docs.snowflake.com/en/user-guide/python-connector-example.html
# (a hedged SSO sketch follows this cell's output)
snowflake_username = getpass.getpass("Enter Snowflake Username")
snowflake_pwd = getpass.getpass("Enter Snowflake Password")
snowflake_acct = 'nna57244.us-east-1'

print(snowflake_username)
print(snowflake_acct)
Enter Snowflake Username··········
Enter Snowflake Password··········
xxxxxxxxx
nnaxxxxxx.us-east-1
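The Snowflake docs linked above also cover SSO-style logins. A minimal, hedged sketch of the browser-based flow is below; it assumes the account has SSO configured and reuses the username/account captured in this cell (it will not work in a headless environment such as Colab without extra setup).

# Hedged sketch: browser-based SSO instead of a password prompt (assumes SSO is configured).
ctx_sso = snowflake.connector.connect(
    user=snowflake_username,          # username captured above
    account=snowflake_acct,           # account identifier captured above
    authenticator="externalbrowser",  # delegates authentication to the identity provider
)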
In [5]:
# Connect to Snowflake, check the server version, and pull the sensor data into a pandas DataFrame
ctx = snowflake.connector.connect(
    user=snowflake_username,
    password=snowflake_pwd,
    account=snowflake_acct
)
cs = ctx.cursor()
try:
    cs.execute("SELECT current_version()")
    one_row = cs.fetchone()
    print(one_row[0])

    # cs.execute("USE WAREHOUSE tiny_warehouse_mg")
    cs.execute("USE DATABASE PREDICTIVE_MAINTENANCE")

    # cs.execute("SELECT count(TYPE) from RAW_DEVICE_DATA where MACHINE_FAILURE = 1;")
    # one_row = cs.fetchone()
    # print("Records with Failures: " + str(one_row[0]))

    query_output = cs.execute(
        "select TYPE, AIR_TEMPERATURE, PROCESS_TEMPERATURE, ROTATIONAL_SPEED, "
        "TORQUE, TOOL_WEAR, MACHINE_FAILURE from SUMMARY_SENSOR_DATA;"
    )

    df = query_output.fetch_pandas_all()
    # df.to_csv("./data/full_snowflake_dataset.csv", index=False)

    print(df)
finally:
    cs.close()

ctx.close()
5.41.1
     TYPE  AIR_TEMPERATURE  ...  TOOL_WEAR  MACHINE_FAILURE
0       M            298.1  ...        0.0                0
1       L            298.2  ...        3.0                0
2       L            298.1  ...        5.0                0
3       L            298.2  ...        7.0                0
4       L            298.2  ...        9.0                0
...   ...              ...  ...        ...              ...
5245    M            298.8  ...       14.0                0
5246    H            298.9  ...       17.0                0
5247    M            299.0  ...       22.0                0
5248    H            299.0  ...       25.0                0
5249    M            299.0  ...       30.0                0

[10000 rows x 7 columns]
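fetch_pandas_all() loads the entire result set into one DataFrame; likely because it concatenates per-batch frames, the index above runs 0 to 5249 even though there are 10,000 rows (reset_index() would tidy that up). For result sets that do not fit in memory, the connector also exposes fetch_pandas_batches(); a minimal sketch, assuming an open connection ctx and the same query:

# Hedged sketch: stream the query result in chunks rather than one big DataFrame.
cs = ctx.cursor()
cs.execute(
    "select TYPE, AIR_TEMPERATURE, PROCESS_TEMPERATURE, ROTATIONAL_SPEED, "
    "TORQUE, TOOL_WEAR, MACHINE_FAILURE from SUMMARY_SENSOR_DATA;"
)
chunks = [batch for batch in cs.fetch_pandas_batches()]  # one DataFrame per Arrow batch
df_batched = pd.concat(chunks, ignore_index=True)        # reset the index while combining
cs.close()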
In [6]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 5249
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   TYPE                 10000 non-null  object 
 1   AIR_TEMPERATURE      10000 non-null  float64
 2   PROCESS_TEMPERATURE  10000 non-null  float64
 3   ROTATIONAL_SPEED     10000 non-null  float64
 4   TORQUE               10000 non-null  float64
 5   TOOL_WEAR            10000 non-null  float64
 6   MACHINE_FAILURE      10000 non-null  int8   
dtypes: float64(5), int8(1), object(1)
memory usage: 556.6+ KB
In [7]:
# map the categorical variable 'TYPE' to numeric codes: H=2, M=1, L=0

df["TYPE"] = df["TYPE"].map({'H': 2, 'M': 1, 'L': 0})
df.head()
Out[7]:
   TYPE  AIR_TEMPERATURE  PROCESS_TEMPERATURE  ROTATIONAL_SPEED  TORQUE  TOOL_WEAR  MACHINE_FAILURE
0     1            298.1                308.6            1551.0    42.8        0.0                0
1     0            298.2                308.7            1408.0    46.3        3.0                0
2     0            298.1                308.5            1498.0    49.4        5.0                0
3     0            298.2                308.6            1433.0    39.5        7.0                0
4     0            298.2                308.7            1408.0    40.0        9.0                0
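The explicit map() keeps the ordinal coding (H=2, M=1, L=0) under our control; note that any TYPE value outside {'H', 'M', 'L'} would become NaN. A hedged alternative using pandas' ordered categorical dtype is sketched below on a toy Series, since df['TYPE'] has already been mapped at this point:

# Hedged sketch: ordinal encoding via an ordered categorical instead of an explicit map.
example = pd.Series(["H", "M", "L", "L"], name="TYPE")
cat_type = pd.CategoricalDtype(categories=["L", "M", "H"], ordered=True)
codes = example.astype(cat_type).cat.codes  # L=0, M=1, H=2; unexpected values become -1
print(codes.tolist())                       # [2, 1, 0, 0]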
In [8]:
# drop redundant / unneeded columns (kept commented here for reference)
'''
df.drop('UDI', axis=1, inplace=True)
df.drop('TWF', axis=1, inplace=True)
df.drop('HDF', axis=1, inplace=True)
df.drop('PWF', axis=1, inplace=True)
df.drop('OSF', axis=1, inplace=True)
df.drop('RNF', axis=1, inplace=True)
'''
# these failure-mode columns were already excluded by not selecting them in the Snowflake query
Out[8]:
"\ndf.drop('UDI', axis=1, inplace=True)\ndf.drop('TWF', axis=1, inplace=True)\ndf.drop('HDF', axis=1, inplace=True)\ndf.drop('PWF', axis=1, inplace=True)\ndf.drop('OSF', axis=1, inplace=True)\ndf.drop('RNF', axis=1, inplace=True)\n"
In [9]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 5249
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   TYPE                 10000 non-null  int64  
 1   AIR_TEMPERATURE      10000 non-null  float64
 2   PROCESS_TEMPERATURE  10000 non-null  float64
 3   ROTATIONAL_SPEED     10000 non-null  float64
 4   TORQUE               10000 non-null  float64
 5   TOOL_WEAR            10000 non-null  float64
 6   MACHINE_FAILURE      10000 non-null  int8   
dtypes: float64(5), int64(1), int8(1)
memory usage: 556.6 KB
In [10]:
df_num = df.select_dtypes(include = ['float64', 'int64', 'int8'])
df_num.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 5249
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   TYPE                 10000 non-null  int64  
 1   AIR_TEMPERATURE      10000 non-null  float64
 2   PROCESS_TEMPERATURE  10000 non-null  float64
 3   ROTATIONAL_SPEED     10000 non-null  float64
 4   TORQUE               10000 non-null  float64
 5   TOOL_WEAR            10000 non-null  float64
 6   MACHINE_FAILURE      10000 non-null  int8   
dtypes: float64(5), int64(1), int8(1)
memory usage: 556.6 KB
In [11]:
# check for NA values
df_num.isna().any()
Out[11]:
TYPE                   False
AIR_TEMPERATURE        False
PROCESS_TEMPERATURE    False
ROTATIONAL_SPEED       False
TORQUE                 False
TOOL_WEAR              False
MACHINE_FAILURE        False
dtype: bool
In [12]:
# compare per-class feature means to see which features separate normal from failure records

df_num.groupby('MACHINE_FAILURE').mean()
Out[12]:
                     TYPE  AIR_TEMPERATURE  PROCESS_TEMPERATURE  ROTATIONAL_SPEED     TORQUE   TOOL_WEAR
MACHINE_FAILURE
0                0.504917       299.973999           309.995570       1540.260014  39.629655  106.693717
1                0.368732       300.886431           310.290265       1496.486726  50.168142  143.781711
In [13]:
df_normal = df_num[df_num['MACHINE_FAILURE'] == 0]
df_failure = df_num[df_num['MACHINE_FAILURE'] == 1]

total_records = len(df_num.index)
total_normal_records = len(df_normal.index)
total_failure_records = len(df_failure.index)

percent_failure = total_failure_records / total_records

print("Total records in dataset: " + str(total_records))

print("Total normal records in dataset: " + str(total_normal_records))
print("Total failure records in dataset: " + str(total_failure_records))

print("Percent failure records in dataset: " + str(percent_failure))
Total records in dataset: 10000
Total normal records in dataset: 9661
Total failure records in dataset: 339
Percent failure records in dataset: 0.0339
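The same class balance can be read directly from value_counts(); a quick cross-check:

# Equivalent check: share of normal (0) vs failure (1) records.
print(df_num['MACHINE_FAILURE'].value_counts(normalize=True))  # ~0.9661 vs ~0.0339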
In [14]:
# Plot per-class frequency distributions of each feature to see which ones
# separate normal records from failures

sns.set_style("whitegrid")

fig = plt.figure(figsize=(10, 10))
for i, col_name in enumerate(df_num.columns, start=1):
    ax = fig.add_subplot(3, 3, i)
    sns.distplot(df_normal[col_name], label='Normal', kde=True, color='b')
    sns.distplot(df_failure[col_name], label='Failure', kde=True, color='r')
    ax.set_title(col_name)

plt.tight_layout()
handles, labels = ax.get_legend_handles_labels()
fig.legend(handles, labels, loc='upper right')
plt.show()
In [15]:
# calculate the correlation matrix and plot it as a heatmap
corr = df_num.corr()
# optional diverging palette: cmap=sns.diverging_palette(220, 20, as_cmap=True)
corr_heatmap = sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns,
                           annot=True, fmt='.2f', annot_kws={'size': 12})
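As a hedged follow-up to the heatmap, the features can also be ranked by their absolute correlation with the target column:

# Rank features by absolute correlation with MACHINE_FAILURE (uses `corr` from above).
target_corr = corr['MACHINE_FAILURE'].drop('MACHINE_FAILURE')
print(target_corr.abs().sort_values(ascending=False))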
In [16]:
numerical = [
  'AIR_TEMPERATURE', 'PROCESS_TEMPERATURE', 'ROTATIONAL_SPEED', 'TORQUE', 'TOOL_WEAR'
]
categorical = [
  'TYPE'
]

# notched boxplots of each numerical feature, grouped by machine TYPE
fig, ax = plt.subplots(2, 3, figsize=(15, 10))

for var, subplot in zip(numerical, ax.flatten()):
    sns.boxplot(x='TYPE', y=var, data=df_num, ax=subplot, notch=True)
In [17]:
fig2, ax2 = plt.subplots()

sns.boxplot(x="AIR_TEMPERATURE", y="MACHINE_FAILURE", data=df_num, orient="h", palette={1:"red", 0:"blue"}, ax=ax2, notch=True)

ax2.get_yaxis().set_visible(False)

fig2.suptitle("Machine Failure vs Air Temperature")

color_patches = [
    Patch(facecolor="blue", label="no failure"),
    Patch(facecolor="red", label="failure")
]
ax2.legend(handles=color_patches);
In [18]:
df_normal = df_num[df_num['MACHINE_FAILURE'] == 0]
df_failure = df_num[df_num['MACHINE_FAILURE'] == 1]

type_count_normal_series = df_normal['TYPE'].value_counts() 

print( type_count_normal_series )

type_count_fail_series = df_failure['TYPE'].value_counts() 

print( type_count_fail_series )

type_0_ratio = type_count_fail_series.get(0) / type_count_normal_series.get(0)
type_1_ratio = type_count_fail_series.get(1) / type_count_normal_series.get(1)
type_2_ratio = type_count_fail_series.get(2) / type_count_normal_series.get(2)

print( type_0_ratio )
print( type_1_ratio )
print( type_2_ratio )
0    5765
1    2914
2     982
Name: TYPE, dtype: int64
0    235
1     83
2     21
Name: TYPE, dtype: int64
0.040763226366001735
0.02848318462594372
0.021384928716904276
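Note that the ratios above are failure-to-normal counts. The per-TYPE failure rate (failures over all records of that type) can be read off directly with a groupby, as a cross-check:

# Failure rate per machine TYPE, complementary to the failure-to-normal ratios above.
print(df_num.groupby('TYPE')['MACHINE_FAILURE'].mean())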
In [19]:
# compare machine TYPE against MACHINE_FAILURE with a grouped count plot

g = sns.catplot(x="TYPE", hue="MACHINE_FAILURE", data=df_num, kind="count",
                palette={0: "blue", 1: "red"})

# catplot is figure-level and draws on its own figure, so title it via the FacetGrid
g.fig.suptitle("Normal Operations vs Failure: Predictive Maintenance");
In [20]:
df_num.head(200)
Out[20]:
     TYPE  AIR_TEMPERATURE  PROCESS_TEMPERATURE  ROTATIONAL_SPEED  TORQUE  TOOL_WEAR  MACHINE_FAILURE
0       1            298.1                308.6            1551.0    42.8        0.0                0
1       0            298.2                308.7            1408.0    46.3        3.0                0
2       0            298.1                308.5            1498.0    49.4        5.0                0
3       0            298.2                308.6            1433.0    39.5        7.0                0
4       0            298.2                308.7            1408.0    40.0        9.0                0
..    ...              ...                  ...               ...     ...        ...              ...
195     0            298.3                308.5            1453.0    42.9       89.0                0
196     2            298.3                308.6            1432.0    49.5       91.0                0
197     1            298.4                308.6            1527.0    40.9       96.0                0
198     0            298.4                308.7            1588.0    35.0       99.0                0
199     0            298.4                308.6            1433.0    47.1      101.0                0

200 rows × 7 columns

In [21]:
def reduce_C_function_ratio(C):
    """Return the fraction of failures (1s) among the points binned into a hexagon."""
    all_count = len(C)
    one_count = np.sum(C)
    return one_count / float(all_count)


y = df_num['TORQUE']
x = df_num['ROTATIONAL_SPEED']
xmin = x.min()
xmax = x.max()
ymin = y.min()
ymax = y.max()

fig, axs = plt.subplots(ncols=1, figsize=(12, 10))

fig.subplots_adjust(hspace=0.5, left=0.07, right=0.93)
ax = axs
hb = ax.hexbin(x, y, C=df_num['MACHINE_FAILURE'], gridsize=50, cmap='inferno',
               reduce_C_function=reduce_C_function_ratio)
ax.set(xlim=(xmin, xmax), ylim=(ymin, ymax))
ax.set_title("Rotational Speed vs Torque - Ratios")
cb = fig.colorbar(hb, ax=ax)
cb.set_label('Ratio')
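Since MACHINE_FAILURE is a 0/1 indicator, the mean of the binned values already equals the failure ratio, so this custom reducer should match hexbin's default reduce_C_function (np.mean). A tiny sanity check under that assumption:

# Sanity check: for binary C values, the custom reducer and np.mean agree.
sample = np.array([0, 0, 1, 1, 1])
print(reduce_C_function_ratio(sample), np.mean(sample))  # both 0.6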
In [24]:
from itertools import chain, combinations

numerical = [
  'AIR_TEMPERATURE', 'PROCESS_TEMPERATURE', 'ROTATIONAL_SPEED', 'TORQUE', 'TOOL_WEAR'
]
categorical = [
  'TYPE'
]

# all pairwise combinations of the numerical features (5 choose 2 = 10 pairs)
var_combinations = list(combinations(numerical, 2))

# use a 4x3 grid so all 10 pairs fit; the last two axes stay empty
fig, ax = plt.subplots(4, 3, figsize=(25, 25))

for var_c, subplot in zip(var_combinations, ax.flatten()):
    var_a, var_b = var_c

    y = df_num[var_a]
    x = df_num[var_b]
    xmin, xmax = x.min(), x.max()
    ymin, ymax = y.min(), y.max()

    hb = subplot.hexbin(x, y, C=df_num['MACHINE_FAILURE'], gridsize=50, cmap='OrRd',
                        reduce_C_function=reduce_C_function_ratio)
    subplot.set(xlim=(xmin, xmax), ylim=(ymin, ymax))
    subplot.set_title(var_a + " vs " + var_b + " - Ratios")
    cb = fig.colorbar(hb, ax=subplot)
    cb.set_label('Ratio')