!pip install -I numpy==1.19.2 !pip install snowflake-connector-python import warnings warnings.filterwarnings("ignore") !pip install -I pyarrow==5.0.0 # import basic data science libraries import pandas as pd import numpy as np from matplotlib import pyplot as plt import seaborn as sns from matplotlib.patches import Patch from matplotlib.lines import Line2D import snowflake.connector import getpass # using a simpler way to use your login info without embedding it in the notebook # other enterprise connection patterns (e.g., SSO) are in the Snowflake docs: https://docs.snowflake.com/en/user-guide/python-connector-example.html snowflake_username = getpass.getpass("Enter Snowflake Username") snowflake_pwd = getpass.getpass("Enter Snowflake Password") snowflake_acct = 'nna57244.us-east-1' print(snowflake_username) print(snowflake_acct) # Gets the version ctx = snowflake.connector.connect( user=snowflake_username, password=snowflake_pwd, account=snowflake_acct ) cs = ctx.cursor() try: cs.execute("SELECT current_version()") one_row = cs.fetchone() print(one_row[0]) #cs.cursor().execute("USE WAREHOUSE tiny_warehouse_mg") cs.execute("USE DATABASE PREDICTIVE_MAINTENANCE") #cs.execute("SELECT count(TYPE) from RAW_DEVICE_DATA where MACHINE_FAILURE = 1;") #one_row = cs.fetchone() #print("Records with Failures: " + str(one_row[0])) query_output = cs.execute( "select TYPE, AIR_TEMPERATURE, PROCESS_TEMPERATURE, ROTATIONAL_SPEED, TORQUE, TOOL_WEAR, MACHINE_FAILURE from SUMMARY_SENSOR_DATA;" ) df = query_output.fetch_pandas_all() #.to_csv("/path/to/write/table.csv") #df.to_csv("./data/full_snowflake_dataset.csv", index=False) print( df ) finally: cs.close() ctx.close() print( df.info() ) # map categorical variable 'diagnosis' into numeric df["TYPE"] = df["TYPE"].map({'H': 2, 'M': 1, 'L': 0}) df.head() # drop redundant columns / unneeded columns # df.drop('id', axis=1, inplace=True) ''' df.drop('UDI', axis=1, inplace=True) df.drop('TWF', axis=1, inplace=True) df.drop('HDF', axis=1, inplace=True) df.drop('PWF', axis=1, inplace=True) df.drop('OSF', axis=1, inplace=True) df.drop('RNF', axis=1, inplace=True) ''' # however, we took care of dropping these columns by simply not selecting them from snowflake df.info() df_num = df.select_dtypes(include = ['float64', 'int64', 'int8']) print( df_num.info() ) # check for NA values df_num.isna().any() # look at the mean of different features for both the classes to see which features capture the variability df_num.groupby('MACHINE_FAILURE').mean() df_normal = df_num[df_num['MACHINE_FAILURE'] == 0] df_failure = df_num[df_num['MACHINE_FAILURE'] == 1] total_records = len(df_num.index) total_normal_records = len(df_normal.index) total_failure_records = len(df_failure.index) percent_failure = total_failure_records / total_records print("Total records in dataset: " + str(total_records)) print("Total normal records in dataset: " + str(total_normal_records)) print("Total failure records in dataset: " + str(total_failure_records)) print("Percent failure records in dataset: " + str(percent_failure)) # Plot frequency distributions of the features to visualize which features capture the most variability fig = plt.figure(figsize=(10,20)) for i,col_name in enumerate(list(df_num.columns[0:30])): i +=1 ax = fig.add_subplot(10,3,i) sns.distplot(df_normal[col_name], label = 'Normal', kde = True, color= 'b') sns.distplot(df_failure[col_name], label= 'Failure', kde = True, color= 'r') ax.set_title(col_name) sns.set_style("whitegrid") plt.tight_layout() handles, labels = ax.get_legend_handles_labels() fig.legend(handles, labels, loc='upper right') plt.show() # calculate correlation matrix corr = df_num.corr()# plot the heatmap # , cmap=sns.diverging_palette(220, 20, as_cmap=True) corr_heatmap = sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, fmt='.2f', annot_kws={'size': 12}) numerical = [ 'AIR_TEMPERATURE', 'PROCESS_TEMPERATURE', 'ROTATIONAL_SPEED', 'TORQUE', 'TOOL_WEAR' ] categorical = [ 'TYPE' ] #housing = housing[numerical + categorical] #housing.shape fig, ax = plt.subplots(2, 3, figsize=(15, 10)) #for var, subplot in zip(categorical, ax.flatten()): # sns.boxplot(x='MACHINE_FAILURE', y='MACHINE_FAILURE', data=housing, ax=subplot) # sns.boxplot(x='MACHINE_FAILURE', y='AIR_TEMPERATURE', data=df_num, ax=subplot) for var, subplot in zip(numerical, ax.flatten()): sns.boxplot(x='TYPE', y=var, data=df_num, ax=subplot, notch=True) fig2, ax2 = plt.subplots() sns.boxplot(x="AIR_TEMPERATURE", y="MACHINE_FAILURE", data=df_num, orient="h", palette={1:"red", 0:"blue"}, ax=ax2, notch=True) ax2.get_yaxis().set_visible(False) fig2.suptitle("Machine Failure vs Air Temperature") color_patches = [ Patch(facecolor="blue", label="no failure"), Patch(facecolor="red", label="failure") ] ax2.legend(handles=color_patches); df_normal = df_num[df_num['MACHINE_FAILURE'] == 0] df_failure = df_num[df_num['MACHINE_FAILURE'] == 1] type_count_normal_series = df_normal['TYPE'].value_counts() print( type_count_normal_series ) type_count_fail_series = df_failure['TYPE'].value_counts() print( type_count_fail_series ) type_0_ratio = type_count_fail_series.get(0) / type_count_normal_series.get(0) type_1_ratio = type_count_fail_series.get(1) / type_count_normal_series.get(1) type_2_ratio = type_count_fail_series.get(2) / type_count_normal_series.get(2) print( type_0_ratio ) print( type_1_ratio ) print( type_2_ratio ) # need to account for the [ Type, Machine failure ] comparison fig3, ax3 = plt.subplots() sns.catplot(x="TYPE", hue="MACHINE_FAILURE", data=df_num, kind="count", palette={0:"blue", 1:"red"}, ax=ax3) plt.close(1) # catplot creates an extra figure we don't need #ax.legend(title="Machine Performance") #ax.set_xticklabels(["Failure", "Normal"]) #ax.set_xlabel("Machine Type") fig3.suptitle("Normal Operations vs Failure: Predictive Maintenance"); df_num.head(200) def reduce_C_function_ratio(C): all_count = len(C) one_count = np.sum(C) return one_count / float(all_count); y = df_num['TORQUE'] x = df_num['ROTATIONAL_SPEED'] xmin = x.min() xmax = x.max() ymin = y.min() ymax = y.max() fig, axs = plt.subplots(ncols=1, figsize=(12, 10)) #sharey=True, fig.subplots_adjust(hspace=0.5, left=0.07, right=0.93) ax = axs hb = ax.hexbin(x, y, C=df_num['MACHINE_FAILURE'], gridsize=50, cmap='inferno', reduce_C_function=reduce_C_function_ratio) #np.sum) ax.set(xlim=(xmin, xmax), ylim=(ymin, ymax)) ax.set_title("Rotational Speed vs Torque - Ratios") cb = fig.colorbar(hb, ax=ax) cb.set_label('Ratio') from itertools import chain, combinations numerical = [ 'AIR_TEMPERATURE', 'PROCESS_TEMPERATURE', 'ROTATIONAL_SPEED', 'TORQUE', 'TOOL_WEAR' ] categorical = [ 'TYPE' ] fig, ax = plt.subplots(3, 3, figsize=(25, 20)) var_combinations = [(comb) for comb in combinations(numerical, 2)] for var_c, subplot in zip(var_combinations, ax.flatten()): var_a, var_b = var_c #print( var_a) y = df_num[var_a] x = df_num[var_b] xmin = x.min() xmax = x.max() ymin = y.min() ymax = y.max() #fig, axs = plt.subplots(ncols=1, figsize=(12, 10)) #sharey=True, #fig.subplots_adjust(hspace=0.5, left=0.07, right=0.93) ax = subplot hb = ax.hexbin(x, y, C=df_num['MACHINE_FAILURE'], gridsize=50, cmap='OrRd', reduce_C_function=reduce_C_function_ratio) #np.sum) ax.set(xlim=(xmin, xmax), ylim=(ymin, ymax)) ax.set_title(var_a + " vs " + var_b + " - Ratios") cb = fig.colorbar(hb, ax=ax) cb.set_label('Ratio') #sns.boxplot(x='TYPE', y=var, data=df_num, ax=subplot, notch=True)