#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

afs = pd.read_csv('aps_failure_set.csv')


# In[2]:

afs.head()


# In[3]:

afs.tail()


# In[4]:

# Keep only the non-"neg" (failure) rows and save them for later reuse.
df = pd.read_csv('aps_failure_set.csv')
df_filtered = df[df['class'] != 'neg']
df_filtered.to_csv('afs-filtered.csv', index=False)


# In[5]:

df_filtered.info()


# In[6]:

df_filtered.describe()


# In[7]:

df_filtered.shape


# In[8]:

df_filtered.dtypes


# In[9]:

df_filtered['aa_000'] = df_filtered['aa_000'].astype('float64')


# In[10]:

df_filtered.dtypes


# In[11]:

print(df_filtered.columns)


# In[12]:

# Find the indices of any columns named 'aa_000' (a duplicate-column check).
duplicate_column_indices = [i for i, col in enumerate(afs.columns) if col == 'aa_000']
print(duplicate_column_indices)


# In[13]:

print('az_003')
df_filtered['az_003'].head(10)


# In[14]:

# Normalize the various spellings of "missing" to np.nan.
missing_value_formats = ["n.a.", "?", "NA", "n/a", "na"]
df_filtered.replace(missing_value_formats, np.nan, inplace=True)
print(df_filtered['ac_000'].head(10))


# In[15]:

df_filtered.isnull().values.any()


# In[16]:

# Percentage of missing values per column.
(df_filtered.isnull().sum() / len(df_filtered)) * 100


# In[17]:

df_filtered.isnull().sum()


# In[18]:

pd.set_option('display.max_rows', None)
missing_values = df_filtered.isna().sum()
print(missing_values)


# In[19]:

# Note: this restarts from the full, unfiltered frame and drops rows with NaN.
df_filtered = afs.dropna(axis=0)
df_filtered.isnull().values.any()


# In[20]:

print(df_filtered.columns)
print(any(df_filtered.columns.duplicated()))
df_filtered.shape


# In[36]:

sorted_missing_values = missing_values.sort_values(ascending=False)
print(sorted_missing_values)


# In[37]:

total_rows = df_filtered.shape[0]
missing_values_percentage = (missing_values / total_rows) * 100
sorted_missing_values_percentage = missing_values_percentage.sort_values(ascending=False)
print(sorted_missing_values_percentage)


# In[24]:

print(df_filtered.columns)


# In[38]:

# Columns with more than 30% missing values (i.e. under 70% usable data).
cols_to_remove = [
    "cr_000", "ab_000", "ad_000", "cg_000", "ch_000", "cf_000", "co_000",
    "cv_000", "da_000", "cz_000", "cy_000", "ak_000", "cx_000", "cu_000",
    "ct_000", "db_000", "dc_000", "ca_000", "dg_000", "di_000", "dj_000",
    "dk_000", "dl_000", "dm_000", "dh_000", "df_000", "eb_000", "ac_000",
    "de_000", "dp_000", "cp_000", "bc_000", "bd_000", "ef_000", "bz_000",
    "do_000", "eg_000", "dq_000", "ar_000", "ea_000", "dz_000", "dx_000",
    "dv_000", "du_000", "dt_000", "dy_000", "dr_000", "ds_000", "ce_000",
    "av_000", "dd_000", "bf_000", "ae_000", "be_000", "af_000", "ax_000",
    "ec_00", "cm_000", "ed_000", "cl_000"
]
df_filtered.drop(columns=cols_to_remove, inplace=True)
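# As a cross-check, the same cut can be derived programmatically rather than
# by hand. This is a minimal sketch, assuming the 70%-usable rule described in
# the next cell and the missing_values_percentage series computed in cell 37;
# the threshold variable and this cell are mine, not part of the original
# notebook.

# In[ ]:

# Columns where more than 30% of values are missing (under 70% usable data).
threshold = 30.0  # assumed cutoff, matching the 70%-usable rule below
cols_over_threshold = missing_values_percentage[missing_values_percentage > threshold].index.tolist()
print(sorted(cols_over_threshold))
# This list should roughly match cols_to_remove above; any difference flags a
# column that was dropped (or kept) by hand for some other reason.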
# # I have decided that anything with less than 70% readable data is not usable, so I will cut anything that is under 70% usable data.
# This includes:
#
# br_000
# bq_000
# bp_000
# bo_000
# ab_000
# cr_000
# bn_000
# bm_000
# bl_000
# bk_000
# ad_000
# ch_000
# co_000
# cf_000
# cg_000
# db_000
# ct_000
# cu_000
# cv_000
# cx_000
# cy_000
# cz_000
# da_000
# dc_000
# ec_00
# cm_000
# ed_000
# cl_000

# In[39]:

# Restart from the saved filter: keep only the non-"neg" rows.
df_filtered = df[df['class'] != 'neg']


# In[46]:

df_filtered.head(50)


# In[50]:

print(df_filtered.head().columns)


# In[51]:

df_filtered.shape


# In[52]:

from sklearn.preprocessing import MinMaxScaler

# Replace missing-value markers, impute column means, then scale to [0, 1].
missing_value_formats = ["n.a.", "?", "NA", "n/a", "na"]
df_filtered.replace(missing_value_formats, np.nan, inplace=True)
df_filtered.fillna(df_filtered.mean(numeric_only=True), inplace=True)

scaler = MinMaxScaler()
numeric_columns = df_filtered.select_dtypes(include=[np.number]).columns
df_filtered[numeric_columns] = scaler.fit_transform(df_filtered[numeric_columns])


# In[53]:

(df_filtered['class'].head(), df_filtered['class'].value_counts(normalize=True))


# In[54]:

df_filtered.describe(include=object)


# In[55]:

print(df_filtered)


# In[56]:

df_filtered.shape


# In[57]:

df_filtered.info()


# In[58]:

df_filtered.describe()


# In[59]:

df_filtered.isnull().values.any()


# In[60]:

df_filtered.isnull().sum()


# In[61]:

df_filtered["class"].value_counts().sort_index()


# In[62]:

df_filtered = df_filtered[df_filtered['class'] != 'neg']


# In[63]:

df_filtered["class"].value_counts().sort_index()


# In[64]:

df_filtered.info()


# In[65]:

df_filtered.count()


# In[122]:

# Keep only rows with at least one non-zero feature value.
df_filtered = df_filtered[df_filtered.drop("class", axis=1).replace(0, np.nan).notna().any(axis=1)]


# In[123]:

from sklearn.impute import SimpleImputer

# Numeric feature matrix without the label column. (The original cell used
# df_filtered_encoded without defining it; this definition is an assumption.)
df_filtered_encoded = df_filtered.drop("class", axis=1).apply(pd.to_numeric, errors='coerce')

imputer = SimpleImputer(strategy='mean')
df_filtered_encoded_imputed = imputer.fit_transform(df_filtered_encoded)


# In[124]:

sns.boxplot(x=df_filtered["aa_000"])


# In[132]:

# Coerce each column to numeric and draw one boxplot per column.
boxplot_cols = [
    'ab_000', 'ag_000', 'ag_002', 'ag_003', 'ag_004', 'ag_005', 'ag_007',
    'ee_000', 'ee_001', 'ee_002', 'ee_003',
]
for col in boxplot_cols:
    df_filtered[col] = pd.to_numeric(df_filtered[col], errors='coerce')
    plt.figure()
    sns.boxplot(x=df_filtered[col])
    plt.show()


# In[144]:

# Interquartile range of each numeric column.
Q1 = df_filtered.quantile(0.25, numeric_only=True)
Q3 = df_filtered.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1
print(IQR)


# In[145]:

df_filtered.count()


# In[146]:

# Note: this assigns to df, leaving df_filtered unchanged.
df = df_filtered.dropna()


# In[147]:

df_filtered.count()


# In[148]:

df_filtered.shape


# In[149]:

# Correlation heatmap over the numeric columns.
plt.figure(figsize=(40, 20))
c = df_filtered.corr(numeric_only=True)
sns.heatmap(c, cmap="BrBG", annot=True)
c
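# With this many numeric columns, the annotated heatmap above is hard to read.
# As a minimal sketch (assuming the correlation frame c from the previous
# cell; this pair-extraction cell is mine, not the original notebook's), the
# strongest pairwise correlations can be listed directly:

# In[ ]:

# Take the upper triangle so each pair appears once, then rank by |r|.
upper = c.where(np.triu(np.ones(c.shape, dtype=bool), k=1))
top_pairs = upper.stack().abs().sort_values(ascending=False).head(20)
print(top_pairs)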
# In[150]:

colnames = [
    'aa_000', 'ag_001', 'ag_002', 'ag_003', 'ag_004', 'ag_005', 'ag_006',
    'ag_007', 'ag_008', 'ag_009', 'ah_000', 'ai_000', 'al_000', 'am_0',
    'an_000', 'ao_000', 'ap_000', 'aq_000', 'at_000', 'ay_006', 'ay_007',
    'ay_008', 'ay_009', 'az_000', 'az_001', 'az_002', 'az_003', 'az_004',
    'az_005', 'az_006', 'az_007', 'az_008', 'ba_000', 'ba_001', 'ba_002',
    'ba_003', 'ba_004', 'ba_005', 'ba_006', 'ba_007', 'ba_008', 'ba_009',
    'bb_000', 'bg_000', 'bh_000', 'bi_000', 'bj_000', 'bs_000', 'bt_000',
    'bu_000', 'bv_000', 'by_000', 'cb_000', 'ci_000', 'cj_000', 'ck_000',
    'cn_000', 'cn_001', 'cn_002', 'cn_003', 'cn_004', 'cn_005', 'cn_006',
    'cn_007', 'cn_008', 'cn_009', 'cq_000', 'cs_000', 'cs_001', 'cs_002',
    'cs_003', 'cs_004', 'cs_005', 'cs_006', 'cs_007', 'cs_008', 'dn_000',
    'ee_000', 'ee_001', 'ee_002', 'ee_003', 'ee_004', 'ee_005', 'ee_006',
    'ee_007', 'ee_008',
]


# In[151]:

print(df_filtered.columns)


# In[167]:

pd.set_option('display.max_columns', None)


# In[159]:

from sklearn.decomposition import PCA

# Cumulative explained variance across all principal components.
pca = PCA().fit(df_filtered_encoded_imputed)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()


# In[177]:

# Feature matrix without the label column. (The original cell used
# data_no_label without defining it; this definition is an assumption.)
data_no_label = pd.DataFrame(df_filtered_encoded_imputed,
                             columns=df_filtered_encoded.columns)
data_no_label = data_no_label.dropna()


# In[178]:

# Project the features onto the first four principal components.
pca = PCA(4)
projected = pca.fit_transform(data_no_label)


# In[179]:

non_numeric_cols = df_filtered.select_dtypes(exclude=[np.number]).columns
print(df_filtered[non_numeric_cols].sample(5))


# In[180]:

df_filtered.describe()


# In[181]:

projected.shape


# In[182]:

plt.scatter(projected[:, 0], projected[:, 1])
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA Result: First Two Principal Components')
plt.show()


# In[194]:

# Take the first 76 labels to match the number of projected rows.
# (This only lines up if projected happens to have exactly 76 rows.)
y = df_filtered["class"].values[:76]


# In[195]:

from sklearn.model_selection import train_test_split

X = projected
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, y, test_size=0.20, random_state=1)


# In[191]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression()
model.fit(X_train, Y_train)

Y_pred = model.predict(X_validation)
accuracy = accuracy_score(Y_validation, Y_pred)
print("Accuracy:", accuracy)
print(classification_report(Y_validation, Y_pred))
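# Slicing y to the first 76 values only lines up with projected by coincidence
# of row counts. A safer pattern is to split the label from the features once,
# after all row filtering, so every impute/transform step keeps rows aligned.
# This cell is a minimal sketch under that assumption, not part of the original
# notebook; it reuses the imports above and presumes the frame still contains
# both classes (a single-class frame cannot train a classifier).

# In[ ]:

# Split label from features once, after all row filtering is done.
features = df_filtered.drop(columns="class").apply(pd.to_numeric, errors='coerce')
labels = df_filtered["class"]

# Impute and project; row order is preserved, so labels stay aligned with X.
X_aligned = PCA(4).fit_transform(SimpleImputer(strategy='mean').fit_transform(features))
X_tr, X_val, y_tr, y_val = train_test_split(X_aligned, labels.values,
                                            test_size=0.20, random_state=1)

# max_iter raised only to help convergence on the scaled features.
clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
print(classification_report(y_val, clf.predict(X_val)))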