#!/usr/bin/env python
# coding: utf-8

# In[34]:

import pandas as pd
import matplotlib.pyplot as plt  # visualisation
import seaborn as sns  # visualisation
import numpy as np

# Step One - Data Wrangling
#
# 1.1 Data Loading
#
# Here I load the dataset. To save myself from typing 'aps_failure_set.csv' every single
# time, I give the DataFrame the simplified name 'afs'. Line 1 below loads an untouched
# reference copy called 'data', while line 2 loads the working copy 'afs' that I will
# clean; keeping both lets me refer back to the original rows later.

# In[35]:

data = pd.read_csv('aps_failure_set.csv')
afs = pd.read_csv('aps_failure_set.csv')

# # Exploratory Analysis
# I am gathering some very basic information on my dataset so I know what I am dealing
# with.

# In[36]:

afs.shape

# afs.shape tells me I am dealing with a dataset that has 60,000 rows and 171 columns.
# I will also use afs.describe(include=object) to get some basic statistics on the
# object columns. This is useful for the following reasons:
#
# - Count shows me how many non-null entries each column has.
# - Unique shows me how many distinct values each column contains.
# - Top shows me the most frequent value in each column.
# - Freq shows me how often that most frequent value appears.

# In[37]:

afs.head(5)

# Now I begin to view the data. afs.head(5) gives me the first 5 rows of the data.
#
# This allows me to get an understanding of what I am actually dealing with. It is a
# good way to sanity-check the values before I start cleaning.

# In[38]:

afs.tail(5)

# In[39]:

afs.shape

# In[40]:

afs.info()

# In[41]:

afs.columns

# In[42]:

null_values = afs.isnull().sum()

# Checking the data types and summary statistics.

# In[43]:

afs.describe()

# In[44]:

nan_afs = afs.isna()
print(nan_afs)

# In[45]:

nan_rows = afs.isna().any(axis=1)
print(nan_rows)

# In[46]:

# Note: dropna() only removes real NaN values, so this leaves the data unchanged here,
# because the missing entries in this file are stored as the literal string 'na'.
clean_afs = afs.dropna()
print(clean_afs)

# I have noticed some data points say 'na'. I have counted how many are like this using
# code I found at this source:
# https://saturncloud.io/blog/how-to-find-all-rows-with-nan-values-in-python-pandas/
#
# I will also define new column names later, as the current naming of the sensor
# columns is confusing. Source:

# In[47]:

print(afs.columns)

# In[48]:

afs.dtypes

# In[58]:

def missing_values_table(afs):
    # Count the literal string 'na' per column, since that is how this file
    # encodes missing values (isnull() cannot see them).
    mis_val = afs.apply(lambda x: x[x == 'na'].count(), axis=0)
    mis_val_percent = 100 * mis_val / len(afs)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'}
    )
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0
    ].sort_values('% of Total Values', ascending=False).round(1)
    print("Your selected dataframe has " + str(afs.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")
    return mis_val_table_ren_columns

missing_values_table(afs)

# Add in source for the above:
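# The table above counts the literal string 'na' because that is how this dataset
# encodes missing values, which is also why isnull() found nothing earlier. Below is a
# minimal sketch (not applied to afs, and 'afs_nan' is just an illustrative name) of
# how I could convert those strings to real NaN so that pandas' missing-value tools can
# see them.

# In[ ]:

# Sketch: convert 'na' strings to NaN on a copy; an equivalent option would be
# pd.read_csv('aps_failure_set.csv', na_values='na') at load time.
afs_nan = afs.replace('na', np.nan)
print(afs_nan.isna().sum().sort_values(ascending=False).head(10))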
# I have decided that anything with less than 60% readable data is not usable, so I
# will cut any column where more than 40% of the values are missing. This includes:
#
# br_000
# bq_000
# bp_000
# bo_000
# ab_000
# cr_000
# bn_000
# bm_000
# bl_000
# bk_000

# In[60]:

afs.drop(columns=['br_000', 'bq_000', 'bp_000', 'bo_000', 'ab_000',
                  'cr_000', 'bn_000', 'bm_000', 'bl_000', 'bk_000'], inplace=True)

# In[62]:

print(afs.columns)

# Source for the drop columns formula:
# https://www.freecodecamp.org/news/dataframe-drop-column-in-pandas-how-to-remove-columns-from-dataframes/
#
# Source for

# In[57]:

pd.set_option('display.max_rows', None)

# Defining more readable names for the sensor columns. Note that the dictionary on its
# own does not change the DataFrame; it would still need to be applied with
# afs.rename(columns=new_column_names), and rename() simply ignores labels such as
# ab_000 that have already been dropped.

# In[56]:

new_column_names = {
    "aa_000": "sensor1_reading",
    "ab_000": "sensor2_reading",
    "ac_000": "sensor3_reading",
    "ad_000": "sensor4_reading",
    "ae_000": "sensor5_reading",
    "af_000": "sensor6_reading",
    "ag_000": "sensor7_reading",
    "ag_001": "sensor8_reading",
    "ag_002": "sensor9_reading",
    "ee_002": "sensor10_reading",
    "ee_003": "sensor11_reading",
    "ee_004": "sensor12_reading",
    "ee_005": "sensor13_reading",
    "ee_006": "sensor14_reading",
    "ee_007": "sensor15_reading",
    "ee_008": "sensor16_reading",
    "ee_009": "sensor17_reading",
    "ef_000": "sensor18_reading",
    "eg_000": "sensor19_reading"
}

# I have noticed my first issue with the data: I have 59,000 data points for negative
# and only 1,000 for positive. The class column contains two attributes, negative (neg)
# and positive (pos), which I can change to numerical values so I can count them
# (neg=0, pos=1). Source:
# https://inria.github.io/scikit-learn-mooc/python_scripts/03_categorical_pipeline.html
#
# As the 'neg' rows are not applicable to this project, I will remove them from the
# dataset before I explore any further. "The dataset's positive class consists of
# component failures for a specific component of the APS system. The negative class
# consists of trucks with failures for components not related to the APS." This data is
# unrelated and therefore not useful for my project. First, I will drop the 'neg' rows.
#
# Code source:
# https://www.w3docs.com/snippets/python/deleting-dataframe-row-in-pandas-based-on-column-value.html

# In[42]:

afs = afs.drop(afs[afs['class'] == 'neg'].index)

# In[43]:

afs.describe(include=object)

# This confirms that all 'neg' values have been dropped. Source: see method 2, 'Using
# the drop function':
# https://saturncloud.io/blog/how-to-remove-rows-with-specific-values-in-pandas-dataframe/

# In[44]:

afs.describe(include=object)
print(afs)

# Above I have run describe(include=object) again to confirm that the 'class' column
# now has one unique value ('pos') instead of two. Source:
# https://www.w3docs.com/snippets/python/deleting-dataframe-row-in-pandas-based-on-column-value.html
#
# I still need to deal with the 'na' entries in the remaining sensor columns. Below I
# will experiment with different strategies to do this. First, I will explore the
# classification side of the dataset; for this project I will treat it as a
# classification problem.

# In[45]:

afs.shape

# In[46]:

# Requesting basic info on the original dataset
data.info()

# Basic statistical information on the dataset.

# In[47]:

data.describe()

# I am checking the data for blank entries. This is important because most models
# cannot handle missing values directly, so any gaps have to be found and dealt with
# before training.

# In[48]:

afs.isnull().sum()

# In[49]:

# This prints False, but only because the missing entries are the string 'na',
# not real NaN values.
print(afs.isnull().values.any())

# In[50]:

afs["class"].value_counts().sort_index()

# This confirms that afs now contains only the 1,000 'pos' rows; the 59,000 'neg' rows
# noted earlier are gone. The numerical encoding of the labels (neg=0, pos=1) still has
# to be applied explicitly; a sketch of this is below. Source:
# https://inria.github.io/scikit-learn-mooc/python_scripts/03_categorical_pipeline.html
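# The label encoding above is only described, never applied, so here is a minimal
# sketch of it. I use the untouched 'data' copy, since afs only contains 'pos' rows at
# this point, and 'class_numeric' is just an illustrative name.

# In[ ]:

# Sketch: encode 'neg'/'pos' as 0/1 on the untouched copy, then count each class.
class_numeric = data['class'].map({'neg': 0, 'pos': 1})
print(class_numeric.value_counts().sort_index())  # expect about 59,000 zeros and 1,000 ones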
# In[51]:

# Displaying the first few rows of the 'class' column and its distribution in the
# original data (afs itself no longer contains 'neg' rows)
(data['class'].head(), data['class'].value_counts(normalize=True))

# This confirms the class imbalance: roughly 98% of the original rows are 'neg' and
# only about 2% are 'pos'. The data types themselves are valid, but the dataset is not
# yet 'complete': the missing values are hidden as 'na' strings rather than NaN, so
# they still need to be handled before modelling.
#
# Now I will begin to visualise my data using seaborn.
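# As a first visualisation, a minimal sketch of a seaborn count plot showing the class
# imbalance in the original data (the styling choices here are just one option):

# In[ ]:

# Sketch: bar chart of the 'neg'/'pos' split using seaborn.
sns.countplot(x='class', data=data)
plt.title('Class distribution in the original APS data')
plt.xlabel('class label')
plt.ylabel('number of rows')
plt.show()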