#!/usr/bin/env python
# coding: utf-8

# In[34]:

import pandas as pd
import matplotlib.pyplot as plt  # visualisation
import seaborn as sns  # visualisation
import numpy as np

# Step One - Data Wrangling
#
# 1.1 Data Loading
#
# Here I load the dataset. To save myself from typing 'aps_failure_set.csv' every single
# time, I give the DataFrame the simplified name 'afs'. Line 1 below loads an untouched
# reference copy called 'data', while line 2 loads the working copy 'afs' that I will
# clean; keeping both lets me refer back to the original rows later.

# In[35]:

data = pd.read_csv('aps_failure_set.csv')
afs = pd.read_csv('aps_failure_set.csv')

# # Exploratory Analysis
# I am gathering some very basic information on my dataset so I know what I am dealing
# with.

# In[36]:

afs.shape

# afs.shape tells me I am dealing with a dataset that has 60,000 rows and 171 columns.
# I will also use afs.describe(include=object) to get some basic statistics on the
# object columns. This is useful for the following reasons:
#
# - Count shows me how many non-null entries each column has.
# - Unique shows me how many distinct values each column contains.
# - Top shows me the most frequent value in each column.
# - Freq shows me how often that most frequent value appears.

# In[37]:

afs.head(5)

# Now I begin to view the data. afs.head(5) gives me the first 5 rows of the data.
#
# This allows me to get an understanding of what I am actually dealing with. It is a
# good way to sanity-check the values before I start cleaning.

# In[38]:

afs.tail(5)

# In[39]:

afs.shape

# In[40]:

afs.info()

# In[41]:

afs.columns

# In[42]:

null_values = afs.isnull().sum()

# Checking the data types and summary statistics.

# In[43]:

afs.describe()

# In[44]:

nan_afs = afs.isna()
print(nan_afs)

# In[45]:

nan_rows = afs.isna().any(axis=1)
print(nan_rows)

# In[46]:

# Note: dropna() only removes real NaN values, so this leaves the data unchanged here,
# because the missing entries in this file are stored as the literal string 'na'.
clean_afs = afs.dropna()
print(clean_afs)

# I have noticed some data points say 'na'. I have counted how many are like this using
# code I found at this source:
# https://saturncloud.io/blog/how-to-find-all-rows-with-nan-values-in-python-pandas/
#
# I will also define new column names later, as the current naming of the sensor
# columns is confusing. Source:

# In[47]:

print(afs.columns)

# In[48]:

afs.dtypes

# In[58]:

def missing_values_table(afs):
    # Count the literal string 'na' per column, since that is how this file
    # encodes missing values (isnull() cannot see them).
    mis_val = afs.apply(lambda x: x[x == 'na'].count(), axis=0)
    mis_val_percent = 100 * mis_val / len(afs)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'}
    )
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0
    ].sort_values('% of Total Values', ascending=False).round(1)
    print("Your selected dataframe has " + str(afs.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")
    return mis_val_table_ren_columns

missing_values_table(afs)

# Add in source for the above:
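# The table above counts the literal string 'na' because that is how this dataset
# encodes missing values, which is also why isnull() found nothing earlier. Below is a
# minimal sketch (not applied to afs, and 'afs_nan' is just an illustrative name) of
# how I could convert those strings to real NaN so that pandas' missing-value tools can
# see them.

# In[ ]:

# Sketch: convert 'na' strings to NaN on a copy; an equivalent option would be
# pd.read_csv('aps_failure_set.csv', na_values='na') at load time.
afs_nan = afs.replace('na', np.nan)
print(afs_nan.isna().sum().sort_values(ascending=False).head(10))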
# I have decided that anything with less than 60% readable data is not usable, so I
# will cut any column where more than 40% of the values are missing. This includes:
#
# br_000
# bq_000
# bp_000
# bo_000
# ab_000
# cr_000
# bn_000
# bm_000
# bl_000
# bk_000

# In[60]:

afs.drop(columns=['br_000', 'bq_000', 'bp_000', 'bo_000', 'ab_000',
                  'cr_000', 'bn_000', 'bm_000', 'bl_000', 'bk_000'], inplace=True)

# In[62]:

print(afs.columns)

# Source for the drop columns formula:
# https://www.freecodecamp.org/news/dataframe-drop-column-in-pandas-how-to-remove-columns-from-dataframes/
#
# Source for

# In[57]:

pd.set_option('display.max_rows', None)

# Defining more readable names for the sensor columns. Note that the dictionary on its
# own does not change the DataFrame; it would still need to be applied with
# afs.rename(columns=new_column_names), and rename() simply ignores labels such as
# ab_000 that have already been dropped.

# In[56]:

new_column_names = {
    "aa_000": "sensor1_reading",
    "ab_000": "sensor2_reading",
    "ac_000": "sensor3_reading",
    "ad_000": "sensor4_reading",
    "ae_000": "sensor5_reading",
    "af_000": "sensor6_reading",
    "ag_000": "sensor7_reading",
    "ag_001": "sensor8_reading",
    "ag_002": "sensor9_reading",
    "ee_002": "sensor10_reading",
    "ee_003": "sensor11_reading",
    "ee_004": "sensor12_reading",
    "ee_005": "sensor13_reading",
    "ee_006": "sensor14_reading",
    "ee_007": "sensor15_reading",
    "ee_008": "sensor16_reading",
    "ee_009": "sensor17_reading",
    "ef_000": "sensor18_reading",
    "eg_000": "sensor19_reading"
}

# I have noticed my first issue with the data: I have 59,000 data points for negative
# and only 1,000 for positive. The class column contains two attributes, negative (neg)
# and positive (pos), which I can change to numerical values so I can count them
# (neg=0, pos=1). Source:
# https://inria.github.io/scikit-learn-mooc/python_scripts/03_categorical_pipeline.html
#
# As the 'neg' rows are not applicable to this project, I will remove them from the
# dataset before I explore any further. "The dataset's positive class consists of
# component failures for a specific component of the APS system. The negative class
# consists of trucks with failures for components not related to the APS." This data is
# unrelated and therefore not useful for my project. First, I will drop the 'neg' rows.
#
# Code source:
# https://www.w3docs.com/snippets/python/deleting-dataframe-row-in-pandas-based-on-column-value.html

# In[42]:

afs = afs.drop(afs[afs['class'] == 'neg'].index)

# In[43]:

afs.describe(include=object)

# This confirms that all 'neg' values have been dropped. Source: see method 2, 'Using
# the drop function':
# https://saturncloud.io/blog/how-to-remove-rows-with-specific-values-in-pandas-dataframe/

# In[44]:

afs.describe(include=object)
print(afs)

# Above I have run describe(include=object) again to confirm that the 'class' column
# now has one unique value ('pos') instead of two. Source:
# https://www.w3docs.com/snippets/python/deleting-dataframe-row-in-pandas-based-on-column-value.html
#
# I still need to deal with the 'na' entries in the remaining sensor columns. Below I
# will experiment with different strategies to do this. First, I will explore the
# classification side of the dataset; for this project I will treat it as a
# classification problem.

# In[45]:

afs.shape

# In[46]:

# Requesting basic info on the original dataset
data.info()

# Basic statistical information on the dataset.

# In[47]:

data.describe()

# I am checking the data for blank entries. This is important because most models
# cannot handle missing values directly, so any gaps have to be found and dealt with
# before training.

# In[48]:

afs.isnull().sum()

# In[49]:

# This prints False, but only because the missing entries are the string 'na',
# not real NaN values.
print(afs.isnull().values.any())

# In[50]:

afs["class"].value_counts().sort_index()

# This confirms that afs now contains only the 1,000 'pos' rows; the 59,000 'neg' rows
# noted earlier are gone. The numerical encoding of the labels (neg=0, pos=1) still has
# to be applied explicitly; a sketch of this is below. Source:
# https://inria.github.io/scikit-learn-mooc/python_scripts/03_categorical_pipeline.html
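# The label encoding above is only described, never applied, so here is a minimal
# sketch of it. I use the untouched 'data' copy, since afs only contains 'pos' rows at
# this point, and 'class_numeric' is just an illustrative name.

# In[ ]:

# Sketch: encode 'neg'/'pos' as 0/1 on the untouched copy, then count each class.
class_numeric = data['class'].map({'neg': 0, 'pos': 1})
print(class_numeric.value_counts().sort_index())  # expect about 59,000 zeros and 1,000 ones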
# In[51]:

# Displaying the first few rows of the 'class' column and its distribution in the
# original data (afs itself no longer contains 'neg' rows)
(data['class'].head(), data['class'].value_counts(normalize=True))

# This confirms the class imbalance: roughly 98% of the original rows are 'neg' and
# only about 2% are 'pos'. The data types themselves are valid, but the dataset is not
# yet 'complete': the missing values are hidden as 'na' strings rather than NaN, so
# they still need to be handled before modelling.
#
# Now I will begin to visualise my data using seaborn.
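# As a first visualisation, a minimal sketch of a seaborn count plot showing the class
# imbalance in the original data (the styling choices here are just one option):

# In[ ]:

# Sketch: bar chart of the 'neg'/'pos' split using seaborn.
sns.countplot(x='class', data=data)
plt.title('Class distribution in the original APS data')
plt.xlabel('class label')
plt.ylabel('number of rows')
plt.show()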