#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')
from datetime import datetime, date


# In[2]:


# Loading the Customer Address data from the Excel file
cust_address = pd.read_excel('Raw_data.xlsx', sheet_name='CustomerAddress')


# In[3]:


# Checking the first 5 records of the Customer Address data
cust_address.head(5)


# In[4]:


# Information on the columns and data types of the Customer Address data
cust_address.info()


# The data types of the columns look fine. Let's check the data quality and apply data cleaning wherever applicable before performing any analysis.

# ## Total Records

# In[5]:


print("Total records (rows) in the dataset : {}".format(cust_address.shape[0]))
print("Total columns (features) in the dataset : {}".format(cust_address.shape[1]))


# ## Numeric Columns and Non-Numeric Columns

# In[6]:


# select numeric columns
df_numeric = cust_address.select_dtypes(include=[np.number])
numeric_cols = df_numeric.columns.values
print("The numeric columns are : {}".format(numeric_cols))

# select non-numeric columns
df_non_numeric = cust_address.select_dtypes(exclude=[np.number])
non_numeric_cols = df_non_numeric.columns.values
print("The non-numeric columns are : {}".format(non_numeric_cols))


# ## 1. Missing Values Check

# Checking for the presence of any missing values in the dataset. If missing values are present in a particular feature then, depending on the situation, the feature may either be dropped (when a major share of its data is missing) or an appropriate value may be imputed into the column.

# In[7]:


# Total number of missing values per column
cust_address.isnull().sum()


# There are no missing values in the dataset.
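# Since the dataset is complete, no action is needed here. For reference, a minimal sketch of the two strategies described above; the 50% threshold and the mode-based imputation are illustrative assumptions, not part of this analysis.

# In[ ]:


# Illustrative only -- this dataset has no missing values.
missing_share = cust_address.isnull().mean()

# Strategy 1: drop any feature where a major share (here, over half) of the data is missing.
too_sparse = missing_share[missing_share > 0.5].index
cust_address_example = cust_address.drop(columns=too_sparse)

# Strategy 2: impute a categorical column's gaps with its most frequent value.
cust_address_example['state'] = cust_address_example['state'].fillna(
    cust_address_example['state'].mode()[0]
)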
# ## 2. Inconsistency Check in Data

# We will check whether any inconsistent or mistyped data is present in the categorical columns. The columns to be checked are 'address', 'postcode', 'state' and 'country'.

# ### 2.1 State

# In[8]:


cust_address['state'].value_counts()


# There is inconsistent data in the state column: New South Wales and Victoria each appear under two values, the full name and the abbreviation. The state names should be standardised, so rows with state 'New South Wales' will be replaced by 'NSW' and rows with state 'Victoria' will be replaced by 'VIC'.

# In[9]:


# Function to replace full state names with their short forms.
def replace_state_names(state_name):
    # Making the short forms of the state names the standard
    if state_name == 'New South Wales':
        return 'NSW'
    elif state_name == 'Victoria':
        return 'VIC'
    else:
        return state_name

# Applying the above function to the state column
cust_address['state'] = cust_address['state'].apply(replace_state_names)


# After applying the above function the state names are standardised and there is no inconsistency left in the state column.

# In[10]:


cust_address['state'].value_counts()
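# The same standardisation can also be written without a helper function. A minimal equivalent sketch using a mapping dict; values missing from the dict are passed through unchanged by replace().

# In[ ]:


# Equivalent one-liner: map full state names to their short forms.
state_map = {'New South Wales': 'NSW', 'Victoria': 'VIC'}
cust_address['state'] = cust_address['state'].replace(state_map)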
# ### 2.2 Country

# In[11]:


cust_address['country'].value_counts()


# There is no inconsistency in the country column.

# ### 2.3 Postcode

# The postcode column looks fine; there is no inconsistency or typo in the data.

# In[12]:


cust_address[['address', 'postcode', 'state', 'country']].drop_duplicates()


# ## 3. Duplication Checks

# We need to ensure that there are no duplicate records in the dataset, since duplication degrades data quality and can distort the analysis. Any duplicate rows need to be dropped. To check for duplicates we first remove the primary key column of the dataset and then apply the drop_duplicates() function provided by pandas.

# In[13]:


# Dropping the primary key column (customer_id) and storing the result in a temporary dataframe.
cust_address_dedupped = cust_address.drop('customer_id', axis=1).drop_duplicates()
print("Number of records after removing customer_id (pk) and duplicates : {}".format(cust_address_dedupped.shape[0]))
print("Number of records in the original dataset : {}".format(cust_address.shape[0]))


# Since both numbers are the same, there are no duplicate records in the dataset.
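# Had the two counts differed, the duplicate rows could be inspected directly with duplicated(). A sketch, illustrative only since no duplicates were found:

# In[ ]:


# keep=False flags every member of a duplicate group, not just the later copies.
dupe_mask = cust_address.drop('customer_id', axis=1).duplicated(keep=False)
cust_address[dupe_mask]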
# ## 4. Exporting the Cleaned Customer Address Data Set to CSV

# The Customer Address dataset is now clean, so we can export it to a CSV and continue the analysis of customer segments by joining it to the other tables.

# In[14]:


cust_address.to_csv('CustomerAddress_Cleaned.csv', index=False)


# ## 5. Checking for Master-Detail Record Counts

# Comparing against the master table (CustomerDemographic_Cleaned.csv), which contains the full customer data, to find the customer IDs that appear in the Customer Address dataset but were dropped from the Demographic dataset. Basically these are the customers who have an address but are not yet part of the demographics dataset.

# In[15]:


cust_demo_detail = pd.read_csv('CustomerDemographic_Cleaned.csv')


# In[16]:


cust_demo_detail.head()


# In[17]:


print("Total records in Customer_Demographic_Table : {}".format(cust_demo_detail.shape[0]))
print("Total records in Customer_Address_Table : {}".format(cust_address.shape[0]))
print("The Demographic table has {} fewer records than the Address table; "
      "these were dropped during its data cleaning process."
      .format(cust_address.shape[0] - cust_demo_detail.shape[0]))


# #### Customer IDs in the Address table getting dropped:

# In[18]:


# A left merge with indicator=True marks each address row by whether it found a
# match in the demographic table; the 'left_only' rows are the dropped customers.
cust_drop = cust_address.merge(cust_demo_detail, on='customer_id', how='left', indicator=True)
cust_drop = cust_drop[cust_drop['_merge'] == 'left_only']
cust_drop.head()
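# The same IDs can be pulled without a merge, as a cross-check on the result above (assuming, as above, that customer_id is the join key in both tables):

# In[ ]:


# Same anti-join expressed with isin(): addresses whose customer_id
# never appears in the cleaned demographic table.
dropped_ids = cust_address.loc[
    ~cust_address['customer_id'].isin(cust_demo_detail['customer_id']),
    'customer_id'
]
print("Customer IDs with an address but no demographic record : {}".format(dropped_ids.shape[0]))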