#!/usr/bin/env python
# coding: utf-8

# # **Principal Component Analysis for company data**

# In[1]:

# Pandas and NumPy to deal with data
import pandas as pd
import numpy as np

# Import the required module from sklearn to perform PCA
from sklearn.decomposition import PCA

# Let's read the dataset `companies.csv` and analyze it

# In[2]:

df = pd.read_csv("companies.csv")
df.head()

# Let's visualize both of these columns

# In[3]:

# we'll use column INDEX (0 and 1) instead of names ("employees" and
# "revenue_usd") because it's shorter to type!
ax = df.plot.scatter(x=0, y=1, title="Company Data")

# What about the units used here ???
# ax.axis('equal')

# We can see from this data that the revenue of a company is strongly
# related to the number of employees the company has.
# Revenue tends to increase as the number of employees increase.

# ## Data Normalization

# In[4]:

# Import the required module from sklearn for data normalization
from sklearn.preprocessing import StandardScaler

# StandardScaler is used for data normalization
normalize = StandardScaler()

# We define a StandardScaler and then we fit it to our data
normalize.fit(df)

# After running the fit method, the normalize object will have a
# mean_ and scale_ (std) attribute
print("Mean of the data is:", normalize.mean_)
print("Standard Deviation of the data is:", normalize.scale_)

# Now we can standardize the data using the transform method.
numpy_norm = normalize.transform(df)

# .transform returns a NumPy array, which we then convert into a Pandas DataFrame.
df_norm = pd.DataFrame(numpy_norm, columns=["employees_norm", "revenue_usd_norm"])
df_norm.head()

# Now, let's visualize the normalized data

# In[5]:

df_norm.plot.scatter(x=0, y=1, title="Normalized Data");
# #### Can we represent this information in a lower dimensional space?
# To check this, we can run `PCA` on the normalized data.

# In[6]:

pca = PCA(n_components=1)  # since this is 2-d dataset, we can only reduce it to 1-d!
pca.fit(df_norm)
data_pca = pca.transform(df_norm)
print("The original data has shape", df.shape)
print("The transformed data has shape", data_pca.shape)

# #### Using `PCA` on our data, we have reduced the number of dimensions!
# Let's see how can we get back the original number of dimensions by doing
# an inverse transformation.

# In[7]:

data_inv = pca.inverse_transform(data_pca)
print("The inverse transformed data has shape", data_inv.shape)
print("This is the same as the shape of the original data!")

# Convert the inverse transformed data into a dataframe
df_norm_inv = pd.DataFrame(data_inv, columns=df_norm.columns)

# #### We now visualize the original and the PCA projected data to see how
# well PCA performed
#
# **Aside:** Here we plot two plots on top of each other. This is done by
# making the axes of the two plots same. We get the axis of the first plot
# `ax1` and make it same as the axis of the second plot using `ax=ax1`.
#
# **Aside:** The `alpha` argument makes the plot transparent. We can see
# that the first plot is lighter and the second is much darker as we have
# passed `alpha=.2` for the first plot and `alpha=1` for the second plot

# In[8]:

ax1 = df_norm.plot.scatter(x=0, y=1, alpha=.2, color='r', label="Original Data (norm)")
df_norm_inv.plot.scatter(x=0, y=1, alpha=1, ax=ax1, label="PCA Projected Data (norm)");

# In[9]:

# plot back in the original coordinates!
df_inv = pd.DataFrame(normalize.inverse_transform(df_norm_inv), columns=df.columns)
ax2 = df.plot.scatter(x=0, y=1, alpha=.2, color='r', label="Original Data")
df_inv.plot.scatter(x=0, y=1, alpha=1, ax=ax2, label="PCA Projected Data");

# We can see from the above plot that PCA has done a great job reducing the
# number of dimensions of the data.
# The information along the least important principal axis is removed,
# leaving only the component of the data with the highest variance.

# #### How much of the variance is explained by using just one component?

# In[10]:

pca.explained_variance_ratio_

# ---
#
# # **Quiz**
#
# ---

# ## Question 1
# > Read the data in `companies.csv`. Fit a StandardScaler on both the
# > `employees` and `revenue_usd` variables. Now we have a data point of a
# > **new** company having **employees = 1020, revenue_usd = 300321**.
# > Find the normalized data point and choose it from the options given.

# In[ ]:

# read the data
dfq1 = pd.read_csv("companies.csv")

# fit with StandardScaler to normalize

# Make the new datapoint into a new 1-row dataframe and then transform!

# ## Question 2
# > Read the data in `companies_extended.csv`. Now run PCA on this dataset,
# > first with 4 components, and then with 2 components. Is the first
# > Principal Component same for both of these runs?
# >
# > To check if they are same, we'll just subtract the two vectors and find
# > the sum. If this sum is zero, we'll know that they are same. Round the
# > sum to 1 decimal place and check if it's 0.0
# >
# > `NOTE:` Skip data normalization for this question

# In[ ]:

### read the data
dfq2 = pd.read_csv("companies_extended.csv")

### fit PCA with 2 components and transform

### fit PCA with 4 components and transform

### Check if the first PC is same

# ## Question 3
# > Read in `companies_extended.csv`. Reduce the number of features/dimensions
# > to `2`, and then apply the Inverse transform to get the data back into
# > original number of dimensions. What is the mean value of the reconstructed
# > `employees` feature (The first column)?
# >
# > *Round the value to two decimal places*
# >
# > `NOTE:` Skip data normalization for this question

# In[ ]:

### read the data
dfq3 = pd.read_csv("companies_extended.csv")

### use PCA to reduce the data from 5-D to 2D

### now use inverse transform to get the data back in 5-D

### now report the mean of employee column (The first column)