#!/usr/bin/env python
# coding: utf-8
# ## Name : ADVAIT GURUNATH CHAVAN
# ## Contact No : +91 70214 55852
# ## Mail ID : advaitchavan135@gmail.com
#
#
# ## Oasis Infobyte Data Science Internship
# ## Task 2 : Unemployment Analysis with Python
#
# ### 1. Importing the necessary dependencies
# In[1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings(action='ignore')
import calendar
import datetime as dt
import plotly.io as plio
plio.templates
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from IPython.display import HTML,display
# ### 2. Exploring the dataset
# In[2]:
unemp = pd.read_csv('Unemployment in India.csv')
# In[3]:
unemp
# In[4]:
unemp.info()
# In[5]:
unemp.shape
# #### From the dataset information section we can infer that there are 740 non-null rows in each of the 7 columns
# #### Also, from the dataset shape we can infer that there are 754 rows (null + non-null) in each of the 7 columns
# #### So there are 754 - 740 = 14 null values in each of the 7 columns of the dataset
# In[6]:
unemp.isna().sum()
# In[7]:
# Rows that contain at least one missing value.
null_rows = unemp[unemp.isnull().any(axis=1)]
null_rows
# In[8]:
print(null_rows.index.tolist())
# #### From above we can infer that there are null values from row number 359 to 372 in each of the seven columns
# #### So we will remove these rows and make a new dataset
# In[9]:
# NOTE: this used to read `unemp.iloc[:, :360].dropna()`. The `[:, :360]`
# slice selects *columns*, not rows, so on this 7-column frame it kept
# everything and dropna() alone removed the incomplete rows. The misleading
# slice is gone; `.copy()` makes the result an independent frame so the
# column assignments below never write into a view of `unemp`.
unemp_non_null = unemp.dropna().copy()
# In[10]:
unemp_non_null
# In[11]:
unemp_non_null.info()
# In[12]:
unemp_non_null.shape
# In[13]:
unemp_non_null.isna().sum()
# In[14]:
# Date is still the raw text column here; it is parsed in section 3.
unemp_non_null['Date'].min(), unemp_non_null['Date'].max()
# #### Hence, we have removed all the rows (row number 359 to row number 372) that consisted of null values in all of the 7 columns
# #### Since the null values were present in the same rows (359 to 372) in all of the 7 columns, we removed them instead of imputing them
# ### 3. Data Transformation
# In[15]:
# The source dates are day-first (dd-mm-yyyy).
unemp_non_null['Date'] = pd.to_datetime(unemp_non_null['Date'], dayfirst=True)
# In[16]:
unemp_non_null['Frequency'] = unemp_non_null['Frequency'].astype('category')
# In[17]:
unemp_non_null['Month'] = unemp_non_null['Date'].dt.month
# In[18]:
unemp_non_null['Month_num'] = unemp_non_null['Month'].astype('int64')
# In[19]:
# Three-letter month label (Jan, Feb, ...) used on plot axes and animation frames.
unemp_non_null['Month_name'] = unemp_non_null['Month_num'].map(lambda m: calendar.month_abbr[m])
# In[20]:
unemp_non_null['Region'] = unemp_non_null['Region'].astype('category')
# In[21]:
# Built directly from the parsed date; the temporary 'Year' column that was
# created and immediately dropped is no longer needed.
unemp_non_null['Year_num'] = unemp_non_null['Date'].dt.year.astype('int64')
# In[22]:
# In[23]:
##unemp_non_null.to_csv('unmeployment in India non null.csv', index=False)
# In[24]:
unemp_non_null
# ### 4. Statistical Data Exploration
# #### Estimated Unemployment Rate (%): This is the unemployment rate being analysed. It is the percentage of the labour force that is currently unemployed and seeking employment.
# #### Estimated Employed: This is the number of people who are currently employed.
# #### Estimated Labour Participation Rate (%): This represents the percentage of the working-age population that is either employed or actively seeking employment. It includes both employed and unemployed individuals.
# In[25]:
# The three measures summarised throughout this section.
summary_cols = [
    'Estimated Unemployment Rate (%)',
    'Estimated Employed',
    'Estimated Labour Participation Rate (%)',
]
unemp_non_null[summary_cols].describe().round(2)
# #### (A) Feature vs Region(State)
# In[26]:
# Mean of each measure per region, rounded to 2 decimals.
feature_vs_region = (
    unemp_non_null.groupby(['Region'])[summary_cols]
    .mean()
    .reset_index()
    .round(2)
)
feature_vs_region
#feature_vs_region.to_csv('feature_vs_region.csv', index=False)
# In[27]:
fig = px.bar(
    feature_vs_region,
    x='Region',
    y='Estimated Unemployment Rate (%)',
    title='Estimated Unemployment Rate (%) vs Region',
    template='plotly',
    color_discrete_sequence=['gold'],
)
# Set the labels for the y-axis
fig.update_yaxes(title_text='Estimated Unemployment Rate (%)')
fig.update_traces(
    text=feature_vs_region['Estimated Unemployment Rate (%)'],
    textposition='outside',
)
# Show the plot
fig.show()
# In[68]:
fig = px.bar(
    feature_vs_region,
    x='Region',
    y='Estimated Employed',
    title='Estimated Employed Count vs Region',
    template='plotly',
    color_discrete_sequence=['gold'],
)
# Set the labels for the y-axis
fig.update_yaxes(title_text='Estimated Employed Count')
fig.update_traces(text=feature_vs_region['Estimated Employed'])
# This chart is given an explicit (tall) size, unlike its neighbours.
fig.update_layout(height=1500, width=1000)
# Show the plot
fig.show()
# In[29]:
fig = px.bar(
    feature_vs_region,
    x='Region',
    y='Estimated Labour Participation Rate (%)',
    title='Estimated Labour Participation Rate (%) vs Region',
    template='plotly',
    color_discrete_sequence=['gold'],
)
# Set the labels for the y-axis
fig.update_yaxes(title_text='Estimated Labour Participation Rate (%)')
fig.update_traces(
    text=feature_vs_region['Estimated Labour Participation Rate (%)'],
    textposition='outside',
)
# Show the plot
fig.show()
# #### (B) Feature vs Area(Urban and Rural)
# In[30]:
# Mean of each measure per area, rounded to 2 decimals.
feature_vs_area = (
    unemp_non_null.groupby(['Area'])[summary_cols]
    .mean()
    .reset_index()
    .round(2)
)
feature_vs_area
#feature_vs_area.to_excel('feature_vs_year.xlsx', index=False)
#feature_vs_area.to_csv('feature_vs_area.csv', index=False)
# In[31]:
fig = px.bar(
    feature_vs_area,
    x='Area',
    y='Estimated Unemployment Rate (%)',
    title='Estimated Unemployment Rate (%) vs Area',
    template='plotly',
)
# Set the labels for the y-axis
fig.update_yaxes(title_text='Estimated Unemployment Rate (%)')
fig.update_traces(
    text=feature_vs_area['Estimated Unemployment Rate (%)'],
    textposition='outside',
)
# Show the plot
fig.show()
# In[32]:
fig = px.bar(
    feature_vs_area,
    x='Area',
    y='Estimated Employed',
    title='Estimated Employed Count vs Area',
    template='plotly',
)
# Set the labels for the y-axis
fig.update_yaxes(title_text='Estimated Employed')
fig.update_traces(text=feature_vs_area['Estimated Employed'], textposition='outside')
# Show the plot
fig.show()
# In[33]:
fig = px.bar(
    feature_vs_area,
    x='Area',
    y='Estimated Labour Participation Rate (%)',
    title='Estimated Labour Participation Rate (%) vs Area',
    template='plotly',
)
# Set the labels for the y-axis
fig.update_yaxes(title_text='Estimated Labour Participation Rate (%)')
fig.update_traces(
    text=feature_vs_area['Estimated Labour Participation Rate (%)'],
    textposition='outside',
)
# Show the plot
fig.show()
# #### (C) Feature vs Year and Month
# In[34]:
# The three measures summarised in the tables below.
measure_cols = [
    'Estimated Unemployment Rate (%)',
    'Estimated Employed',
    'Estimated Labour Participation Rate (%)',
]
feature_vs_year_month = (
    unemp_non_null.groupby(['Year_num', 'Month_num', 'Month_name'])[measure_cols]
    .mean()
    .reset_index()
    .sort_values(by=['Year_num', 'Month_num'])
    .round(2)
)
feature_vs_year_month
# In[35]:
feature_vs_year_month['Month_name'] = feature_vs_year_month['Month_name'].astype(str)
feature_vs_year_month['Year_num_Month'] = (
    feature_vs_year_month['Year_num'].astype(str)
    + ' - '
    + feature_vs_year_month['Month_name']
)
# Sort by the month *number*: sorting by 'Month_name' ordered the rows
# alphabetically (Apr, Aug, Dec, ...) and the bar charts below draw their
# x axis in row order, so the time axis was scrambled.
feature_vs_year_month = feature_vs_year_month.sort_values(['Year_num', 'Month_num'])
feature_vs_year_month
#feature_vs_year_month.to_csv('feature_vs_year_month.csv', index=False)
# In[36]:
# Create the bar plot
fig = px.bar(
    feature_vs_year_month,
    x='Year_num_Month',
    y='Estimated Unemployment Rate (%)',
    title='Estimated Unemployment Rate (%) vs Year and Month',
    template='plotly',
    color_discrete_sequence=['red'],
)
# Set the labels for the x and y-axes
fig.update_xaxes(title_text='Year and Month')
fig.update_yaxes(title_text='Estimated Unemployment Rate (%)')
# Add values on the bars
fig.update_traces(
    text=feature_vs_year_month['Estimated Unemployment Rate (%)'],
    textposition='outside',
)
# Show the plot
fig.show()
# In[37]:
# Create the bar plot
fig = px.bar(
    feature_vs_year_month,
    x='Year_num_Month',
    y='Estimated Employed',
    title='Estimated Employed Count vs Year and Month',
    template='plotly',
    color_discrete_sequence=['red'],
)
# Set the labels for the x and y-axes
fig.update_xaxes(title_text='Year and Month')
fig.update_yaxes(title_text='Estimated Employed')
# Add values on the bars
fig.update_traces(
    text=feature_vs_year_month['Estimated Employed'],
    textposition='outside',
)
# Show the plot
fig.show()
# In[38]:
# Create the bar plot
fig = px.bar(
    feature_vs_year_month,
    x='Year_num_Month',
    y='Estimated Labour Participation Rate (%)',
    title='Estimated Labour Participation Rate (%) vs Year and Month',
    template='plotly',
    color_discrete_sequence=['red'],
)
# Set the labels for the x and y-axes
fig.update_xaxes(title_text='Year and Month')
fig.update_yaxes(title_text='Estimated Labour Participation Rate (%)')
# Add values on the bars
fig.update_traces(
    text=feature_vs_year_month['Estimated Labour Participation Rate (%)'],
    textposition='outside',
)
# Show the plot
fig.show()
# #### (D) Feature vs Year
# In[39]:
feature_vs_year = (
    unemp_non_null.groupby(['Year_num'])[measure_cols]
    .mean()
    .reset_index()
    .round(2)
)
feature_vs_year
#feature_vs_year.to_csv('feature_vs_year.csv', index=False)
# In[40]:
fig = px.bar(
    feature_vs_year,
    x='Year_num',
    y='Estimated Unemployment Rate (%)',
    title='Estimated Unemployment Rate (%) vs Year',
    template='plotly',
    color_discrete_sequence=['aqua'],
)
# Set the labels for the y-axis
fig.update_yaxes(title_text='Estimated Unemployment Rate (%)')
fig.update_traces(
    text=feature_vs_year['Estimated Unemployment Rate (%)'],
    textposition='outside',
)
# Show the plot
fig.show()
# In[41]:
fig = px.bar(
    feature_vs_year,
    x='Year_num',
    y='Estimated Employed',
    title='Estimated Employed Count vs Year',
    template='plotly',
    color_discrete_sequence=['aqua'],
)
# Set the labels for the y-axis
fig.update_yaxes(title_text='Estimated Employed')
fig.update_traces(text=feature_vs_year['Estimated Employed'], textposition='outside')
# Show the plot
fig.show()
# In[42]:
fig = px.bar(
    feature_vs_year,
    x='Year_num',
    y='Estimated Labour Participation Rate (%)',
    title='Estimated Labour Participation Rate (%) vs Year',
    template='plotly',
    color_discrete_sequence=['aqua'],
)
# Set the labels for the y-axis
fig.update_yaxes(title_text='Estimated Labour Participation Rate (%)')
fig.update_traces(
    text=feature_vs_year['Estimated Labour Participation Rate (%)'],
    textposition='outside',
)
# Show the plot
fig.show()
# ### 5. Using correlation, pairplot and scatterplot to understand the relation between the features
# In[43]:
# Correlate numeric columns only: the frame also holds datetime, category
# and string columns, which DataFrame.corr() rejects on pandas >= 2.0.
numeric_features = unemp_non_null.select_dtypes(include='number')
numeric_features.corr()
# In[44]:
sns.heatmap(numeric_features.corr(), annot=True)
# In[45]:
# pairplot builds its own figure, so no plt.figure() call is needed first.
sns.pairplot(unemp_non_null)
plt.show()
# In[46]:
# plotly figures are independent of matplotlib; the plt.figure() calls that
# used to precede them only produced empty matplotlib figures.
fig = px.scatter_matrix(
    unemp_non_null,
    template='plotly',
    dimensions=[
        'Estimated Unemployment Rate (%)',
        'Estimated Employed',
        'Estimated Labour Participation Rate (%)',
    ],
    color='Region',
)
fig.show()
# In[47]:
fig = px.scatter_matrix(
    unemp_non_null,
    template='plotly',
    dimensions=[
        'Estimated Unemployment Rate (%)',
        'Estimated Employed',
        'Estimated Labour Participation Rate (%)',
    ],
    color='Area',
)
fig.show()
# ### 6. Exploratory Data Analysis
# #### [A] Estimated Unemployment Rate(%)
# In[48]:
fig = px.box(
    unemp_non_null,
    x='Region',
    y='Estimated Unemployment Rate (%)',
    color='Region',
    title='Estimated Unemployment rate(%) vs State',
    template='plotly',
)
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.show()
# In[49]:
fig = px.box(
    unemp_non_null,
    x='Area',
    y='Estimated Unemployment Rate (%)',
    color='Area',
    title='Estimated Unemployment rate(%) vs Area',
    template='plotly',
)
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.show()
# In[50]:
# Average unemployment rate per region, ascending.
df_est_unemp_rate_vs_region = (
    unemp_non_null[['Estimated Unemployment Rate (%)', 'Region']]
    .groupby('Region')
    .mean()
    .reset_index()
    .sort_values('Estimated Unemployment Rate (%)')
)
fig = px.bar(
    df_est_unemp_rate_vs_region,
    x='Region',
    y='Estimated Unemployment Rate (%)',
    color='Region',
    title='Average Estimated Unemployment Rate(%) in each state',
    template='plotly',
    text='Estimated Unemployment Rate (%)',
    height=1000,
)
fig.show()
# In[51]:
# Per-year subsets used by the animated month-by-month charts.
unemp_2020 = unemp_non_null[unemp_non_null['Year_num'] == 2020]
unemp_2019 = unemp_non_null[unemp_non_null['Year_num'] == 2019]
# In[52]:
fig = px.bar(
    unemp_2020,
    x='Region',
    y='Estimated Unemployment Rate (%)',
    animation_frame='Month_name',
    color='Area',
    title='Estimated Unemployment rate across Region(States) during year 2020',
    height=900,
    template='plotly',
)
fig.update_layout(xaxis={'categoryorder': 'total descending'})
# Slow the animation's play button to 2000 ms per frame.
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 2000
fig.show()
# In[53]:
fig = px.bar(
    unemp_2019,
    x='Region',
    y='Estimated Unemployment Rate (%)',
    animation_frame='Month_name',
    color='Area',
    title='Estimated Unemployment rate across Region(States) during year 2019',
    height=700,
    template='plotly',
)
fig.update_layout(xaxis={'categoryorder': 'total descending'})
# Slow the animation's play button to 2000 ms per frame.
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 2000
fig.show()
# In[54]:
fig = px.sunburst(
    unemp_non_null.groupby(['Region'])['Estimated Unemployment Rate (%)'].mean().reset_index(),
    path=['Region'],
    values='Estimated Unemployment Rate (%)',
    color_continuous_scale='Plasma',
    title='Estimated Unemployment Rate (%) by Region(State)',
    height=950,
    template='ggplot2',
    custom_data=['Estimated Unemployment Rate (%)'],
)
fig.update_traces(textinfo='label+value')
fig.show()
# #### From the sunburst plot, the avg. unemployment rate (%) bar plot and the box plots we can infer the following:
# #### The top 5 regions (states) in India having the highest unemployment rate (%) during the COVID-19 lockdown are:
# #### 1. Tripura = 28.35%
# #### 2. Haryana = 26.28%
# #### 3. Jharkhand = 20.59%
# #### 4. Bihar = 18.92%
# #### 5. Himachal Pradesh = 18.54%
# #### [B] Estimated Employed Count
# In[55]:
# plotly figures are independent of matplotlib; the plt.figure() calls that
# used to precede each chart only produced empty matplotlib figures.
fig = px.box(
    unemp_non_null,
    x='Region',
    y='Estimated Employed',
    color='Region',
    title='Estimated Employed Count vs State',
    template='plotly',
)
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.show()
# In[56]:
fig = px.box(
    unemp_non_null,
    x='Area',
    y='Estimated Employed',
    color='Area',
    title='Estimated Employed Count vs Area',
    template='plotly',
)
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.show()
# In[57]:
# Average employed count per region, ascending.
df_est_emp_vs_region = (
    unemp_non_null[['Estimated Employed', 'Region']]
    .groupby('Region')
    .mean()
    .reset_index()
    .sort_values('Estimated Employed')
)
fig = px.bar(
    df_est_emp_vs_region,
    x='Region',
    y='Estimated Employed',
    color='Region',
    title='Average Estimated Employed in each state',
    template='plotly',
    text='Estimated Employed',
    height=1000,
)
fig.show()
# In[58]:
fig = px.bar(
    unemp_2020,
    x='Region',
    y='Estimated Employed',
    animation_frame='Month_name',
    color='Area',
    title='Estimated Employed count across Region(States) in India in 2020',
    height=700,
    template='plotly',
)
fig.update_layout(xaxis={'categoryorder': 'total descending'})
# Slow the animation's play button to 2000 ms per frame.
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 2000
fig.show()
# In[59]:
fig = px.bar(
    unemp_2019,
    x='Region',
    y='Estimated Employed',
    animation_frame='Month_name',
    color='Area',
    title='Estimated Employed count across Region(States) of India in 2019',
    height=700,
    template='plotly',
)
fig.update_layout(xaxis={'categoryorder': 'total descending'})
# Slow the animation's play button to 2000 ms per frame.
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 2000
fig.show()
# In[60]:
fig = px.sunburst(
    unemp_non_null.groupby(['Region'])['Estimated Employed'].mean().reset_index(),
    path=['Region'],
    values='Estimated Employed',
    color_continuous_scale='Plasma',
    title='Estimated Employed Count by Region(State)',
    height=1050,
    template='ggplot2',
    custom_data=['Estimated Employed'],
)
fig.update_traces(textinfo='label+value')
fig.show()
# #### From the sunburst plot, the avg. employed count bar plot and the box plots we can infer the following:
# #### The top 5 regions (states) in India having the highest employed count during the COVID-19 lockdown are:
# #### 1. Uttar Pradesh = 28.09 Million
# #### 2. Maharashtra = 19.99 Million
# #### 3. West Bengal = 17.19 Million
# #### 4. Bihar = 12.37 Million
# #### 5. Tamil Nadu = 12.27 Million
# #### [C] Estimated Labour Participation Rate (%)
# In[61]:
# plotly figures are independent of matplotlib; the plt.figure() calls that
# used to precede each chart only produced empty matplotlib figures.
fig = px.box(
    unemp_non_null,
    x='Region',
    y='Estimated Labour Participation Rate (%)',
    color='Region',
    title='Estimated Labour Participation Rate (%) vs State',
    template='plotly',
)
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.show()
# In[62]:
fig = px.box(
    unemp_non_null,
    x='Area',
    y='Estimated Labour Participation Rate (%)',
    color='Area',
    title='Estimated Labour Participation rate(%) vs Area',
    template='plotly',
)
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.show()
# In[63]:
# Average labour participation rate per region, ascending.
df_est_lab_par_rate_vs_region = (
    unemp_non_null[['Estimated Labour Participation Rate (%)', 'Region']]
    .groupby('Region')
    .mean()
    .reset_index()
    .sort_values('Estimated Labour Participation Rate (%)')
)
fig = px.bar(
    df_est_lab_par_rate_vs_region,
    x='Region',
    y='Estimated Labour Participation Rate (%)',
    color='Region',
    title='Average Estimated Labour Participation Rate (%) in each state',
    template='plotly',
    text='Estimated Labour Participation Rate (%)',
    height=1000,
)
fig.show()
# In[64]:
fig = px.bar(
    unemp_2020,
    x='Region',
    y='Estimated Labour Participation Rate (%)',
    animation_frame='Month_name',
    color='Area',
    title='Estimated Labour Participation Rate (%) across Region(States) during year 2020',
    height=900,
    template='plotly',
)
fig.update_layout(xaxis={'categoryorder': 'total descending'})
# Slow the animation's play button to 2000 ms per frame.
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 2000
fig.show()
# In[65]:
fig = px.bar(
    unemp_2019,
    x='Region',
    y='Estimated Labour Participation Rate (%)',
    animation_frame='Month_name',
    color='Area',
    title='Estimated Labour Participation Rate (%) across Region(States) during year 2019',
    height=700,
    template='plotly',
)
fig.update_layout(xaxis={'categoryorder': 'total descending'})
# Slow the animation's play button to 2000 ms per frame.
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 2000
fig.show()
# In[66]:
fig = px.sunburst(
    unemp_non_null.groupby(['Region'])['Estimated Labour Participation Rate (%)'].mean().reset_index(),
    path=['Region'],
    values='Estimated Labour Participation Rate (%)',
    color_continuous_scale='Plasma',
    title='Estimated Labour Participation Rate (%) by Region(State)',
    height=950,
    template='ggplot2',
    custom_data=['Estimated Labour Participation Rate (%)'],
)
fig.update_traces(textinfo='label+value')
fig.show()
# #### From the sunburst plot, the avg. labour participation rate (%) bar plot and the box plots we can infer the following:
# #### The top 5 regions (states) in India having the highest labour participation rate (%) during the COVID-19 lockdown are:
# #### 1. Tripura = 61.82%
# #### 2. Meghalaya = 57.08%
# #### 3. Telangana = 53.00%
# #### 4. Gujarat = 46.10%
# #### 5. Sikkim = 46.07%
# In[ ]: