#!/usr/bin/env python
# coding: utf-8

# ## Name : ADVAIT GURUNATH CHAVAN
# ## Contact No : +91 70214 55852
# ## Mail ID : advaitchavan135@gmail.com
#
#
# ## Oasis Infobyte Data Science Internship
# ## Task 2 : Unemployment Analysis with Python
#
# ### 1. Importing the necessary dependencies

# In[1]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings(action='ignore')
import calendar
import datetime as dt
import plotly.io as plio
plio.templates
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from IPython.display import HTML,display

# ### 2. Exploring the dataset

# In[2]:

unemp = pd.read_csv('Unemployment in India.csv')

# In[3]:

unemp

# In[4]:

unemp.info()

# In[5]:

unemp.shape

# #### From the dataset information section we can infer that there are 740 non-null rows in each of the 7 columns
# #### Also from the dataset shape we can infer that there are 754 rows (null + non-null) in each of the 7 columns
# #### So there are 754 - 740 = 14 null values in each of the 7 columns of the dataset

# In[6]:

unemp.isna().sum()

# In[7]:

unemp[unemp.isnull().any(axis=1)]

# In[8]:

print(unemp[unemp.isnull().any(axis=1)].index.tolist())

# #### From above we can infer that there are null values from row number 359 to 372 in each of the seven columns
# #### So we will remove these rows and make a new dataset

# In[9]:

# Drop every row that contains a null value.  The previous
# `unemp.iloc[:, :360]` sliced *columns* (second axis), not rows, so it only
# worked because the frame has far fewer than 360 columns; `dropna()` on the
# full frame expresses the intent directly.
unemp_non_null = unemp.dropna()

# In[10]:

unemp_non_null

# In[11]:

unemp_non_null.info()

# In[12]:

unemp_non_null.shape

# In[13]:

unemp_non_null.isna().sum()

# In[14]:

unemp_non_null.Date.min(),unemp_non_null.Date.max()

# #### Hence, we have removed all the rows from row number 359 to row number 372 that consisted of null values in all of the 7 columns
# #### Since the null values were present in the same rows (359 to 372) in all of the 7 columns, we removed them without resorting to imputation

# ### 3.
# Data Transformation

# In[15]:

# The raw dates are written day-first, so tell pandas explicitly.
unemp_non_null['Date'] = pd.to_datetime(
    unemp_non_null['Date'],
    dayfirst=True,
)

# In[16]:

unemp_non_null['Frequency'] = unemp_non_null['Frequency'].astype('category')

# In[17]:

unemp_non_null['Month'] = unemp_non_null['Date'].dt.month

# In[18]:

# Plain-integer copy of the month number.
unemp_non_null['Month_num'] = unemp_non_null['Month'].map(int)

# In[19]:

# Abbreviated month label (looked up in calendar.month_abbr) for display.
unemp_non_null['Month_name'] = [
    calendar.month_abbr[month_number]
    for month_number in unemp_non_null['Month_num']
]

# In[20]:

unemp_non_null['Region'] = unemp_non_null['Region'].astype('category')

# In[21]:

# 'Year' is only a stepping stone towards the integer 'Year_num' column.
unemp_non_null['Year'] = unemp_non_null['Date'].dt.year
unemp_non_null['Year_num'] = unemp_non_null['Year'].map(int)

# In[22]:

del unemp_non_null['Year']

# In[23]:

##unemp_non_null.to_csv('unmeployment in India non null.csv', index=False)

# In[24]:

unemp_non_null

# ### 4. Statistical Data Exploration

# ###

Estimated Unemployment Rate (%): This is the unemployment rate itself, i.e. the percentage of the labor force that is currently unemployed and seeking employment.

# # # ###

Estimated Employed: This is the number of people who are currently employed.

# # ###

Estimated Labour Participation Rate (%): This represents the percentage of the working-age population that is either employed or actively seeking employment. It includes both employed and unemployed individuals.

# In[25]:

# Summary statistics of the three measures, rounded to two decimals.
unemp_non_null[
    [
        'Estimated Unemployment Rate (%)',
        'Estimated Employed',
        'Estimated Labour Participation Rate (%)',
    ]
].describe().round(2)

# #### (A) Feature vs Region(State)

# In[26]:

# Mean of each measure per region, rounded to two decimals.
feature_vs_region = (
    unemp_non_null
    .groupby(['Region'])[
        [
            'Estimated Unemployment Rate (%)',
            'Estimated Employed',
            'Estimated Labour Participation Rate (%)',
        ]
    ]
    .mean()
    .reset_index()
    .round(2)
)
feature_vs_region
#feature_vs_region.to_csv('feature_vs_region.csv', index=False)

# In[27]:

# Bar chart of the mean unemployment rate per region, values printed above bars.
fig = (
    px.bar(
        feature_vs_region,
        x='Region',
        y='Estimated Unemployment Rate (%)',
        title='Estimated Unemployment Rate (%) vs Region',
        template='plotly',
        color_discrete_sequence=['gold'],
    )
    .update_yaxes(title_text='Estimated Unemployment Rate (%)')
    .update_traces(
        text=feature_vs_region['Estimated Unemployment Rate (%)'],
        textposition='outside',
    )
)
fig.show()

# In[68]:

# Bar chart of the mean employed count per region on an enlarged canvas.
fig = (
    px.bar(
        feature_vs_region,
        x='Region',
        y='Estimated Employed',
        title='Estimated Employed Count vs Region',
        template='plotly',
        color_discrete_sequence=['gold'],
    )
    .update_yaxes(title_text='Estimated Employed Count')
    .update_traces(text=feature_vs_region['Estimated Employed'])
    .update_layout(height=1500, width=1000)
)
fig.show()

# In[29]:

# Bar chart of the mean labour participation rate per region.
fig = (
    px.bar(
        feature_vs_region,
        x='Region',
        y='Estimated Labour Participation Rate (%)',
        title='Estimated Labour Participation Rate (%) vs Region',
        template='plotly',
        color_discrete_sequence=['gold'],
    )
    .update_yaxes(title_text='Estimated Labour Participation Rate (%)')
    .update_traces(
        text=feature_vs_region['Estimated Labour Participation Rate (%)'],
        textposition='outside',
    )
)
fig.show()

# #### (B) Feature vs Area(Urban and Rural)

# In[30]:

# Mean of each measure per area, rounded to two decimals.
feature_vs_area = (
    unemp_non_null
    .groupby(['Area'])[
        [
            'Estimated Unemployment Rate (%)',
            'Estimated Employed',
            'Estimated Labour Participation Rate (%)',
        ]
    ]
    .mean()
    .reset_index()
    .round(2)
)
feature_vs_area
#feature_vs_area.to_excel('feature_vs_year.xlsx', index=False)
#feature_vs_area.to_csv('feature_vs_area.csv', index=False)

# In[31]:

fig = px.bar(feature_vs_area, x='Area', y='Estimated Unemployment Rate (%)',
             title='Estimated Unemployment Rate (%) vs Area', template='plotly')
# Set the labels for the y-axis
fig.update_yaxes(title_text='Estimated Unemployment Rate (%)')
fig.update_traces(text=feature_vs_area['Estimated Unemployment Rate (%)'],
                  textposition='outside')
# Show the plot
fig.show()

# In[32]:

fig = px.bar(feature_vs_area, x='Area', y='Estimated Employed',
             title='Estimated Employed Count vs Area', template='plotly')
# Set the labels for the y-axis
fig.update_yaxes(title_text='Estimated Employed')
fig.update_traces(text=feature_vs_area['Estimated Employed'],
                  textposition='outside')
# Show the plot
fig.show()

# In[33]:

fig = px.bar(feature_vs_area, x='Area', y='Estimated Labour Participation Rate (%)',
             title='Estimated Labour Participation Rate (%) vs Area', template='plotly')
# Set the labels for the y-axis
fig.update_yaxes(title_text='Estimated Labour Participation Rate (%)')
fig.update_traces(text=feature_vs_area['Estimated Labour Participation Rate (%)'],
                  textposition='outside')
# Show the plot
fig.show()

# #### (C) Feature vs Year and Month

# In[34]:

# Mean of each measure per (year, month).  Sort on the numeric month so the
# rows - and therefore the bars plotted from them - are in calendar order.
feature_vs_year_month = round(
    unemp_non_null
    .groupby(['Year_num', 'Month_num', 'Month_name'])[
        ['Estimated Unemployment Rate (%)',
         'Estimated Employed',
         'Estimated Labour Participation Rate (%)']
    ]
    .mean()
    .reset_index()
    .sort_values(by=['Year_num', 'Month_num']),
    2,
)
feature_vs_year_month

# In[35]:

feature_vs_year_month['Month_name'] = feature_vs_year_month['Month_name'].astype(str)
feature_vs_year_month['Year_num_Month'] = (
    feature_vs_year_month['Year_num'].astype(str)
    + ' - '
    + feature_vs_year_month['Month_name']
)
# Sorting on 'Month_name' would order months alphabetically (Apr, Aug, Dec, ...)
# and scramble the time axis of the charts below; 'Month_num' keeps it
# chronological.
feature_vs_year_month = feature_vs_year_month.sort_values(['Year_num', 'Month_num'])
feature_vs_year_month
#feature_vs_year_month.to_csv('feature_vs_year_month.csv', index=False)

# In[36]:

# Create the bar plot
fig = px.bar(feature_vs_year_month, x='Year_num_Month', y='Estimated Unemployment Rate (%)',
             title='Estimated Unemployment Rate (%) vs Year and Month',
             template='plotly', color_discrete_sequence=['red'])
# Set the labels for the x and y-axes
fig.update_xaxes(title_text='Year and Month')
fig.update_yaxes(title_text='Estimated Unemployment Rate (%)')
# Add values on the bars
fig.update_traces(text=feature_vs_year_month['Estimated Unemployment Rate (%)'],
                  textposition='outside')
# Show the plot
fig.show()

# In[37]:

# Create the bar plot
fig = px.bar(feature_vs_year_month, x='Year_num_Month', y='Estimated Employed',
             title='Estimated Employed Count vs Year and Month',
             template='plotly', color_discrete_sequence=['red'])
# Set the labels for the x and y-axes
fig.update_xaxes(title_text='Year and Month')
fig.update_yaxes(title_text='Estimated Employed')
# Add values on the bars
fig.update_traces(text=feature_vs_year_month['Estimated Employed'],
                  textposition='outside')
# Show the plot
fig.show()

# In[38]:

# Create the bar plot
fig = px.bar(feature_vs_year_month, x='Year_num_Month', y='Estimated Labour Participation Rate (%)',
             title='Estimated Labour Participation Rate (%) vs Year and Month',
             template='plotly', color_discrete_sequence=['red'])
# Set the labels for the x and y-axes
fig.update_xaxes(title_text='Year and Month')
fig.update_yaxes(title_text='Estimated Labour Participation Rate (%)')
# Add values on the bars
fig.update_traces(text=feature_vs_year_month['Estimated Labour Participation Rate (%)'],
                  textposition='outside')
# Show the plot
fig.show()

# #### (D) Feature vs Year

# In[39]:

feature_vs_year = round(
    unemp_non_null
    .groupby(['Year_num'])[
        ['Estimated Unemployment Rate (%)',
         'Estimated Employed',
         'Estimated Labour Participation Rate (%)']
    ]
    .mean()
    .reset_index(),
    2,
)
feature_vs_year
#feature_vs_year.to_csv('feature_vs_year.csv', index=False)

# In[40]:

fig = px.bar(feature_vs_year, x='Year_num', y='Estimated Unemployment Rate (%)',
             title='Estimated Unemployment Rate (%) vs Year',
             template='plotly', color_discrete_sequence=['aqua'])
# Set the labels for the y-axis
fig.update_yaxes(title_text='Estimated Unemployment Rate (%)')
fig.update_traces(text=feature_vs_year['Estimated Unemployment Rate (%)'],
                  textposition='outside')
# Show the plot
fig.show()

# In[41]:

fig = px.bar(feature_vs_year, x='Year_num', y='Estimated Employed',
             title='Estimated Employed Count vs Year',
             template='plotly', color_discrete_sequence=['aqua'])
# Set the labels for the y-axis
fig.update_yaxes(title_text='Estimated Employed')
fig.update_traces(text=feature_vs_year['Estimated Employed'],
                  textposition='outside')
# Show the plot
fig.show()

# In[42]:

fig = px.bar(feature_vs_year, x='Year_num', y='Estimated Labour Participation Rate (%)',
             title='Estimated Labour Participation Rate (%) vs Year',
             template='plotly', color_discrete_sequence=['aqua'])
# Set the labels for the y-axis
fig.update_yaxes(title_text='Estimated Labour Participation Rate (%)')
fig.update_traces(text=feature_vs_year['Estimated Labour Participation Rate (%)'],
                  textposition='outside')
# Show the plot
fig.show()

# ### 5. Using correlation, pairplot and scatterplot to understand the relation between the features

# In[43]:

# The frame also holds datetime, categorical and text columns, which cannot
# be correlated (and make DataFrame.corr() raise on pandas >= 2.0), so the
# correlation is restricted to the numeric columns.
unemp_non_null.select_dtypes(include='number').corr()

# In[44]:

sns.heatmap(unemp_non_null.select_dtypes(include='number').corr(), annot=True)

# In[45]:

# sns.pairplot builds its own figure, so no plt.figure() call is needed
# beforehand (it would only leave an extra, empty figure open).
sns.pairplot(unemp_non_null)
plt.show()

# In[46]:

# Plotly figures are independent of matplotlib; their size is controlled via
# the figure layout, not plt.figure().
fig = px.scatter_matrix(unemp_non_null, template='plotly',
                        dimensions=['Estimated Unemployment Rate (%)',
                                    'Estimated Employed',
                                    'Estimated Labour Participation Rate (%)'],
                        color='Region')
fig.show()

# In[47]:

fig = px.scatter_matrix(unemp_non_null, template='plotly',
                        dimensions=['Estimated Unemployment Rate (%)',
                                    'Estimated Employed',
                                    'Estimated Labour Participation Rate (%)'],
                        color='Area')
fig.show()

# ### 6.
# Exploratory Data Analysis

# #### [A] Estimated Unemployment Rate(%)

# NOTE: the charts in this section are plotly figures.  They are sized through
# plotly's own `height`/`width` settings, so the matplotlib `plt.figure(...)`
# calls that used to precede them (which only opened empty, never-closed
# matplotlib figures) have been removed.

# In[48]:

# Distribution of the unemployment rate per region.
fig = px.box(unemp_non_null, x='Region', y='Estimated Unemployment Rate (%)',
             color='Region', title='Estimated Unemployment rate(%) vs State',
             template='plotly')
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

# In[49]:

# Distribution of the unemployment rate per area.
fig = px.box(unemp_non_null, x='Area', y='Estimated Unemployment Rate (%)',
             color='Area', title='Estimated Unemployment rate(%) vs Area',
             template='plotly')
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

# In[50]:

# Mean unemployment rate per region, sorted ascending for the bar chart.
df_est_unemp_rate_vs_region = unemp_non_null[['Estimated Unemployment Rate (%)','Region']]
df_est_unemp_rate_vs_region = df_est_unemp_rate_vs_region.groupby('Region').mean().reset_index()
df_est_unemp_rate_vs_region = df_est_unemp_rate_vs_region.sort_values('Estimated Unemployment Rate (%)')
fig = px.bar(df_est_unemp_rate_vs_region, x='Region', y='Estimated Unemployment Rate (%)',
             color='Region',
             title='Average Estimated Unemployment Rate(%) in each state',
             template='plotly', text='Estimated Unemployment Rate (%)', height=1000)
fig.show()

# In[51]:

# Per-year subsets used by the animated charts.
unemp_2020 = unemp_non_null[unemp_non_null['Year_num'] == 2020]
unemp_2019 = unemp_non_null[unemp_non_null['Year_num'] == 2019]

# In[52]:

fig = px.bar(unemp_2020, x='Region', y='Estimated Unemployment Rate (%)',
             animation_frame = 'Month_name', color='Area',
             title='Estimated Unemployment rate across Region(States) during year 2020',
             height=900, template='plotly')
fig.update_layout(xaxis={'categoryorder':'total descending'})
# Set the frame duration used by the first animation button to 2000 ms.
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 2000
fig.show()

# In[53]:

fig = px.bar(unemp_2019, x='Region', y='Estimated Unemployment Rate (%)',
             animation_frame = 'Month_name', color='Area',
             title='Estimated Unemployment rate across Region(States) during year 2019',
             height=700, template='plotly')
fig.update_layout(xaxis={'categoryorder':'total descending'})
# Set the frame duration used by the first animation button to 2000 ms.
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 2000
fig.show()

# In[54]:

# Sunburst of the mean unemployment rate per region.
fig = px.sunburst(
    unemp_non_null.groupby(['Region'])['Estimated Unemployment Rate (%)'].mean().reset_index(),
    path=['Region'], values='Estimated Unemployment Rate (%)',
    color_continuous_scale='Plasma',
    title='Estimated Unemployment Rate (%) by Region(State)',
    height=950, template='ggplot2',
    custom_data=['Estimated Unemployment Rate (%)'])
fig.update_traces(textinfo='label+value')
fig.show()

# ####

From the sunburst chart, the average unemployment rate (%) bar plot, and the box plots, we can infer the following:

# ####

The top 5 regions (states) in India with the highest average unemployment rate (%) over the period covered by the dataset (2019–2020, including the COVID-19 lockdown) are:

# ####

1. Tripura = 28.35%

# ####

2. Haryana = 26.28%

# ####

3. Jharkhand = 20.59%

# ####

4. Bihar = 18.92%

# ####

5. Himachal Pradesh = 18.54%

# #### [B] Estimated Employed Count

# NOTE: the charts in this section are plotly figures.  They are sized through
# plotly's own `height`/`width` settings, so the matplotlib `plt.figure(...)`
# calls that used to precede them (which only opened empty, never-closed
# matplotlib figures) have been removed.

# In[55]:

# Distribution of the employed count per region.
fig = px.box(unemp_non_null, x='Region', y='Estimated Employed', color='Region',
             title='Estimated Employed Count vs State', template='plotly')
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

# In[56]:

# Distribution of the employed count per area.
fig = px.box(unemp_non_null, x='Area', y='Estimated Employed', color='Area',
             title='Estimated Employed Count vs Area', template='plotly')
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

# In[57]:

# Mean employed count per region, sorted ascending for the bar chart.
df_est_emp_vs_region = unemp_non_null[['Estimated Employed','Region']]
df_est_emp_vs_region = df_est_emp_vs_region.groupby('Region').mean().reset_index()
df_est_emp_vs_region = df_est_emp_vs_region.sort_values('Estimated Employed')
fig = px.bar(df_est_emp_vs_region, x='Region', y='Estimated Employed', color='Region',
             title='Average Estimated Employed in each state', template='plotly',
             text='Estimated Employed', height=1000)
fig.show()

# In[58]:

fig = px.bar(unemp_2020, x='Region', y='Estimated Employed',
             animation_frame = 'Month_name', color='Area',
             title='Estimated Employed count across Region(States) in India in 2020',
             height=700, template='plotly')
fig.update_layout(xaxis={'categoryorder':'total descending'})
# Set the frame duration used by the first animation button to 2000 ms.
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 2000
fig.show()

# In[59]:

fig = px.bar(unemp_2019, x='Region', y='Estimated Employed',
             animation_frame = 'Month_name', color='Area',
             title='Estimated Employed count across Region(States) of India in 2019',
             height=700, template='plotly')
fig.update_layout(xaxis={'categoryorder':'total descending'})
# Set the frame duration used by the first animation button to 2000 ms.
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 2000
fig.show()

# In[60]:

# Sunburst of the mean employed count per region.
fig = px.sunburst(
    unemp_non_null.groupby(['Region'])['Estimated Employed'].mean().reset_index(),
    path=['Region'], values='Estimated Employed',
    color_continuous_scale='Plasma',
    title='Estimated Employed Count by Region(State)',
    height=1050, template='ggplot2',
    custom_data=['Estimated Employed'])
fig.update_traces(textinfo='label+value')
fig.show()

# ####

From the sunburst chart, the average employed count bar plot, and the box plots, we can infer the following:

# ####

The top 5 regions (states) in India with the highest average employed count over the period covered by the dataset (2019–2020, including the COVID-19 lockdown) are:

# ####

1. Uttar Pradesh = 28.09 Million

# ####

2. Maharashtra = 19.99 Million

# ####

3. West Bengal = 17.19 Million

# ####

4. Bihar = 12.37 Million

# ####

5. Tamil Nadu = 12.27 Million

# #### [C] Estimated Labour Participation Rate (%)

# NOTE: the charts in this section are plotly figures.  They are sized through
# plotly's own `height`/`width` settings, so the matplotlib `plt.figure(...)`
# calls that used to precede them (which only opened empty, never-closed
# matplotlib figures) have been removed.

# In[61]:

# Distribution of the labour participation rate per region.
fig = px.box(unemp_non_null, x='Region', y='Estimated Labour Participation Rate (%)',
             color='Region',
             title='Estimated Labour Participation Rate (%) vs State',
             template='plotly')
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

# In[62]:

# Distribution of the labour participation rate per area.
fig = px.box(unemp_non_null, x='Area', y='Estimated Labour Participation Rate (%)',
             color='Area',
             title='Estimated Labour Participation rate(%) vs Area',
             template='plotly')
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

# In[63]:

# Mean labour participation rate per region, sorted ascending for the bar chart.
df_est_lab_par_rate_vs_region = unemp_non_null[['Estimated Labour Participation Rate (%)','Region']]
df_est_lab_par_rate_vs_region = df_est_lab_par_rate_vs_region.groupby('Region').mean().reset_index()
df_est_lab_par_rate_vs_region = df_est_lab_par_rate_vs_region.sort_values('Estimated Labour Participation Rate (%)')
fig = px.bar(df_est_lab_par_rate_vs_region, x='Region',
             y='Estimated Labour Participation Rate (%)', color='Region',
             title='Average Estimated Labour Participation Rate (%) in each state',
             template='plotly', text='Estimated Labour Participation Rate (%)',
             height=1000)
fig.show()

# In[64]:

fig = px.bar(unemp_2020, x='Region', y='Estimated Labour Participation Rate (%)',
             animation_frame = 'Month_name', color='Area',
             title='Estimated Labour Participation Rate (%) across Region(States) during year 2020',
             height=900, template='plotly')
fig.update_layout(xaxis={'categoryorder':'total descending'})
# Set the frame duration used by the first animation button to 2000 ms.
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 2000
fig.show()

# In[65]:

fig = px.bar(unemp_2019, x='Region', y='Estimated Labour Participation Rate (%)',
             animation_frame = 'Month_name', color='Area',
             title='Estimated Labour Participation Rate (%) across Region(States) during year 2019',
             height=700, template='plotly')
fig.update_layout(xaxis={'categoryorder':'total descending'})
# Set the frame duration used by the first animation button to 2000 ms.
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 2000
fig.show()

# In[66]:

# Sunburst of the mean labour participation rate per region.
fig = px.sunburst(
    unemp_non_null.groupby(['Region'])['Estimated Labour Participation Rate (%)'].mean().reset_index(),
    path=['Region'], values='Estimated Labour Participation Rate (%)',
    color_continuous_scale='Plasma',
    title='Estimated Labour Participation Rate (%) by Region(State)',
    height=950, template='ggplot2',
    custom_data=['Estimated Labour Participation Rate (%)'])
fig.update_traces(textinfo='label+value')
fig.show()

# ####

From the sunburst chart, the average labour participation rate (%) bar plot, and the box plots, we can infer the following:

# ####

The top 5 regions (states) in India with the highest average labour participation rate (%) over the period covered by the dataset (2019–2020, including the COVID-19 lockdown) are:

# ####

1. Tripura = 61.82%

# ####

2. Meghalaya = 57.08%

# ####

3. Telangana = 53.00%

# ####

4. Gujarat = 46.10%

# ####

5. Sikkim = 46.07%

# In[ ]: