#!/usr/bin/env python
# coding: utf-8

# # Most Used Languages for Data Related Jobs
# We are going to check the most used programming languages for data related
# jobs based on Stack Overflow Developer Survey 2021 results.

# ### Import Libraries
import pandas as pd
import numpy as np

# ### Load Dataset
result_df = pd.read_csv("C:/Users/Marselo/Downloads/Stackoverflow survey results/2021 survey_results_public.csv")

# ### Overview of the Dataset
result_df.head(5)
result_df.info()

# ### Filtering the Dataset
result_df["MainBranch"].value_counts()

# We are primarily interested in respondents who code for the purpose of
# work, so keep only these two MainBranch answers.
work_branches = [
    "I am a developer by profession",
    "I am not primarily a developer, but I write code sometimes as part of my work",
]
result2_df: pd.DataFrame = result_df.loc[result_df["MainBranch"].isin(work_branches)]
result2_df.head(5)

# Drop the rows where the DevType column is null.
print(result2_df.shape)
result2_df = result2_df.dropna(subset="DevType")
print(result2_df.shape)

result2_df["DevType"].value_counts()

# One respondent may select several DevType options (";"-separated).
# The single-selection rows reveal the options that are available.
result2_df.loc[~result2_df["DevType"].str.contains(";"), "DevType"].unique()

# The data related jobs among the DevType options all contain the word
# "data":
# - Data scientist or machine learning specialist
# - Data or business analyst
# - Engineer, data
# - Database administrator
data_job_df = result2_df.loc[result2_df["DevType"].str.contains("data", case=False)]
data_job_df.info()

# Drop the rows if the LanguageHaveWorkedWith column is not available.
# In[11]:
print(data_job_df.shape)
data_job_df = data_job_df.dropna(subset="LanguageHaveWorkedWith")
print(data_job_df.shape)

data_job_df["DevType"].value_counts()

# Rows with more than one DevType might skew the language count results,
# so keep only respondents who chose exactly one data related option.
data_job_types = [
    "Data scientist or machine learning specialist",
    "Data or business analyst",
    "Engineer, data",
    "Database administrator",
]
data_job_df = data_job_df[data_job_df["DevType"].isin(data_job_types)]
data_job_df["DevType"].value_counts()

# ### Data Transformation
# Keep only the columns needed to relate professions to languages.
language_df: pd.DataFrame = data_job_df.loc[:, ["DevType", "LanguageHaveWorkedWith"]]
language_df

# Like DevType, LanguageHaveWorkedWith allows several ";"-separated
# selections; the single-selection rows reveal the available options.
single_choice = ~language_df["LanguageHaveWorkedWith"].str.contains(";")
languages = language_df.loc[single_choice, "LanguageHaveWorkedWith"].unique()
for language in languages:
    print(language)

# One dataframe per data related DevType.
ds_df = language_df[language_df["DevType"].str.contains("Data scientist or machine learning specialist")]
ds_df
da_df = language_df[language_df["DevType"].str.contains("Data or business analyst")]
de_df = language_df[language_df["DevType"].str.contains("Engineer, data")]
dba_df = language_df[language_df["DevType"].str.contains("Database administrator")]

# Next we create a new dataframe to store the languages and their count
# based on each DevType.
# Count, per DevType, how many respondents have worked with each language.
#
# NOTE(fix): the survey stores languages as one ";"-separated string, so we
# split on ";" and count exact matches. The previous regex-based
# `str.count(language)` approach miscounted: "Java" also matched inside
# "JavaScript", and the special pattern "C[^\\w+#]" missed a trailing "C"
# at the end of the string. Exact matching also removes the need to treat
# C and C++ separately.
lang_count_df = pd.DataFrame()

# Guarantee that C and C++ get a row even if nobody selected them alone
# (the original notebook added them explicitly for the same reason).
count_languages = list(languages)
for special in ("C", "C++"):
    if special not in count_languages:
        count_languages.append(special)


def add_lang_count(df: pd.DataFrame, job_type: str) -> None:
    """Fill one column of ``lang_count_df`` with language usage counts.

    Parameters
    ----------
    df : pd.DataFrame
        Rows for a single DevType; must contain a "LanguageHaveWorkedWith"
        column of ";"-separated language names.
    job_type : str
        Column label to write into ``lang_count_df``.
    """
    # The "total" row is written first so it ends up as row 0; it holds the
    # group size used later for normalization.
    lang_count_df.loc["total", job_type] = df.shape[0]
    # Exact per-language tallies: split the multi-select string, flatten,
    # and count occurrences of each exact language name.
    tallies = df["LanguageHaveWorkedWith"].str.split(";").explode().value_counts()
    for language in count_languages:
        lang_count_df.loc[language, job_type] = tallies.get(language, 0)


add_lang_count(ds_df, "Data Scientist")
add_lang_count(da_df, "Data Analyst")
add_lang_count(de_df, "Data Engineer")
add_lang_count(dba_df, "Database Administrator")
lang_count_df

# ### Data Analysis and Visualization
# Import visualization libraries.
import plotly.graph_objs as go
import plotly.express as px
import plotly.offline as pyo

pyo.init_notebook_mode()

# "Total" column: usage across all data related professions combined.
# Note that the sample size of data scientists is larger and that of
# database administrators smaller than the other groups.
total_ser = lang_count_df.sum(axis=1)
lang_count_df["Total"] = total_ser

# Normalize the counts to a share of each group's respondents ("total" is
# row 0 because it was written first), and attach a color per language.
normalize_df = lang_count_df.iloc[1:] / lang_count_df.loc["total"]
normalize_df["Color"] = "NA"
palette = px.colors.qualitative.Light24
for n in range(normalize_df.shape[0]):
    # Cycle through the palette so that more than 24 languages cannot
    # raise an IndexError.
    normalize_df.iloc[n, -1] = palette[n % len(palette)]
normalize_df

# Extract the top 5 languages from each job type. `sort_index()` is used
# to give a better visualization result.
# In[24]:
def _top_five(column: str) -> pd.DataFrame:
    """Return the 5 highest-share languages for *column*, index-sorted.

    The value column is renamed to "Total" so every top-5 frame shares the
    same column layout.
    """
    picked = normalize_df[[column, "Color"]].sort_values(column, ascending=False)[:5].sort_index()
    picked.columns = ["Total", "Color"]
    return picked


top_5_total = _top_five("Total")
top_5_ds = _top_five("Data Scientist")
top_5_da = _top_five("Data Analyst")
top_5_de = _top_five("Data Engineer")
top_5_dba = _top_five("Database Administrator")
top_5_list = [top_5_total, top_5_ds, top_5_da, top_5_de, top_5_dba]
top_5_total

# Overall top-5 languages on a single polar bar chart.
fig = go.Figure(
    go.Barpolar(
        r=top_5_total["Total"],
        theta=top_5_total.index,
        width=0.45,
        marker_color=top_5_total["Color"],
        marker_line_color="black",
        marker_line_width=2,
        opacity=0.7,
    )
)
fig.update_layout(
    template="ggplot2",
    polar=dict(
        radialaxis=dict(range=[0, 1.0], showticklabels=False, ticks=''),
        angularaxis=dict(showticklabels=True, ticks=''),
    ),
)
fig.show()

# One polar subplot per data profession.
from plotly.subplots import make_subplots

fig = make_subplots(
    rows=2,
    cols=2,
    specs=[[{'type': 'polar'}] * 2] * 2,
    subplot_titles=("Data Scientist", "Data Analyst", "Data Engineer", "Database Administrator"),
)

placements = [
    (top_5_ds, 1, 1),
    (top_5_da, 1, 2),
    (top_5_de, 2, 1),
    (top_5_dba, 2, 2),
]
for top_df, row, col in placements:
    fig.add_trace(
        go.Barpolar(
            r=top_df["Total"],
            theta=top_df.index,
            width=0.45,
            marker_color=top_df["Color"],
            marker_line_color="black",
            marker_line_width=2,
            opacity=0.7,
        ),
        row,
        col,
    )

fig.update_traces(hovertemplate='%{theta}: %{r:.0%}')


def _polar_axes() -> dict:
    """Fresh axis-style dict shared by all four polar subplots."""
    return dict(
        radialaxis=dict(range=[0, 1.0], showline=False, showticklabels=False, ticks=''),
        angularaxis=dict(showticklabels=True, ticks='', linecolor='black', linewidth=1.5),
    )


fig.update_layout(
    font_color="darkblue",
    font_family="Helvetica",
    title_font_color="darkblue",
    title_font_family="Helvetica",
    title_font_size=24,
    margin=dict(t=150),
    paper_bgcolor='#DEDCC6',
    height=800,
    width=1000,
    showlegend=False,
    title=dict(text="Popular Languages for Data Professionals"),
    template="ggplot2",
    polar=_polar_axes(),
    polar2=_polar_axes(),
    polar3=_polar_axes(),
    polar4=_polar_axes(),
)
fig.update_annotations(yshift=20)
fig.add_annotation({
    "x": 1.05,
    "y": -0.1,
    "font": {"size": 14},
    "text": "tmtsmrsl.github.io",
    "showarrow": False,
})
fig.show()