#!/usr/bin/env python
# coding: utf-8

# # Social Justice in Programming

# #### 1. Business Understanding

# This project investigates the level of equal opportunities in the field of professional developers, following the CRISP-DM process.
# Survey results from Stackoverflow, which are freely accessible at https://insights.stackoverflow.com/survey, are used as the data basis.

# *Note: The notebook does not claim to fully reflect the complexity of this question. It only elaborates on what the Stackoverflow survey results indicate.*
#
# **Framing the Problem**
# The questions to be answered in this notebook are: # - Q1 Is there equal opportunity in the programming profession?
#

# - Q2 How does your social environment influence your chances of being successful as a developer?
#

# - Q3 How important is an open mind and tolerance to succeed as a programmer?
#

#
# To answer these questions, the first step is to analyze the data base in an exploratory manner.

# In[1]:

# Necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Let's get a feel for the data set

# #### 2. Data Understanding

# In[2]:

# Read in the data - gather the data
df = pd.read_csv("2017survey_results_public.csv")
schema = pd.read_csv("2017survey_results_schema.csv")
# Index the schema by column name so the question text can be looked up per column
schema.set_index("Column", inplace=True)

# In[3]:

# Size of the dataset - assess the data
print("shape of the data:", df.shape)
print(f"Survey participants: {df.shape[0]}, Questions asked: {df.shape[1]}\n")

# Summary of the df
print("df.info:")
# DataFrame.info() writes the summary to stdout itself and returns None,
# so it is called directly instead of being wrapped in print()
# (the wrapped version printed a stray "{None}" after the summary).
df.info()

pd.set_option('display.max_columns', 200)
df.head()

# The most interesting columns in regard to the questions are:
#

# >Q1: Is there equal opportunity in the programming profession?
# - **Initial conditions:** Race, HighestEducationParents, Gender
# - **Partially influenceable conditions:** Country
# *Note: The country is probably strongly related to ethnicity and needs to be considered in the study.* #

# >Q2: How does your social environment influence your chances of being successful as a developer? # - **Social environment:** HighestEducationParents, FriendsDevelopers #

# >Q3: How important is an open mind and tolerance to succeed as a programmer? # - **Mindset:** RightWrongWay, DiversityImportant #


# To evaluate whether a developer is successful, the following parameters are evaluated: # - **Evaluation parameters:** JobSatisfaction, CareerSatisfaction, Salary # #

# Of course, these evaluation criteria must take into account whether the survey participant is a professional programmer.
# - **Additional parameters:** Professional

# In[4]:

# Columns of interest
INITIAL_CONDITIONS = ["Race", "HighestEducationParents", "Gender"]
PART_INFLUENCEABLE_CONDITIONS = ["Country"]
SOCIAL_ENVIRONMENT = ["HighestEducationParents", "FriendsDevelopers"]
MINDSET = ["RightWrongWay", "DiversityImportant"]
EVALUATION_PARAMETERS = ["JobSatisfaction", "CareerSatisfaction", "Salary"]
ADDITIONAL_PARAMETERS = ["Professional"]

# De-duplicate while keeping first-seen order: dict keys preserve insertion
# order, whereas a set() makes the column order vary from run to run.
columns_of_interest = list(dict.fromkeys(
    column
    for group in (
        INITIAL_CONDITIONS,
        PART_INFLUENCEABLE_CONDITIONS,
        SOCIAL_ENVIRONMENT,
        MINDSET,
        ADDITIONAL_PARAMETERS,
        EVALUATION_PARAMETERS,
    )
    for column in group
))

# In[5]:

# Let's take a look at the questions behind the columns
for column in columns_of_interest:
    question = schema.loc[column, "Question"]
    print(f"{column}: {question}")

# In[6]:

# Build a df containing only the columns of interest
df_filtered = df[columns_of_interest].copy()
df_filtered.head()

# In[7]:

# Datatypes in the dataset
print(df_filtered.dtypes)
# Every non-numeric column is treated as categorical; this does not depend on
# text columns being stored with dtype "object".
cat_cols = list(df_filtered.select_dtypes(exclude="number").columns)
print(f"categorical cols: {cat_cols}")

# In[8]:

# Distribution of the quantitative parameters
df_filtered.describe()

# #### 3. Data Preparation

# In[9]:

# Proportions of missing evaluation parameters
prop_missing_salary = df_filtered["Salary"].isnull().mean()
prop_missing_jobsf = df_filtered["JobSatisfaction"].isnull().mean()
prop_missing_careersf = df_filtered["CareerSatisfaction"].isnull().mean()
print("Missing evaluation parameters")
print(f"Salary: {round(prop_missing_salary*100, 1)} %")
print(f"JobSatisfaction: {round(prop_missing_jobsf*100, 1)} %")
print(f"CareerSatisfaction: {round(prop_missing_careersf*100, 1)} %")

# Missing evaluation parameters combined
print("\nData left after removing every row with missing evaluation parameters:")
# We can safely drop rows where all categorical columns are nan
df_filtered.dropna(subset=cat_cols, how="all", inplace=True)
# Build a df containing only rows with the needed evaluation criteria - clean the data
df_eval = df_filtered.dropna(subset=EVALUATION_PARAMETERS, axis=0).copy()

# Due to the questions above we are only interested in data from professional developers
n_professionals_eval = int((df_eval["Professional"] == "Professional developer").sum())
n_professionals_all = int((df_filtered["Professional"] == "Professional developer").sum())
data_left = round((n_professionals_eval / n_professionals_all) * 100, 1)
print(f"{data_left} %, -> {df_eval.shape[0]}")
print(f"\nshape of the df: {df_eval.shape}")

# In order not to lose that much data, we could impute values in this step. However, this reduces the significance of our data, since we reduce the variability.
# Because our df has a lot of rows in proportion to the columns, we continue without imputation for now. # In the df df_eval, due to the filtering with the salary, there should be only survey participants whose professional status is professional developer.
# This can be easily checked using the "Professional" column.

# In[10]:

print(df_eval["Professional"].value_counts())

# The df is now filtered so we can take a look at the distribution of the selected parameters in the data set.

# In[11]:

# analyze the data
# Each value_counts() is computed once and then filtered by its own values.
# Plot only results where there are at least 50 survey participants
ethnicity_counts = df_eval["Race"].value_counts().loc[lambda counts: counts >= 50]
# Plot only results where there are at least 400 survey participants
country_counts = df_eval["Country"].value_counts().loc[lambda counts: counts >= 400]
# Plot only result where there are at least 10 survey participants
gender_counts = df_eval["Gender"].value_counts().loc[lambda counts: counts >= 10]

# In[12]:

# Default plot for data overview
def do_standard_plot(df, title, plot_type, size=(6, 6), use_y_grid=False):
    """Plot a distribution (e.g. the result of ``value_counts``) on its own figure.

    Args:
        df (pandas.Series): Distribution to plot.
        title (str): Title of the plot.
        plot_type (str): Plot kind handed to pandas' ``plot(kind=...)``,
            e.g. "pie" or "bar".
        size (sequence, optional): Figure size (width, height) in inches.
            Defaults to (6, 6).
        use_y_grid (bool, optional): True to draw a horizontal grid behind
            the bars and label the y-axis. Defaults to False.
    """
    # Always create a dedicated figure and plot onto its axes explicitly.
    # Without this, a plot without grid is drawn onto whatever axes are
    # current, i.e. on top of the previous chart when run outside a notebook.
    _, ax = plt.subplots(figsize=tuple(size))
    df.plot(kind=plot_type, ax=ax)
    ax.set_title(title)
    if use_y_grid:
        ax.grid(axis="y")
        # Keep the grid lines behind the bars
        ax.set_axisbelow(True)
        # NOTE(review): the label is assumed to belong to the grid branch
        # (count axis of bar charts) - confirm against the original notebook.
        ax.set_ylabel("Survey participants")


# Ethnicity distribution
print(ethnicity_counts)
do_standard_plot(ethnicity_counts, "Ethnicity distribution", "pie")

# It is clearly evident that one ethnicity in the chart is clearly overweighted.
# Since only ethnicities to which at least 50 survey participants belong are shown here,
# it should still be possible to make a reasonably statistically reliable statement with these.
# We can neglect the answers 'I prefer not to say' and 'I don't know'.

# So we focus on the following groups:
# *['White or of European descent', 'South Asian',
# 'Hispanic or Latino/Latina', 'East Asian', 'Middle Eastern',
# 'Hispanic or Latino/Latina; White or of European descent',
# 'Black or of African descent', 'Middle Eastern; White or of European descent']*

# In[13]:

# Ethnic groups that are compared in the evaluation
ethnicities_in_focus = [
    'White or of European descent',
    'South Asian',
    'Hispanic or Latino/Latina',
    'East Asian',
    'Middle Eastern',
    'Hispanic or Latino/Latina; White or of European descent',
    'Black or of African descent',
    'Middle Eastern; White or of European descent',
]

# In[14]:

# Country distribution
do_standard_plot(
    country_counts,
    "Country distribution",
    "bar",
    size=[8, 6],
    use_y_grid=True,
)

# In terms of countries, there is a bias towards the U.S., although the data is much more balanced on this point.
# The countries that we want to consider in the evaluation are:
# *['United States', 'United Kingdom', 'Germany', 'India', 'Canada', 'France']*

# In[15]:

# The countries in focus are exactly those that passed the participant threshold
countries_in_focus = country_counts.index.tolist()

# In[16]:

# Gender distribution
do_standard_plot(
    gender_counts,
    "Gender distribution",
    "bar",
    size=[8, 6],
    use_y_grid=True,
)

# If you look at the gender distribution, you can see, as you would expect, that the profession of developer is strongly dominated by males.
# Since we can't make reasonable statements, with too few survey participants for a gender, we will divide this property into 3 groups.
# The first two groups are *female* and *male*. All remaining answers are summarized under the term *Other*.
# In[17]:

# Reworked gender distribution
# Every answer other than "Male"/"Female" is collapsed into "Other"; missing
# answers stay NaN. A single .loc assignment replaces the former chained
# assignment (df_eval["Gender"][mask] = ...), which required disabling the
# chained-assignment warning globally and does not write back to df_eval
# when pandas runs with Copy-on-Write.
is_other_gender = df_eval["Gender"].notna() & ~df_eval["Gender"].isin(["Male", "Female"])
df_eval.loc[is_other_gender, "Gender"] = "Other"
print(df_eval["Gender"].value_counts())
do_standard_plot(df_eval["Gender"].value_counts(), "Gender distribution", "pie")

# In[18]:

# Missing values left
df_eval.isnull().sum()

# #### 4. Evaluation

# This should be enough for a first overview.
# So let's take a look at question number 1:
# **Q1 Is there equal opportunity in the programming profession?**

# *Note: The OverallSatisfaction is introduced as the sum of the job and the career satisfaction*

# In[19]:

def add_overall_sf(df):
    """Add the "OverallSatisfaction" column to df in place.

    The new column is the sum of "JobSatisfaction" and "CareerSatisfaction".

    Args:
        df (pandas.DataFrame): Frame that contains the columns
            "JobSatisfaction" and "CareerSatisfaction"; it is modified in place.
    """
    df["OverallSatisfaction"] = df["JobSatisfaction"] + df["CareerSatisfaction"]


# In[20]:

# Evaluation parameter by ethnicity
# Only the numeric evaluation columns are averaged. Calling .mean() on the
# whole frame raises a TypeError for the text columns on pandas >= 2.0, and the
# other grouped means in this file select EVALUATION_PARAMETERS the same way.
ethnicity_satifaction = (
    df_eval.groupby("Race")[EVALUATION_PARAMETERS]
    .mean()
    .loc[ethnicities_in_focus]
    .sort_values("Salary", ascending=False)
)
add_overall_sf(ethnicity_satifaction)
ethnicity_satifaction

# The table shows that a low salary does not necessarily mean low satisfaction.
# The salary varies so significantly that in the next step we have to take the countries into account.
# It is important to note that different countries have different salary levels and that there is an uneven distribution of ethnic groups across countries.
#

# In[21]:

def plot_multiple_bar_plot(data, x="Country", y="Salary", hue="Race", title="Salary by country and ethnicity", palette=["tab:blue","tab:green","tab:red","tab:cyan","tab:pink","tab:olive","tab:gray","tab:orange"]):
    """Draw a grouped bar chart: one group of bars per x-tick, one colored bar per hue category.

    Args:
        data (pandas.DataFrame): Frame holding the x-tick column, the hue
            column and the y-value column.
        x (str, optional): Column providing the x-ticks. Defaults to "Country".
        y (str, optional): Column providing the bar heights. Defaults to "Salary".
        hue (str, optional): Column whose categories become the differently
            colored bars. Defaults to "Race".
        title (str, optional): Title of the plot. Defaults to
            "Salary by country and ethnicity".
        palette (list, optional): seaborn color palette or list of matplotlib
            colors. Defaults to eight "tab:" colors.
    """
    fig, ax = plt.subplots()
    sns.barplot(data=data, x=x, y=y, hue=hue, palette=palette, ax=ax)
    fig.set_size_inches(14, 7)
    ax.legend(loc="upper center")
    ax.set_title(title)
    ax.grid(axis="y")
    # Keep the grid lines behind the bars
    ax.set_axisbelow(True)


# Plot salary by country and ethnicity
ethnicity_sf_countries = (
    df_eval.groupby(["Race", "Country"], as_index=False)[EVALUATION_PARAMETERS].mean()
)
# Select only ethnicities and countries in focus
race_in_focus = ethnicity_sf_countries["Race"].isin(ethnicities_in_focus)
country_in_focus = ethnicity_sf_countries["Country"].isin(countries_in_focus)
ethnicity_sf_countries_if = ethnicity_sf_countries[race_in_focus & country_in_focus]
# visualize results
plot_multiple_bar_plot(ethnicity_sf_countries_if)

# We can quickly see that there is not enough data available for India, so the plot is made again without India.
# In[22]:

# Select only ethnicities and countries in focus - without India
# A filter instead of list.remove(): remove() raises ValueError when "India"
# did not pass the participant threshold and is therefore not in the list.
countries_in_focus_no_india = [country for country in countries_in_focus if country != "India"]
ethnicity_sf_countries_if_mod = ethnicity_sf_countries[
    (ethnicity_sf_countries["Race"].isin(ethnicities_in_focus))
    & (ethnicity_sf_countries["Country"].isin(countries_in_focus_no_india))
]
plot_multiple_bar_plot(ethnicity_sf_countries_if_mod)

# We can clearly see that some ethnicities are disadvantaged. The "Black or of African descent" group in particular has the lowest or second lowest salary in all countries considered.

# In[23]:

# Standard diagram for the visualization of the distribution of the evaluation parameters
def plot_eval_param_dist(df, title, parameter, x_lim=(13, 16), palette=["tab:blue","tab:green","tab:red","tab:cyan","tab:pink","tab:olive", "tab:gray","tab:orange","tab:purple","tab:brown"]):
    """Plot one evaluation parameter per group as a horizontal bar chart, largest value first.

    Args:
        df (pandas.DataFrame): Frame indexed by group that contains the
            column ``parameter``. The frame is not modified.
        title (str): Title of the plot.
        parameter (str): Evaluation parameter (column of df) to plot.
        x_lim (sequence, optional): Lower and upper limit of the x-axis.
            Defaults to (13, 16).
        palette (list, optional): seaborn color palette or list of matplotlib
            colors. Defaults to ten "tab:" colors. The default list is only
            read, never modified.
    """
    # Sort a copy so that plotting does not reorder the caller's frame
    ordered = df.sort_values(parameter, ascending=False)
    _, ax = plt.subplots()
    sns.barplot(x=ordered[parameter], y=ordered.index, palette=palette, ax=ax)
    ax.set_title(title)
    ax.set_xlim(x_lim)
    ax.grid(axis="x")
    # Keep the grid lines behind the bars
    ax.set_axisbelow(True)


# Plot overall satisfaction by ethnicity
plot_eval_param_dist(ethnicity_satifaction, "Overall satisfaction by ethnicity", "OverallSatisfaction", palette=["tab:red", "tab:cyan", "tab:olive", "tab:orange", "tab:pink", "tab:green", "tab:gray", "tab:blue"])

# We can see again that the group "Black or of African descent" is clearly disadvantaged.
# People from Asia also only achieve a low level of overall satisfaction.

# In order to see which parameters affect general satisfaction and to what extent,
# the ranges between maximum and minimum satisfaction are stored in a dict.

# In[24]:

# Maps a category name to the spread (max - min) of its OverallSatisfaction
range_oa_sf = {}


def save_range_in_oa_sf(df, parameter):
    """Store the spread of "OverallSatisfaction" for one category in ``range_oa_sf``.

    The spread is the difference between the largest and the smallest
    OverallSatisfaction in df, rounded to two decimals.

    Args:
        df (pandas.DataFrame): Frame containing an "OverallSatisfaction" column.
        parameter (str): Category the distribution belongs to; used as dict key.
    """
    # The module-level dict is only mutated, never rebound, so no `global` is needed
    overall = df["OverallSatisfaction"]
    range_oa_sf[parameter] = round(overall.max() - overall.min(), 2)


# Save range
save_range_in_oa_sf(ethnicity_satifaction, "Ethnicity")

# In[25]:

# Available data HighestEducationParents
available_data_hep = int(df_eval["HighestEducationParents"].notna().sum())
print(f"Available data HighestEducationParents: {available_data_hep}")

# Evaluation parameter by HighestEducationParents
# Only the numeric evaluation columns are averaged; .mean() over the text
# columns raises a TypeError on pandas >= 2.0.
pe_satifaction = (
    df_eval.groupby("HighestEducationParents")[EVALUATION_PARAMETERS]
    .mean()
    .sort_values("Salary", ascending=False)
)
add_overall_sf(pe_satifaction)
pe_satifaction

# In[26]:

# Plot salary by HighestEducationParents
plot_eval_param_dist(pe_satifaction, "Salary by HighestEducationParents", "Salary", x_lim=[37500,70000])

# The chart clearly shows that the salary earned is heavily dependent on the education of the parents.

# In[27]:

# Plot overall satisfaction by HighestEducationParents
plot_eval_param_dist(pe_satifaction, "Overall satisfaction by HighestEducationParents", "OverallSatisfaction", x_lim=[13.5,15], palette=["tab:blue","tab:cyan","tab:red","tab:green","tab:pink","tab:orange","tab:olive","tab:brown","tab:gray","tab:purple"])

# Save satisfaction range for HighestEducationParents
save_range_in_oa_sf(pe_satifaction, "HighestEducationParents")

# As far as overall satisfaction is concerned, the graph is a little less clear. Nevertheless, we can clearly see that a high level
# of overall satisfaction is more likely to be achieved by people whose parents have achieved a high academic degree. The parents of
# the three most satisfied groups on average all have an academic degree.

# In[28]:

# Evaluation parameter by Gender
# Only the numeric evaluation columns are averaged; .mean() over the text
# columns raises a TypeError on pandas >= 2.0.
gender_satifaction = (
    df_eval.groupby("Gender")[EVALUATION_PARAMETERS]
    .mean()
    .sort_values("Salary", ascending=False)
)
add_overall_sf(gender_satifaction)
gender_satifaction

# In[29]:

# Plot salary by Gender
plot_eval_param_dist(gender_satifaction, "Salary by gender", "Salary", x_lim=[55000,63000], palette=["tab:red","tab:blue","tab:green"])

# Contrary to what one might expect, men in the developer profession earn the lowest salaries.
# One reason for this could be that the proportion of men in lower-income countries is significantly higher.
# In order to check this, the salary distribution of the different genders across the countries must be considered.

# In[30]:

# Plot salary by country and Gender
gender_sf_countries = (
    df_eval.groupby(["Gender", "Country"], as_index=False)[EVALUATION_PARAMETERS].mean()
)
# Select only countries in focus
gender_country_in_focus = gender_sf_countries["Country"].isin(countries_in_focus)
gender_sf_countries_if = gender_sf_countries[gender_country_in_focus]
plot_multiple_bar_plot(
    gender_sf_countries_if,
    hue="Gender",
    title="Salary by country and gender",
)

# When comparing countries, we can see that women are clearly disadvantaged.
# For people who assign themselves to a gender other than male or female,
# whether they belong to the best or the lowest paid group depends heavily on the country.

# In[31]:

# Plot overall satisfaction by Gender
plot_eval_param_dist(
    gender_satifaction,
    "Overall satisfaction by gender",
    "OverallSatisfaction",
    x_lim=[13.5, 14.8],
    palette=["tab:green", "tab:blue", "tab:red"],
)

# Save satisfaction range for Gender
save_range_in_oa_sf(gender_satifaction, "Gender")

# When it comes to general job and career satisfaction, the graph is reversed. Here the men are the happiest.

# In[32]:

# range overall satisfaction for the initial conditions
range_oa_sf

# Overall, we can state that ethnicity has the greatest impact on the level of satisfaction achieved.
# As the graphics show, all initial conditions have an impact on salary and satisfaction.
# Question Q1 can be clearly answered in the negative. **There is no equal opportunity in the job profile of professional developers.** # Let's continue with question 2:
# **Q2 How does your social environment influence your chances of being successful as a developer?**

# We have already seen clearly that the parents education not only has an extreme impact on salary, but also on job satisfaction.
# Now we have to check how the circle of friends affect the success. Here, however, it must be taken into account that a correlation does not necessarily mean a causality.
# If you are successful as a programmer, you have a lot to do with other developers and the probability is high that you will find new friends among them.
# In addition, many friendships are made during an academic career and in our society people with high degrees tend to earn higher incomes.

# In[33]:

# Available data FriendsDevelopers
available_data_fd = int(df_eval["FriendsDevelopers"].notna().sum())
print(f"Available data FriendsDevelopers: {available_data_fd}")

# Evaluation parameter by FriendsDevelopers
# Only the numeric evaluation columns are averaged; .mean() over the text
# columns raises a TypeError on pandas >= 2.0.
friends_dev_satifaction = (
    df_eval.groupby("FriendsDevelopers")[EVALUATION_PARAMETERS]
    .mean()
    .sort_values("Salary", ascending=False)
)
add_overall_sf(friends_dev_satifaction)
friends_dev_satifaction

# The table shows that the general level of satisfaction is very balanced.
# What is more interesting for a graphical visualisation is the correlation between social circle and salary.

# In[34]:

# Plot salary in relation to friends from the same professional field
plot_eval_param_dist(friends_dev_satifaction, "Salary / Are your friends developers?", "Salary", x_lim=[50000,62000])

# Contrary to my original assumption, we can see from the graph that people whose circle of friends includes few developers have the highest salary.
# The people who mainly surround themselves with developers earn even the lowest salary. # With regard to Q2, it can be said that the social environment has a significant impact on success as a programmer.
# The parents in particular have a major impact. The higher the educational level of the parents,
# the more likely it is to achieve a high salary and the greater the chances of being satisfied in one's job/career.
# As far as the circle of friends is concerned, it is advantageous in terms of salary not only to surround oneself
# with friends of the same professional orientation. However, this parameter is irrelevant for the level of satisfaction achieved. # Finally, we continue with question 3:
# **Q3 How important is an open mind and tolerance to succeed as a programmer?**

# In[35]:

# Available data RightWrongWay
available_data_rw = int(df_eval["RightWrongWay"].notna().sum())
print(f"Available data RightWrongWay: {available_data_rw}")

# Evaluation parameter by RightWrongWay
# Only the numeric evaluation columns are averaged; .mean() over the text
# columns raises a TypeError on pandas >= 2.0.
right_wrong_satifaction = (
    df_eval.groupby("RightWrongWay")[EVALUATION_PARAMETERS]
    .mean()
    .sort_values("Salary", ascending=False)
)
add_overall_sf(right_wrong_satifaction)
right_wrong_satifaction

# As with the circle of friends, this parameter does not have a high impact on satisfaction.
# It remains to be said, however, that the people who believe that there is a right and wrong way for everything are the most unsatisfied.
# In terms of salary, the more open a developer is to different solutions, the higher the salary he achieves.
#

# In[36]:

# Plot the relation of salary to whether people believe there is a right way and a wrong way to do everything
plot_eval_param_dist(
    right_wrong_satifaction,
    "Salary / There's a right and a wrong way to do everything",
    "Salary",
    x_lim=[35000, 67000],
    palette=["tab:pink", "tab:green", "tab:red", "tab:blue", "tab:cyan"],
)

# Since the difference in salary is so significant, we should look at this again in a country comparison.

# In[37]:

# Plot salary by country and RightWrongWay
right_wrong_sf_countries = (
    df_eval.groupby(["RightWrongWay", "Country"], as_index=False)[EVALUATION_PARAMETERS].mean()
)
# Select only countries in focus
right_wrong_country_in_focus = right_wrong_sf_countries["Country"].isin(countries_in_focus)
right_wrong_sf_countries_if = right_wrong_sf_countries[right_wrong_country_in_focus]
plot_multiple_bar_plot(
    right_wrong_sf_countries_if,
    hue="RightWrongWay",
    title="Salary / There's a right and a wrong way to do everything",
)

# We can see that in all the countries considered, people who believe that there is only one right way to solve a problem earn less than those who strongly reject this belief.
# In[38]:

# Available data DiversityImportant
available_data_di = int(df_eval["DiversityImportant"].notna().sum())
print(f"Available data DiversityImportant: {available_data_di}")

# Evaluation parameter by DiversityImportant
# Only the numeric evaluation columns are averaged; .mean() over the text
# columns raises a TypeError on pandas >= 2.0.
diversity_imp_satifaction = (
    df_eval.groupby("DiversityImportant")[EVALUATION_PARAMETERS]
    .mean()
    .sort_values("Salary", ascending=False)
)
add_overall_sf(diversity_imp_satifaction)
diversity_imp_satifaction

# In[39]:

# Plot the relation of overall satisfaction to whether people believe that Diversity in the workplace is important in percent
diversity_imp_satifaction.sort_values("OverallSatisfaction", ascending=False, inplace=True)
# Express each group's satisfaction relative to the most satisfied group (= 100 %)
overall_sf = diversity_imp_satifaction["OverallSatisfaction"]
overall_sf_percent = 100 * (overall_sf / overall_sf.max())
_, ax = plt.subplots()
sns.barplot(x=overall_sf_percent, y=diversity_imp_satifaction.index, ax=ax)
plt.title("Overall satisfaction / Diversity in the workplace is important")
plt.xlabel("OverallSatisfaction[%]")
plt.xlim([90, 100])
plt.grid(axis="x")
# Keep the grid lines behind the bars
ax.set_axisbelow(True)

# On the question of whether diversity is important in the workplace, it appears that people with a clear opinion on this earn the most.
# The group that is strongly opposed to diversity has an average income that is about €1500 higher, but is also more than 5% less satisfied than those for whom diversity is important. # To answer Q3, it is important to note that it is extremely important to be open to different solution approaches when it comes to maximizing salary.
# In addition, a clear opinion can help to maximize the salary. It should be taken into account that open people are happier.
#
# #### 5. Deployment
# For the condensed presentation of the results, check out this medium post:
# https://medium.com/@florianpue/an-inconvenient-truth-about-social-justice-in-programming-f78fb17c614b