#!/usr/bin/env python
# coding: utf-8

# # Stackoverflow 2019 Survey results analysis and insights extraction about OSS contributions
#
# This project follows the
# [CRISP-DM](https://www.datasciencecentral.com/profiles/blogs/crisp-dm-a-standard-methodology-to-ensure-a-good-outcome)
# process to answer the following questions:
# - How often do developers contribute to OSS?
# - Do hobbyist developers contribute more often to OSS?
# - Does OSS quality perception play a bias role towards OSS contribution?
# - Are experienced developers contributing more frequently to OSS?
# - Do developers contributing to the OSS have a higher income?

# In[1]:

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import utils


# # 1. Data understanding

# In[2]:

# Survey answers, the question text for each column, and the list of EU countries.
df = pd.read_csv("../data/so_survey_2019/survey_results_public.csv")
schema = pd.read_csv("../data/so_survey_2019/survey_results_schema.csv")
eu_countries = pd.read_csv("../data/listofeucountries.csv")
df.shape

# In[3]:

schema.head()

# In[4]:

df.head()

# In[5]:

df.columns

# In[6]:

df['DevType'].dropna()

# ## Check the ratio of missing values in each column

# In[7]:

# Share of missing values per column, expressed as a percentage.
missing_values_mean = df.isna().mean().round(4) * 100
missing_values_mean.sort_values(ascending=False).head(15)

# ## Which columns have no missing values?

# In[8]:

has_no_missing = df.isnull().mean() == 0.0
print("columns with no missing values: ", set(df.columns[has_no_missing]))

# ## Columns where less than 15% of the values are missing

# In[9]:

below_threshold = missing_values_mean < 15
set(missing_values_mean[below_threshold].index)

# # 2.
# Exploratory Data Analysis

# ## Respondents Countries

# In[10]:

# Share of respondents for the 15 most represented countries.
countries_vals = df.Country.value_counts()
total_country_answers = df.Country.count()  # non-missing Country answers
top_countries = countries_vals[:15]
countries_vals_percentage = top_countries * 100 / total_country_answers

ax = countries_vals_percentage.plot(
    kind="bar", figsize=(8, 4), width=0.8, edgecolor=None, color="steelblue")
plt.ylabel("Respondents %")
plt.title("Respondents % by Country: Total {}".format(total_country_answers))
plt.xticks(fontsize=10)
# The bars show percentages; the labels above them show absolute counts.
nb_respondents = list(top_countries)
utils.display_values_above_bars(ax, nb_respondents)
plt.show()

# Respondents whose country appears in column "x" of the EU countries file.
df_eu_respondents = df[df.Country.isin(eu_countries.x)]
df_eu_respondents.Country.unique()
total_eu_respondents = df_eu_respondents.Country.shape[0]
print("Total respondents in the USA", countries_vals["United States"])
print("Total respondents in EU", total_eu_respondents)
# sanity check
df.Country.value_counts().sum()

# ## Number of survey respondents from the EU countries

# In[11]:

eu_country_counts = df_eu_respondents.Country.value_counts()
ax_eu = (eu_country_counts * 100 / df_eu_respondents.Country.dropna().shape[0]).plot(
    kind="bar", figsize=(12, 4), width=0.8, edgecolor=None)
plt.ylabel("Respondents %")
plt.title("Respondents % from EU countries")
utils.display_values_above_bars(ax_eu, list(eu_country_counts))
plt.show()

# # LanguageWorkedWith

# In[12]:

df.LanguageWorkedWith.head()

# In[13]:

languages_worked_with = df['LanguageWorkedWith'].dropna()
language_responses = languages_worked_with.count()

# Each answer is a ';'-separated list. Explode it into one row per
# (respondent, language) pair and count exact names. Substring matching would
# over-count: "Java" is contained in "JavaScript", "C" in "C++", "C#", "CSS".
language_mentions = languages_worked_with.str.split(';').explode()
languages = language_mentions.unique()
language_counts = language_mentions.value_counts()

languages_count = utils.sort_dict_by_vals(language_counts.to_dict())
# Scaled to 0-100 so that the values match the "%" axis label below.
languages_perc = utils.sort_dict_by_vals(
    (language_counts * 100 / language_responses).to_dict())

plt.title("Programming, Scripting, and Markup Languages: {} Respondents".format(
    language_responses))
plt.xticks(rotation='vertical')
plt.ylabel("Repondents %")
# Materialise the dict views: not every matplotlib version accepts them directly.
plt.bar(list(languages_perc.keys()), list(languages_perc.values()),
        width=0.8, edgecolor=None)

# # Developer Type: Which of the following describe you?

# In[14]:

not_na_devtypes = df['DevType'].dropna()
not_na_devtypes

# In[15]:

# Every distinct developer type mentioned in the ';'-separated answers.
devtypes_list = not_na_devtypes.str.split(';').explode().unique()
print("DevTypes list", devtypes_list)

# In[18]:

# Get the developer type without the environment (the part after the comma,
# e.g. "Developer, desktop or enterprise applications" -> "Developer").
devtypes = {dev.split(',')[0].strip() for dev in devtypes_list}
devtype_count, devtype_perc = utils.count_substr_in_column(df, "DevType", devtypes, False)

plt.xticks(rotation='vertical')
plt.xlabel("Developers %")
plt.title("% of Respondents by Developer Type")
y_pos = np.arange(len(devtype_perc))
ax_devs = plt.barh(y_pos, list(devtype_perc.values()))
plt.yticks(y_pos, list(devtype_perc.keys()))
plt.show()

print("\nNumber of respondents: ", df["DevType"].dropna().shape[0])
print("\nDev Types:", devtype_count)
top8_devtypes = list(devtype_perc.keys())[:8]

# # Developers perception of the OSS quality

# In[19]:

# Percentage of all respondents (missing answers included in the denominator)
# per OSS-quality opinion, smallest share first.
os_perception = (df.OpenSource.value_counts() * 100 / df.OpenSource.shape[0]).sort_values()
os_perception.plot(kind="barh")
plt.title("Devleopers perception of the OSS quality")
plt.xlabel("Developers %")
plt.yticks(rotation=0, size=8)

# # OpenSource contribution:
# The column OpenSourcer doesn't contain any missing value. All respondents
# answered the corresponding question:
# How often do you contribute to open source?

# # 3. Evaluating results and sharing insights

# # Question 1
# How often do developers contribute to OpenSource Software?
# In[20]:

# Pie chart of how often respondents contribute to OSS.
nb_opnesourcers = df.OpenSourcer.count()
opensourcers_count = df.OpenSourcer.value_counts()
colors = ['#ff9999', 'cornflowerblue', '#ffcc99', '#99ff99']
utils.plot_and_show_pie(
    opensourcers_count.index,
    opensourcers_count.values,
    "Contribution Frequency to OSS: {} respondents".format(df.Respondent.count()),
    3,
    colors=colors)

# In[21]:

# Insert a line break into the longest answer so it fits as a tick label.
# Assign the result back to the column: an in-place replace on the attribute
# `df.OpenSourcer` is a chained operation that does not update `df` when pandas
# Copy-on-Write is active.
df["OpenSourcer"] = df["OpenSourcer"].replace({
    "Less than once a month but more than once per year":
        "Less than once a month\nbut more than once per year"})
df.OpenSourcer.value_counts() * 100 / df.shape[0]

# In[22]:

colors_list = ['lightgray', 'lightsteelblue', 'mediumseagreen', 'steelblue',
               'cornflowerblue', 'cornflowerblue']
oss_contribution_counts = df.OpenSourcer.value_counts()
# Scaled to 0-100 so that the bar heights match the "%" axis label and title.
ax_oss_contributors = (oss_contribution_counts * 100 / df.shape[0]).plot(
    kind="bar", edgecolor=None, figsize=(6, 4), color=colors_list)
plt.ylabel("Respondents %")
# The tick labels come from the value_counts() index, so they are already in
# the same order as the bars. Overriding them with df.OpenSourcer.unique()
# (first-appearance order) could attach the wrong label to a bar.
plt.xticks(rotation=30, ha="right")
plt.title("Percentage of respondents contributing to OSS:\n total {}".format(
    oss_contribution_counts.sum()))
# Write the absolute counts above the bars of THIS chart.
utils.display_values_above_bars(ax_oss_contributors, list(oss_contribution_counts))
# Colour the 3rd and 1st tick labels to echo their bar colours.
tick_labels = ax_oss_contributors.get_xticklabels()
tick_labels[2].set_color("green")
tick_labels[0].set_color("gray")
plt.show()

# # Question 2
# Do Hobbyist developers contribute more often to Open Source projects?
# In[23]:

df.Hobbyist.unique()

# In[24]:

# Hobbyist / non-hobbyist split among respondents who never contribute.
df[df.OpenSourcer == "Never"].groupby("Hobbyist")["Respondent"].count()

# In[25]:

# Number of respondents per (contribution frequency, hobbyist) pair.
hobbyist_opensourcer = df.filter(["Respondent", "OpenSourcer", "Hobbyist"], axis=1).groupby(
    ['OpenSourcer', 'Hobbyist']).count()
hobbyist_opensourcer

# In[26]:

hobbyist_opensourcer = hobbyist_opensourcer.reset_index()
hobbyist_opensourcer

# In[27]:

# Human-readable legend entries. Assign the result back instead of a chained
# in-place replace, which does not update the frame under pandas Copy-on-Write.
hobbyist_opensourcer['Hobbyist'] = hobbyist_opensourcer['Hobbyist'].replace(
    {"No": "Don't code as a Hobby", "Yes": "Code as a Hobby"})
splot = utils.plot_grouped_bars(
    hobbyist_opensourcer, "OpenSourcer", "Hobbyist", "Respondent",
    title="Do Hobbyist contribute more to OSS?", xlabel="", ylabel="# Respondents")
splot.legend(loc='center left', bbox_to_anchor=(1.00, 0.5), ncol=1)

# # Question 3
# Does OSS quality perception play a bias role towards OSS contribution

# In[28]:

# Respondent counts per (contribution frequency, OSS quality opinion), hobbyists only.
group_oss_quality_hobbyist = df[df.Hobbyist == 'Yes'].filter(
    ["Respondent", "OpenSourcer", "OpenSource"], axis=1).groupby(
    ['OpenSourcer', 'OpenSource']).count()
group_oss_quality_hobbyist

# In[29]:

# Same grouping for respondents who do not code as a hobby.
group_oss_quality_nothobbyist = df[df.Hobbyist == 'No'].filter(
    ["Respondent", "OpenSourcer", "OpenSource"], axis=1).groupby(
    ['OpenSourcer', 'OpenSource']).count()
oss_quality_perception_groups = [group_oss_quality_hobbyist, group_oss_quality_nothobbyist]
group_oss_quality_nothobbyist

# In[30]:

group_oss_quality_hobbyist = group_oss_quality_hobbyist.reset_index()
group_oss_quality_nothobbyist = group_oss_quality_nothobbyist.reset_index()

# In[31]:

# Two side-by-side grouped bar charts: hobbyists (left), non-hobbyists (right).
nr_rows = 1
nr_cols = 2
fig, axs = plt.subplots(nr_rows, nr_cols, figsize=(nr_cols * 7, nr_rows * 5),
                        squeeze=False, sharex=False)
splot1 = utils.plot_grouped_bars(
    group_oss_quality_hobbyist, "OpenSourcer", "OpenSource", "Respondent",
    ylabel="# Respondents", ax=axs[0][0], title="Hobbyists", legend=False)
splot2 = utils.plot_grouped_bars(
    group_oss_quality_nothobbyist, "OpenSourcer", "OpenSource", "Respondent",
    ylabel="# Respondents", ax=axs[0][1], title="not Hobbyists", legend=False)
plt.suptitle('Is the OSS quality perception biasing the contribution frequency?', size=14)
# One legend for both charts, positioned relative to the current (right) axes.
plt.legend(loc='center left', bbox_to_anchor=(-0.7, -0.5), ncol=1)
plt.subplots_adjust(top=.86)
plt.show()

# # Question 4
# Does the number of years of experience influence the opensource contribution frequency?

# # Coding Years of experience including Education

# In[32]:

# Check the years of experience unique values
df.YearsCodePro.unique()

# ### Map string values of YearsCodePro to numbers to use later for years of experience groups

# In[33]:

# The two non-numeric answers are mapped to numbers just outside the numeric range.
value_map = {'Less than 1 year': '0', 'More than 50 years': '51'}


def mapper(val):
    """
    Map a YearsCodePro answer to an int.

    :param val(string) value to map; the textual answers listed in
        ``value_map`` are translated first, any other value is passed to
        ``int`` as is
    :return: the number of years as an int
    :raises ValueError: if the value is neither in ``value_map`` nor numeric
    """
    return int(value_map.get(val, val))


# Missing answers are skipped; they stay NaN in the new column.
df['YearsCodeProCleaned'] = df['YearsCodePro'].dropna().apply(mapper)
df.YearsCodeProCleaned.unique()

# ### Create years of experience ranges

# In[34]:

# Consecutive boundaries are paired into half-open ranges: (0, 5), (5, 10), ...
tmp = [0, 5, 10, 20, 40, 55]
ranges = list(zip(tmp, tmp[1:]))
ranges


def in_the_range(ranges_):
    """
    Build a function that maps a number to the label of the range containing it.

    :param ranges_: non-empty sequence of ``(low, high)`` pairs, each treated
        as the half-open interval ``low <= x < high``
    :return: a function ``f(x)`` returning ``'low - high'`` for the first range
        containing ``x``, ``'>40'`` for the range starting at 40, and
        ``'>high'`` (``high`` of the last range) when no range contains ``x``
    :raises ValueError: if ``ranges_`` is empty
    """
    if not ranges_:
        raise ValueError("ranges_ must contain at least one (low, high) pair")
    # Copy the ranges passed in, so that the returned function does not depend
    # on the module-level `ranges` variable or on later changes to the list.
    bounds = list(ranges_)
    overflow_label = f'>{bounds[-1][1]}'

    def f(x):
        for low, high in bounds:
            if low <= x < high:
                # The bucket starting at 40 is shown as open-ended.
                if low == 40:
                    return '>40'
                return f'{low} - {high}'
        return overflow_label

    return f


# In[35]:

df['Years of experience'] = df['YearsCodeProCleaned'].dropna().apply(in_the_range(ranges))
df['Years of experience']

# In[36]:

# Share of respondents per experience range (missing answers excluded).
# NOTE(review): the title says "including Education", but the plotted column is
# derived from YearsCodePro - confirm which survey column was intended.
(df['Years of experience'].value_counts() / df["Years of experience"].count()).plot(
    kind="bar", title="Coding Years of experience including Education")

# ### Group the respondents by Years of Experience ranges and OSS contribution frequency

# In[37]:

opensourcers_by_age = df.filter(
    ["Respondent", "OpenSourcer", "Years of experience"], axis=1).groupby(
    ["Years of experience", "OpenSourcer"]).count()
opensourcers_by_age

# In[38]:

opensourcers_by_age = opensourcers_by_age.reset_index()

# In[39]:

splot = utils.plot_grouped_bars(
    opensourcers_by_age, "OpenSourcer", "Years of experience", "Respondent",
    xlabel="", ylabel="# Respondents")
plt.title("Frequency of contribution to OSS and years of experience groups",
          size=13, fontweight='light')
splot.legend(loc='center left', bbox_to_anchor=(1.00, 0.5), ncol=1)
plt.show()

# # Question 5
# Do developers contributing to the OSS have a higher income?

# In[40]:

# Summarize the central tendency, dispersion and shape of the dataset's
# distribution, excluding NaN values.
df.CompTotal.describe()

# In[41]:

median_salaries = np.median(df.CompTotal.dropna().values)
median_salaries

# In[42]:

# Kurtosis to measure whether the distribution is too peaked.
# numeric_only=True: the frame also holds text columns, for which kurtosis is
# undefined and which make recent pandas versions raise a TypeError.
df.kurtosis(axis=0, skipna=True, numeric_only=True)

# In[43]:

# Outliers are massively skewing the data. Only salaries below 20 times the
# median are kept to get a better interpretation of the remaining data.
salary_cutoff = median_salaries * 20
# The comparison is False for NaN, so missing salaries are dropped as well.
sns.distplot(df.loc[df.CompTotal < salary_cutoff, "CompTotal"])
plt.xlabel("Salary")
plt.ylabel("Density")
plt.title("Developers salary distribution")

# In[44]:

salary_data = df.filter(["Respondent", "OpenSourcer", "Hobbyist", "CompTotal"], axis=1)
salary_data = salary_data[salary_data.CompTotal < salary_cutoff]
salary_data.shape[0]

# In[45]:

# Number of data points including the salary outliers
df.CompTotal.count()

# In[46]:

# Number of salary data points without outliers
(df.CompTotal.dropna().values < salary_cutoff).sum()

# In[47]:

# Mean salary per contribution frequency, plus a thousands-separated string
# version of the mean for display.
salary_data = salary_data.groupby(['OpenSourcer']).agg(CompTotal=("CompTotal", 'mean'))
salary_data["mean_salary_formatted"] = salary_data["CompTotal"].map('{:,.2f}'.format)
salary_data = salary_data.reset_index()
salary_data

# In[48]:

plt.figure(figsize=(5, 4))
splot = sns.barplot(x="OpenSourcer", y="CompTotal", data=salary_data,
                    palette="Blues_d", capsize=0.1)
# Axis labels, title and per-bar value labels for the mean-salary bar chart
# held in `splot`.
plt.ylabel("Mean Salary", size=8, fontweight='light')
plt.xlabel("Freq. of contribution to OSS")
plt.xticks(rotation=50, size=8)
plt.title("What is the average salary of Open Sourcers?", size=11, fontweight='light')
for bar in splot.patches:
    bar_height = bar.get_height()
    splot.annotate(
        # Mean salary in thousands, e.g. "95K". Formatting the float directly
        # rounds the same way as round() but does not raise on a NaN height.
        f"{bar_height / 1000:.0f}K",
        # Anchor at the horizontal centre of the bar, whatever its width.
        (bar.get_x() + bar.get_width() / 2, bar_height),
        ha='center', va='center', size=10,
        # Shift the text 12 points down so it sits inside the top of the bar.
        xytext=(0, -12), textcoords='offset points', color='black')
plt.show()

# In[ ]:

# In[ ]: