#!/usr/bin/env python
# coding: utf-8

# # Summary Analysis of the 2017 GitHub Open Source Survey
# 
# By R. Stuart Geiger ([@staeiou](https://github.com/staeiou/)), Berkeley Institute for Data Science
# 
# 
# ## Overview
# 
# This notebook analyzes the 2017 Open Source Survey, conducted by staff at GitHub, Inc. and other collaborators (see https://opensourcesurvey.org/2017 and https://github.com/github/open-source-survey). The survey was run in 2017, asking over 50 questions on a variety of topics. The survey's designers explain the motivation, design, and distribution of the survey:
# 
# "In collaboration with researchers from academia, industry, and the community, GitHub designed a survey to gather high quality and novel data on open source software development practices and communities. We collected responses from 5,500 randomly sampled respondents sourced from over 3,800 open source repositories on GitHub.com, and over 500 responses from a non-random sample of communities that work on other platforms. The results are an open data set about the attitudes, experiences, and backgrounds of those who use, build, and maintain open source software."
# 
# ## Purpose and goal
# 
# The GitHub survey team presented analyses of some questions when releasing the survey, but there were many more questions asked that are relevant to researchers and community members. This report is an exploratory analysis of all questions asked in the survey, providing a basic summary of the responses to each question. This report presents and plots summary statistics -- mostly frequency counts, proportions, then a frequency or proportion bar graph -- of all questions asked in the survey. Most questions are presented individually, with panel questions grouped together as appropriate. There are no correlations, regressions, or descriptive breakouts between subgroups. Likert-style questions (e.g. Strongly agree <-> strongly disagree) have not been recoded to numerical, scalar values. There are no discussions or interpretations of results. This is left for future work.
# 
# The purpose of this notebook is to facilitate future research on this dataset by giving an overview of the kinds of questions asked in the survey, as well as serve as the basis for a PDF report, published on SocArXiv and OSF at https://osf.io/preprints/socarxiv/qps53/. The notebook is public on GitHub at https://github.com/staeiou/github-survey-analysis and others are encouraged to extend it as they see fit. 

# In[1]:


# Install pandas and seaborn into the running kernel's environment via an
# IPython shell escape (only works when executed under IPython/Jupyter).
get_ipython().system('pip install pandas seaborn')


# In[2]:


import pandas as pd
import matplotlib, matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
get_ipython().run_line_magic('matplotlib', 'inline')
# Global pandas setting: every float is displayed with two decimals and a
# trailing "%" sign.
pd.options.display.float_format = '{:.2f}%'.format # add % to all floats, all floats here are percentages


# In[3]:


## For making pretty tables when nbconverting to latex

pd.set_option('display.notebook_repr_html', True)


def _repr_latex_(self):
    """Return this object's ``to_latex()`` output wrapped in a LaTeX centering group.

    Installed on ``pandas.DataFrame`` below so that every DataFrame rendered
    as LaTeX is centered.
    """
    # Raw string: "\c" is not a valid Python escape sequence, so the plain
    # literal only worked by accident and triggers a warning on recent Pythons.
    return r"\centering{%s}" % self.to_latex()


pd.DataFrame._repr_latex_ = _repr_latex_  # monkey patch pandas DataFrame


# ## Download and unzip data

# In[4]:


# Extract the survey archive from the working directory; -o overwrites files
# left over from an earlier run without prompting.
get_ipython().system('unzip -o data_for_public_release.zip')


# In[5]:


# List the extracted files.
get_ipython().system('ls data_for_public_release/')


# # Data processing
# ## Main dataset

# ### Load main dataset into pandas

# In[6]:


# Let pandas display up to 500 rows before truncating a table.
pd.set_option("display.max_rows", 500)


# In[7]:


# Load the main survey file.
survey_df = pd.read_csv("data_for_public_release/survey_data.csv")


# In[8]:


print("survey_data.csv length:", survey_df.shape[0])


# In[9]:


# Keep only the rows whose STATUS column reads 'Complete'.
# NOTE(review): the analysis cells visible in this notebook read survey_df,
# not this filtered frame - confirm that is intended.
survey_complete_df = survey_df[survey_df["STATUS"] == "Complete"]
print("survey_data.csv completed responses:", survey_complete_df.shape[0])


# ### Explore the main dataset with some sample responses

# In[10]:


# First three completed responses, transposed so each variable is a row.
survey_complete_df.head(3).T


# ### Create lists of variables for bulk analysis

# In[11]:


# "Which activities do you engage in?" items.
participation_type_vars = [
    "PARTICIPATION.TYPE.FOLLOW",
    "PARTICIPATION.TYPE.USE.APPLICATIONS",
    "PARTICIPATION.TYPE.USE.DEPENDENCIES",
    "PARTICIPATION.TYPE.CONTRIBUTE",
    "PARTICIPATION.TYPE.OTHER",
]

# "How often do you engage in each activity?" items.
contrib_type_vars = [
    "CONTRIBUTOR.TYPE.CONTRIBUTE.CODE",
    "CONTRIBUTOR.TYPE.CONTRIBUTE.DOCS",
    "CONTRIBUTOR.TYPE.PROJECT.MAINTENANCE",
    "CONTRIBUTOR.TYPE.FILE.BUGS",
    "CONTRIBUTOR.TYPE.FEATURE.REQUESTS",
    "CONTRIBUTOR.TYPE.COMMUNITY.ADMIN",
]

# Remaining single-column contributor questions.
contrib_other_vars = [
    "EMPLOYMENT.STATUS",
    "PROFESSIONAL.SOFTWARE",
    "FUTURE.CONTRIBUTION.INTEREST",
    "FUTURE.CONTRIBUTION.LIKELIHOOD",
]

# All contributor-identity columns, in the order of the three groups above.
contrib_ident_vars = [*participation_type_vars, *contrib_type_vars, *contrib_other_vars]


# In[12]:


# Priorities when deciding whether to *use* an open source project.
user_pri_vars = [
    "OSS.USER.PRIORITIES.LICENSE",
    "OSS.USER.PRIORITIES.CODE.OF.CONDUCT",
    "OSS.USER.PRIORITIES.CONTRIBUTING.GUIDE",
    "OSS.USER.PRIORITIES.CLA",
    "OSS.USER.PRIORITIES.ACTIVE.DEVELOPMENT",
    "OSS.USER.PRIORITIES.RESPONSIVE.MAINTAINERS",
    "OSS.USER.PRIORITIES.WELCOMING.COMMUNITY",
    "OSS.USER.PRIORITIES.WIDESPREAD.USE",
]

# The same items, asked about *contributing* to a project.
contrib_pri_vars = [
    "OSS.CONTRIBUTOR.PRIORITIES.LICENSE",
    "OSS.CONTRIBUTOR.PRIORITIES.CODE.OF.CONDUCT",
    "OSS.CONTRIBUTOR.PRIORITIES.CONTRIBUTING.GUIDE",
    "OSS.CONTRIBUTOR.PRIORITIES.CLA",
    "OSS.CONTRIBUTOR.PRIORITIES.ACTIVE.DEVELOPMENT",
    "OSS.CONTRIBUTOR.PRIORITIES.RESPONSIVE.MAINTAINERS",
    "OSS.CONTRIBUTOR.PRIORITIES.WELCOMING.COMMUNITY",
    "OSS.CONTRIBUTOR.PRIORITIES.WIDESPREAD.USE",
]

# General attitude questions about open source software.
oss_values_vars = [
    "SEEK.OPEN.SOURCE",
    "OSS.UX",
    "OSS.SECURITY",
    "OSS.STABILITY",
    "INTERNAL.EFFICACY",
    "EXTERNAL.EFFICACY",
    "OSS.IDENTIFICATION",
]

# USER.VALUES.* items.
user_values_vars = [
    "USER.VALUES.STABILITY",
    "USER.VALUES.INNOVATION",
    "USER.VALUES.REPLICABILITY",
    "USER.VALUES.COMPATIBILITY",
    "USER.VALUES.SECURITY",
    "USER.VALUES.COST",
    "USER.VALUES.TRANSPARENCY",
    "USER.VALUES.USER.EXPERIENCE",
    "USER.VALUES.CUSTOMIZABILITY",
    "USER.VALUES.SUPPORT",
    "USER.VALUES.TRUSTED.PRODUCER",
]

# All priorities/values columns; note that oss_values_vars comes last.
values_pri_vars = [*user_pri_vars, *contrib_pri_vars, *user_values_vars, *oss_values_vars]


# In[13]:


# Transparency / privacy questions.
privacy_transp_vars = [
    "TRANSPARENCY.PRIVACY.BELIEFS",
    "INFO.AVAILABILITY",
    "INFO.JOB",
    "TRANSPARENCY.PRIVACY.PRACTICES.GENERAL",
    "TRANSPARENCY.PRIVACY.PRACTICES.OSS",
]


# In[14]:


# Receiving and providing help.
help_vars = [
    "RECEIVED.HELP",
    "FIND.HELPER",
    "HELPER.PRIOR.RELATIONSHIP",
    "RECEIVED.HELP.TYPE",
    "PROVIDED.HELP",
    "FIND.HELPEE",
    "HELPEE.PRIOR.RELATIONSHIP",
    "PROVIDED.HELP.TYPE",
]


# In[15]:


# Open source and paid work.
paid_work_vars = [
    "OSS.AS.JOB",
    "OSS.AT.WORK",
    "OSS.IP.POLICY",
    "EMPLOYER.POLICY.APPLICATIONS",
    "EMPLOYER.POLICY.DEPENDENCIES",
    "OSS.HIRING",
]


# In[16]:


# DISCOURAGING.BEHAVIOR.* items.
discouraging_vars = [
    "DISCOURAGING.BEHAVIOR.LACK.OF.RESPONSE",
    "DISCOURAGING.BEHAVIOR.REJECTION.WOUT.EXPLANATION",
    "DISCOURAGING.BEHAVIOR.DISMISSIVE.RESPONSE",
    "DISCOURAGING.BEHAVIOR.BAD.DOCS",
    "DISCOURAGING.BEHAVIOR.CONFLICT",
    "DISCOURAGING.BEHAVIOR.UNWELCOMING.LANGUAGE",
]


# In[17]:


# Demographic and survey-administration columns.
demographic_vars = [
    "IMMIGRATION",
    "MINORITY.HOMECOUNTRY",
    "MINORITY.CURRENT.COUNTRY",
    "GENDER",
    "TRANSGENDER.IDENTITY",
    "SEXUAL.ORIENTATION",
    "WRITTEN.ENGLISH",
    "AGE",
    "FORMAL.EDUCATION",
    "PARENTS.FORMAL.EDUCATION",
    "AGE.AT.FIRST.COMPUTER.INTERNET",
    "LOCATION.OF.FIRST.COMPUTER.INTERNET",
    "PARTICIPATION.TYPE.ANY.REPONSE",  # spelling kept exactly as written; presumably matches the CSV header - verify
    "POPULATION",
    "OFF.SITE.ID",
    "TRANSLATED",
]


# In[18]:


# List of the variable-group lists for the main survey file (a list of lists,
# not a flat list of column names).
survey_vars = [
    contrib_ident_vars,
    values_pri_vars,
    privacy_transp_vars,
    help_vars,
    paid_work_vars,
    discouraging_vars,
    demographic_vars,
]


# ## Negative incidents
# ### Load into pandas

# In[19]:


# Load the negative-incidents file.
neg_df = pd.read_csv("data_for_public_release/negative_incidents.csv")


# In[20]:


# Report the row count of the negative-incidents frame itself (this line
# previously printed len(survey_df) under the negative_incidents.csv label).
print("negative_incidents.csv length:", len(neg_df))


# ### Explore the negative dataset with some sample responses

# In[21]:


# First three rows, transposed so each variable is a row.
neg_df[0:3].transpose()


# ### Create lists of variables for bulk analysis

# In[22]:


# Negative behaviour the respondent witnessed.
neg_witness_vars = [
    "NEGATIVE.WITNESS.RUDENESS",
    "NEGATIVE.WITNESS.NAME.CALLING",
    "NEGATIVE.WITNESS.THREATS",
    "NEGATIVE.WITNESS.IMPERSONATION",
    "NEGATIVE.WITNESS.SUSTAINED.HARASSMENT",
    "NEGATIVE.WITNESS.CROSS.PLATFORM.HARASSMENT",
    "NEGATIVE.WITNESS.STALKING",
    "NEGATIVE.WITNESS.SEXUAL.ADVANCES",
    "NEGATIVE.WITNESS.STEREOTYPING",
    "NEGATIVE.WITNESS.DOXXING",
    "NEGATIVE.WITNESS.OTHER",
    "NEGATIVE.WITNESS.NONE.OF.THE.ABOVE",
    "NEGATIVE.WITNESS.ANY.RESPONSE",
]


# In[23]:


# Negative behaviour the respondent experienced.
neg_exp_vars = [
    "NEGATIVE.EXPERIENCE.RUDENESS",
    "NEGATIVE.EXPERIENCE.NAME.CALLING",
    "NEGATIVE.EXPERIENCE.THREATS",
    "NEGATIVE.EXPERIENCE.IMPERSONATION",
    "NEGATIVE.EXPERIENCE.SUSTAINED.HARASSMENT",
    "NEGATIVE.EXPERIENCE.CROSS.PLATFORM.HARASSMENT",
    "NEGATIVE.EXPERIENCE.STALKING",
    "NEGATIVE.EXPERIENCE.SEXUAL.ADVANCES",
    "NEGATIVE.EXPERIENCE.STEREOTYPING",
    "NEGATIVE.EXPERIENCE.DOXXING",
    "NEGATIVE.EXPERIENCE.OTHER",
    "NEGATIVE.EXPERIENCE.NONE.OF.THE.ABOVE",
    "NEGATIVE.EXPERIENCE.ANY.RESPONSE",
]


# In[24]:


# How the respondent reacted to the incident.
neg_resp_vars = [
    "NEGATIVE.RESPONSE.ASKED.USER.TO.STOP",
    "NEGATIVE.RESPONSE.SOLICITED.COMMUNITY.SUPPORT",
    "NEGATIVE.RESPONSE.BLOCKED.USER",
    "NEGATIVE.RESPONSE.REPORTED.TO.MAINTAINERS",
    "NEGATIVE.RESPONSE.REPORTED.TO.HOST.OR.ISP",
    "NEGATIVE.RESPONSE.CONSULTED.LEGAL.COUNSEL",
    "NEGATIVE.RESPONSE.CONTACTED.LAW.ENFORCEMENT",
    "NEGATIVE.RESPONSE.OTHER",
    "NEGATIVE.RESPONSE.IGNORED",
    "NEGATIVE.RESPONSE.ANY.RESPONSE",
]


# In[25]:


# Effectiveness columns for the reactions above; there is no IGNORED or
# ANY.RESPONSE counterpart in this group.
neg_effect_vars = [
    "RESPONSE.EFFECTIVENESS.ASKED.USER.TO.STOP",
    "RESPONSE.EFFECTIVENESS.SOLICITED.COMMUNITY.SUPPORT",
    "RESPONSE.EFFECTIVENESS.BLOCKED.USER",
    "RESPONSE.EFFECTIVENESS.REPORTED.TO.MAINTAINERS",
    "RESPONSE.EFFECTIVENESS.REPORTED.TO.HOST.OR.ISP",
    "RESPONSE.EFFECTIVENESS.CONSULTED.LEGAL.COUNSEL",
    "RESPONSE.EFFECTIVENESS.CONTACTED.LAW.ENFORCEMENT",
    "RESPONSE.EFFECTIVENESS.OTHER",
]


# In[26]:


# Consequences of the incident for the respondent.
neg_conseq_vars = [
    "NEGATIVE.CONSEQUENCES.STOPPED.CONTRIBUTING",
    "NEGATIVE.CONSEQUENCES.PSEUDONYM",
    "NEGATIVE.CONSEQUENCES.WORK.IN.PRIVATE",
    "NEGATIVE.CONSEQUENCES.CHANGE.USERNAME",
    "NEGATIVE.CONSEQUENCES.CHANGE.ONLINE.PRESENCE",
    "NEGATIVE.CONSEQUENCES.SUGGEST.COC",
    "NEGATIVE.CONSEQUENCES.PRIVATE.COMMUNITY.DISCUSSION",
    "NEGATIVE.CONSEQUENCES.PUBLIC.COMMUNITY.DISCUSSION",
    "NEGATIVE.CONSEQUENCES.OFFLINE.CHANGES",
    "NEGATIVE.CONSEQUENCES.OTHER",
    "NEGATIVE.CONSEQUENCES.NONE.OF.THE.ABOVE",
    "NEGATIVE.CONSEQUENCES.ANY.RESPONSE",
]


# In[27]:


# The *.ANY.RESPONSE column of each group above that has one.
neg_anyresp_vars = [
    "NEGATIVE.WITNESS.ANY.RESPONSE",
    "NEGATIVE.EXPERIENCE.ANY.RESPONSE",
    "NEGATIVE.RESPONSE.ANY.RESPONSE",
    "NEGATIVE.CONSEQUENCES.ANY.RESPONSE",
]


# # Analysis

# In[28]:


# Apply seaborn's theme with fonts scaled to 1.5x.
sns.set(font_scale=1.5)


# ## Contributor identity

# ### People participate in open source in different ways. Which of the following activities do you engage in?
# 
# Choose all that apply.

# In[29]:


# One row per participation item, one column per distinct recorded value.
# NOTE(review): the two column labels are assigned positionally, so this
# assumes exactly two distinct values that sort as "no" then "yes" - confirm.
participation_type_resp = survey_df[participation_type_vars].apply(pd.Series.value_counts).transpose()
participation_type_resp.columns = ["No", "Yes"]
participation_type_resp


# In[ ]:


# In[30]:


# Column means scaled by 100 and sorted ascending.
# NOTE(review): reads as "% who answered yes" only if the columns are coded 0/1 - confirm.
participation_type_prop = (survey_df[participation_type_vars].mean() * 100).sort_values()
pd.DataFrame(participation_type_prop, columns=["percent"])


# In[31]:


ax = participation_type_prop.plot(kind='barh')

# The tick labels are the raw column names: strip the shared
# "PARTICIPATION.TYPE." prefix and turn the remaining dots into spaces.
labels = [
    tick.get_text()[len("PARTICIPATION.TYPE."):].replace(".", " ")
    for tick in ax.get_yticklabels()
]

plt.xlim(0, 100)
ax.set_yticklabels(labels)

ax.set_xlabel("Percent of respondents")
t = plt.title("% of people who participate in the following activities:")


# ### Contribution type: How often do you engage in each of the following activities?

# In[32]:


# One row per activity, one column per answer option.
contrib_type_responses = survey_df[contrib_type_vars].apply(pd.Series.value_counts).transpose()

# Fix the column order (most to least frequent) for the table and the stacked
# bars, then sort activities by their "Frequently" count.
contrib_type_responses = contrib_type_responses[["Frequently", "Occasionally", "Rarely", "Never"]]
contrib_type_responses = contrib_type_responses.sort_values(by='Frequently')
contrib_type_responses


# In[33]:


sns.set(style="whitegrid", font_scale=1.75)
fig, ax = plt.subplots()
cmap = matplotlib.cm.Blues_r
contrib_type_responses.plot.barh(stacked=True, ax=ax, figsize=[12,6], cmap=cmap, edgecolor='black', linewidth=1)

# The tick labels are the raw column names: strip the shared
# "CONTRIBUTOR.TYPE." prefix and turn the remaining dots into spaces.
labels = [
    tick.get_text()[len("CONTRIBUTOR.TYPE."):].replace(".", " ")
    for tick in ax.get_yticklabels()
]

ax.set_yticklabels(labels)


plt.title("How often do you engage in each of the following activities?")

plt.xlabel("Number of responses")


# Legend sits below the axes, in a single row of four entries.
legend = plt.legend(fancybox=True, loc='upper center', bbox_to_anchor=(.5, -.13), ncol=4, shadow=True)
legend.get_frame().set_edgecolor('b')
legend.get_frame().set_facecolor('white')


# ### Employment status
# EMPLOYMENT.STATUS

# In[34]:


# Answer frequencies as a one-column table.
prop_df = survey_df['EMPLOYMENT.STATUS'].value_counts().to_frame("count")
prop_df


# In[35]:


# Relative frequencies, rounded to four decimals and scaled to percentages.
prop_df = (survey_df['EMPLOYMENT.STATUS'].value_counts(normalize=True).round(4) * 100).to_frame("percent")
prop_df


# In[36]:


# Horizontal bar chart of the raw counts.
ax = survey_df['EMPLOYMENT.STATUS'].value_counts().to_frame().plot.barh()
plt.suptitle("Employment status")
t = ax.set_xlabel("Count of responses")


# ### In your main job, how often do you write or otherwise directly contribute to producing software?
# PROFESSIONAL.SOFTWARE

# In[37]:


# Answer frequencies as a one-column table.
prop_df = survey_df['PROFESSIONAL.SOFTWARE'].value_counts().to_frame("count")
prop_df


# In[38]:


# Relative frequencies, rounded to four decimals and scaled to percentages.
prop_df = (survey_df['PROFESSIONAL.SOFTWARE'].value_counts(normalize=True).round(4) * 100).to_frame("percent")
prop_df


# In[39]:


# Horizontal bar chart of the raw counts.
ax = survey_df['PROFESSIONAL.SOFTWARE'].value_counts().to_frame().plot.barh()
plt.title("In your main job, how often do you write or\notherwise directly contribute to producing software?")
t = ax.set_xlabel("Count of responses")


# ### How interested are you in contributing to open source projects in the future?
# FUTURE.CONTRIBUTION.INTEREST

# In[40]:


# Answer frequencies as a one-column table.
prop_df = pd.DataFrame(survey_df['FUTURE.CONTRIBUTION.INTEREST'].value_counts())
prop_df.columns = ["count"]
prop_df


# In[41]:


# Relative frequencies, rounded to four decimals and scaled to percentages
# (rounding once is enough; the second .round(4) was a no-op).
prop_df = pd.DataFrame(survey_df['FUTURE.CONTRIBUTION.INTEREST'].value_counts(normalize=True).round(4) * 100)
prop_df.columns = ["percent"]
prop_df


# In[42]:


# Horizontal bar chart of the raw counts.
ax = pd.DataFrame(survey_df['FUTURE.CONTRIBUTION.INTEREST'].value_counts()).plot(kind='barh')
plt.title("How interested are you in contributing\nto open source projects in the future?")
t = ax.set_xlabel("Count of responses")


# ### How likely are you to contribute to open source projects in the future?
# 
# 

# In[43]:


# Answer frequencies as a one-column table.
prop_df = survey_df['FUTURE.CONTRIBUTION.LIKELIHOOD'].value_counts().to_frame("count")
prop_df


# In[44]:


# Relative frequencies, rounded to four decimals and scaled to percentages.
prop_df = (survey_df['FUTURE.CONTRIBUTION.LIKELIHOOD'].value_counts(normalize=True).round(4) * 100).to_frame("percent")
prop_df


# In[45]:


# Horizontal bar chart of the raw counts.
ax = survey_df['FUTURE.CONTRIBUTION.LIKELIHOOD'].value_counts().to_frame().plot.barh()
plt.title("How likely are you to contribute to\nopen source projects in the future?")
t = ax.set_xlabel("Count of responses")


# ## Priorities and values

# ### When thinking about whether to use open source software, how important are the following things?
# OSS.USER.PRIORITIES.*

# In[46]:


# Counts per answer option for every OSS.USER.PRIORITIES.* item: columns in
# scale order, rows sorted by the "Very important to have" count.
user_pri_responses = (
    survey_df[user_pri_vars]
    .apply(pd.Series.value_counts)
    .transpose()
    .loc[:, [
        "Very important to have",
        "Somewhat important to have",
        "Not important either way",
        "Somewhat important not to have",
        "Very important not to have",
        "Don't know what this is",
    ]]
    .sort_values(by="Very important to have")
)


# In[47]:


# Show the table with the "OSS.USER.PRIORITIES." prefix (20 characters)
# removed from the row labels; the stored frame keeps the full names.
idx = pd.Series([name[20:] for name in user_pri_responses.index])
user_pri_responses.set_index(idx)


# In[48]:


# Same table as relative frequencies per item, rounded to four decimals and
# then scaled to percentages.
user_pri_responses_prop = (
    survey_df[user_pri_vars]
    .apply(pd.Series.value_counts, normalize=True)
    .round(4)
    .transpose()
    .loc[:, [
        "Very important to have",
        "Somewhat important to have",
        "Not important either way",
        "Somewhat important not to have",
        "Very important not to have",
        "Don't know what this is",
    ]]
    .sort_values(by="Very important to have")
    * 100
)


# In[49]:


idx = pd.Series([name[20:] for name in user_pri_responses_prop.index])
user_pri_responses_prop.set_index(idx)


# In[50]:


sns.set(style="whitegrid", font_scale=1.75)
fig, ax = plt.subplots()
cmap = matplotlib.cm.coolwarm
# One colour per answer option, in column order.
colors = ["xkcd:darkblue", "xkcd:lightblue", "xkcd:beige", "xkcd:salmon", "xkcd:crimson", "xkcd:green"]
user_pri_responses.plot.barh(stacked=True, ax=ax, figsize=[12,8], color=colors)

# Strip the "OSS.USER.PRIORITIES." prefix from the tick labels and turn the
# remaining dots into spaces.
labels = [tick.get_text()[20:].replace(".", " ") for tick in ax.get_yticklabels()]

ax.set_yticklabels(labels)

plt.title("When thinking about whether to *use* open source software,\n how important are the following things?")

plt.xlabel("Number of responses")


# Legend sits below the axes, in two columns.
legend = plt.legend(fancybox=True, loc='upper center', bbox_to_anchor=(.5, -.1), ncol=2, shadow=True)
legend.get_frame().set_edgecolor('b')
legend.get_frame().set_facecolor('white')


# ### When thinking about whether to contribute to an open source project, how important are the following things?
# OSS.CONTRIBUTOR.PRIORITIES.*

# In[51]:


# Counts per answer option for every OSS.CONTRIBUTOR.PRIORITIES.* item.
contrib_pri_responses = survey_df[contrib_pri_vars].apply(pd.Series.value_counts).transpose()

# Put the columns in scale order, then sort items by "Very important to have".
contrib_pri_responses = contrib_pri_responses[["Very important to have",
                                             "Somewhat important to have",
                                             "Not important either way",
                                             "Somewhat important not to have",
                                             "Very important not to have",
                                             "Don't know what this is"]]

contrib_pri_responses = contrib_pri_responses.sort_values(by="Very important to have")


# In[52]:


# Show the table with the "OSS.CONTRIBUTOR.PRIORITIES." prefix removed from
# the row labels; the stored frame keeps the full names.
idx = pd.Series([name[len("OSS.CONTRIBUTOR.PRIORITIES."):] for name in contrib_pri_responses.index])
contrib_pri_responses.set_index(idx)


# In[ ]:


# In[53]:


# Same table as relative frequencies per item, rounded to four decimals and
# then scaled to percentages.
contrib_pri_responses_prop = survey_df[contrib_pri_vars].apply(pd.Series.value_counts, normalize=True).round(4).transpose()

contrib_pri_responses_prop = contrib_pri_responses_prop[["Very important to have",
                                             "Somewhat important to have",
                                             "Not important either way",
                                             "Somewhat important not to have",
                                             "Very important not to have",
                                             "Don't know what this is"]]
contrib_pri_responses_prop = contrib_pri_responses_prop.sort_values(by="Very important to have")
contrib_pri_responses_prop = contrib_pri_responses_prop * 100


# In[54]:


idx = pd.Series([name[len("OSS.CONTRIBUTOR.PRIORITIES."):] for name in contrib_pri_responses_prop.index])
contrib_pri_responses_prop.set_index(idx)


# In[55]:


sns.set(style="whitegrid", font_scale=1.75)
fig, ax = plt.subplots()
cmap = matplotlib.cm.coolwarm
# One colour per answer option, in column order.
colors = ["xkcd:darkblue", "xkcd:lightblue", "xkcd:beige", "xkcd:salmon", "xkcd:crimson", "xkcd:green"]
contrib_pri_responses.plot.barh(stacked=True, ax=ax, figsize=[12,8], color=colors)

# Strip the "OSS.CONTRIBUTOR.PRIORITIES." prefix from the tick labels and turn
# the remaining dots into spaces.
labels = [
    tick.get_text()[len("OSS.CONTRIBUTOR.PRIORITIES."):].replace(".", " ")
    for tick in ax.get_yticklabels()
]

ax.set_yticklabels(labels)

plt.title("When thinking about whether to *contribute* to an open source project,\nhow important are the following things?")

plt.xlabel("Number of responses")


# Legend sits below the axes, in two columns.
legend = plt.legend(fancybox=True, loc='upper center', bbox_to_anchor=(.5, -.1), ncol=2, shadow=True)
legend.get_frame().set_edgecolor('b')
legend.get_frame().set_facecolor('white')


# ### How often do you try to find open source options over other kinds of software?
# SEEK.OPEN.SOURCE

# In[56]:


# Answer frequencies as a one-column table.
count_df = survey_df['SEEK.OPEN.SOURCE'].value_counts().to_frame("count")
count_df


# In[57]:


# Relative frequencies, rounded to four decimals and scaled to percentages.
prop_df = (survey_df['SEEK.OPEN.SOURCE'].value_counts(normalize=True).round(4) * 100).to_frame("percent")
prop_df


# In[58]:


# Horizontal bar chart of the raw counts.
ax = survey_df['SEEK.OPEN.SOURCE'].value_counts().to_frame().plot.barh()
plt.title("How often do you try to find open\nsource options over other kinds of software?")
t = ax.set_xlabel("Count of responses")


# ### Open source software usability
# 
# OSS.UX: Do you believe that open source software is generally easier to use than closed source (proprietary) software, harder to use, or about the same?
# 

# In[59]:


# Answer frequencies as a one-column table.
count_df = survey_df['OSS.UX'].value_counts().to_frame("count")
count_df


# In[60]:


# Relative frequencies, rounded to four decimals and scaled to percentages.
prop_df = (survey_df['OSS.UX'].value_counts(normalize=True).round(4) * 100).to_frame("percent")
prop_df


# In[61]:


# Horizontal bar chart of the raw counts.
ax = survey_df['OSS.UX'].value_counts().to_frame().plot.barh()
plt.title("Do you believe that open source software is generally\neasier to use than closed source (proprietary)\nsoftware, harder to use, or about the same?")
t = ax.set_xlabel("Count of responses")


# ### Open source software security
# 
# OSS.SECURITY: Do you believe that open source software is generally more secure than closed source (proprietary) software, less secure, or about the same?

# In[62]:


# Answer frequencies as a one-column table.
count_df = survey_df['OSS.SECURITY'].value_counts().to_frame("count")
count_df


# In[63]:


# Relative frequencies, rounded to four decimals and scaled to percentages.
prop_df = (survey_df['OSS.SECURITY'].value_counts(normalize=True).round(4) * 100).to_frame("percent")
prop_df


# In[64]:


# Horizontal bar chart of the raw counts.
ax = survey_df['OSS.SECURITY'].value_counts().to_frame().plot.barh()
plt.title("Do you believe that open source software is\ngenerally more secure than closed source (proprietary)\nsoftware, less secure, or about the same?")
t = ax.set_xlabel("Count of responses")


# ### Open source software stability
# 
# OSS.STABILITY: Do you believe that open source software is generally more stable than closed source (proprietary) software, less stable, or about the same?

# In[65]:


# Answer frequencies as a one-column table.
count_df = pd.DataFrame(data=survey_df['OSS.STABILITY'].value_counts())
count_df.columns = ["count"]
count_df


# In[66]:


# Relative frequencies, rounded to four decimals and scaled to percentages.
prop_df = pd.DataFrame((survey_df['OSS.STABILITY'].value_counts(normalize=True).round(4)*100))
prop_df.columns=["percent"]
prop_df


# In[67]:


# Capture the axes of THIS plot; without the assignment the x-label below was
# applied to the axes left over from the previous figure.
ax = pd.DataFrame(survey_df['OSS.STABILITY'].value_counts()).plot(kind='barh')
plt.title("Do you believe that open source software is\ngenerally more stable than closed source\n(proprietary), less stable, or about the same?")
t = ax.set_xlabel("Count of responses")


# In[ ]:


# ### Identification with open source
# 
# How much do you agree or disagree with the following statements:
# 
# - EXTERNAL.EFFICACY: The open source community values contributions from people like me.
# - INTERNAL.EFFICACY: I have the skills and understanding necessary to make meaningful contributions to open source projects.
# - OSS.IDENTIFICATION: I consider myself to be a member of the open source (and/or the Free/Libre software) community.

# In[68]:


# The three agree/disagree statements analysed together in this section.
oss_id_vars = ["INTERNAL.EFFICACY", "EXTERNAL.EFFICACY", "OSS.IDENTIFICATION"]


# In[69]:


# Counts per answer option for each statement.
oss_id_responses = survey_df[oss_id_vars].apply(pd.Series.value_counts).transpose()

# Put the columns in scale order, then sort statements by "Strongly agree".
oss_id_responses = oss_id_responses[["Strongly agree",
                                     "Somewhat agree",
                                     "Neither agree nor disagree",
                                     "Somewhat disagree",
                                     "Strongly disagree"]]
oss_id_responses = oss_id_responses.sort_values(by="Strongly agree")
oss_id_responses


# In[70]:


# Relative frequencies per statement, rounded to four decimals and scaled to
# percentages (columns are left in pandas' default order here).
oss_id_responses_prop = survey_df[oss_id_vars].apply(pd.Series.value_counts, normalize=True).round(4) * 100
oss_id_responses_prop.transpose()


# In[71]:


sns.set(style="whitegrid", font_scale=1.75)
fig, ax = plt.subplots()
cmap = matplotlib.cm.coolwarm
# Not used by this plot (it is coloured through cmap); kept because cells
# outside this section may still read the name.
colors = ["xkcd:darkblue", "xkcd:lightblue", "xkcd:beige", "xkcd:salmon", "xkcd:crimson"]
oss_id_responses.plot.barh(stacked=True, ax=ax, figsize=[12,5], cmap=cmap, edgecolor='black', linewidth=1)

# Full statement text for each variable. The labels are looked up from the
# plotted row order rather than hard-coded positionally, so a different sort
# order in the data can no longer attach a statement to the wrong bar.
statement_labels = {
    "EXTERNAL.EFFICACY": "The open source community values\ncontributions from people like me.",
    "OSS.IDENTIFICATION": "I consider myself to be a member\nof the open source (and/or the\nFree/Libre software) community.",
    "INTERNAL.EFFICACY": "I have the skills and understanding\nnecessary to make meaningful\ncontributions to open source projects.",
}
ax.set_yticklabels([statement_labels[var] for var in oss_id_responses.index])


plt.title("How much do you agree or disagree with the following statements:")

plt.xlabel("Number of responses")


# Legend sits below the axes, in two columns.
legend = plt.legend(fancybox=True, loc='upper center', bbox_to_anchor=(.5, -.25), ncol=2, shadow=True)
legend.get_frame().set_edgecolor('b')
legend.get_frame().set_facecolor('white')


# ## Transparency vs privacy

# ### Attribution
# 
# TRANSPARENCY.PRIVACY.BELIEFS: Which of the following statements is closest to your beliefs about attribution in software development?
# 
# - Records of authorship should be required so that end users know who created the source code they are working with. 
# - People should be able to contribute code without attribution, if they wish to remain anonymous.

# In[72]:


# Number of respondents choosing each attribution statement.
counts_df = survey_df['TRANSPARENCY.PRIVACY.BELIEFS'].value_counts().to_frame(name="count")
counts_df


# In[73]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['TRANSPARENCY.PRIVACY.BELIEFS'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[74]:


ax = survey_df['TRANSPARENCY.PRIVACY.BELIEFS'].value_counts().to_frame().plot.barh(figsize=[10,6])
plt.title("Which of the following statements is closest to your\nbeliefs about attribution in software development?")
# NOTE(review): these wrapped labels are applied by position, so they assume
# the value_counts() order shown in the table above -- confirm if data changes.
ax.set_yticklabels([
    "People should be able to contribute\ncode without attribution, if\nthey wish to remain anonymous.",
    "Records of authorship should be\nrequired so that end users know\nwho created the source code they are working with.",
])
t = ax.set_xlabel("Count of responses")


# ### In general, how much information about you is publicly available online?
# INFO.AVAILABILITY

# In[75]:


# Number of respondents per INFO.AVAILABILITY answer.
count_df = survey_df['INFO.AVAILABILITY'].value_counts().to_frame(name="count")
count_df


# In[76]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['INFO.AVAILABILITY'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[77]:


# Horizontal bar chart of the answer counts.
ax = survey_df['INFO.AVAILABILITY'].value_counts().to_frame().plot.barh()
plt.title("In general, how much information about\nyou is publicly available online?")
t = ax.set_xlabel("Count of responses")


# ### Do you feel that you need to make information available about yourself online for professional reasons?
# INFO.JOB
# 

# In[78]:


# Number of respondents per INFO.JOB answer.
count_df = survey_df['INFO.JOB'].value_counts().to_frame(name="count")
count_df


# In[79]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['INFO.JOB'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[80]:


# Horizontal bar chart of the answer counts.
ax = survey_df['INFO.JOB'].value_counts().to_frame().plot.barh(figsize=[10,6])
plt.title("Do you feel that you need to make information available\nabout yourself online for professional reasons?")
t = ax.set_xlabel("Count of responses")


# ### General privacy practices
# 
# TRANSPARENCY.PRIVACY.PRACTICES.GENERAL
# 
# "Which of the following best describes your practices around publishing content online, such as posts on social media (e.g. Facebook, Instagram, Twitter, etc.), blogs, and other platforms (not including contributions to open source projects)?" (single choice)
# 

# In[81]:


# Number of respondents per general online-publishing practice.
counts_df = survey_df['TRANSPARENCY.PRIVACY.PRACTICES.GENERAL'].value_counts().to_frame(name="count")
counts_df


# In[82]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['TRANSPARENCY.PRIVACY.PRACTICES.GENERAL'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[83]:


# Copy of the counts whose index is replaced by line-wrapped labels for plotting.
# NOTE(review): the labels are assigned by position, so they assume the
# value_counts() order shown in the table above -- confirm if data changes.
plot_counts_df = survey_df['TRANSPARENCY.PRIVACY.PRACTICES.GENERAL'].value_counts().to_frame()
idx = [
    "I include my real name.",
    "I usually use a consistent pseudonym that\nis easily linked to my real name online.",
    "I don't publish this kind of content online.",
    "I usually use a consistent pseudonym that\nis not linked anywhere with my real name online.",
    "I take precautions to use different\npseudonymns on different platforms.",
]
plot_counts_df.index = idx


# In[84]:


ax = plot_counts_df.plot.barh(figsize=[12,6])
plt.title("Which of the following best describes your\npractices around publishing content online [...] \nnot including contributions to open source projects?")
t = ax.set_xlabel("Count of responses")


# ### OSS privacy practices
# 
# "Which of the following best describes your practices when making open source contributions?"

# In[85]:


# Number of respondents per open-source contribution practice.
counts_df = survey_df['TRANSPARENCY.PRIVACY.PRACTICES.OSS'].value_counts().to_frame(name="count")
counts_df


# In[86]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['TRANSPARENCY.PRIVACY.PRACTICES.OSS'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[87]:


# Copy of the counts whose index is replaced by line-wrapped labels for plotting.
# NOTE(review): the labels are assigned by position, so they assume the
# value_counts() order shown in the table above -- confirm if data changes.
plot_counts_df = survey_df['TRANSPARENCY.PRIVACY.PRACTICES.OSS'].value_counts().to_frame()
idx = [
    "I include my real name.",
    "I usually use a consistent pseudonym that\nis easily linked to my real name online.",
    "I usually use a consistent pseudonym that\nis not linked anywhere with my real name online.",
    "I take precautions to use different\npseudonymns on different platforms.",
]
plot_counts_df.index = idx


# In[88]:


ax = plot_counts_df.plot.barh(figsize=[12,6])
plt.title("Which of the following best describes your\npractices when making open source contributions?")
t = ax.set_xlabel("Count of responses")


# ## Mentorship / Help

# ### Have you ever received any kind of help from other people related to using or contributing to an open source project?
# RECEIVED.HELP

# In[89]:


# Number of respondents per RECEIVED.HELP answer.
counts_df = survey_df['RECEIVED.HELP'].value_counts().to_frame(name="count")
counts_df


# In[90]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['RECEIVED.HELP'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[91]:


# Horizontal bar chart of the answer counts.
ax = survey_df['RECEIVED.HELP'].value_counts().to_frame().plot.barh()
plt.title("Have you ever received any kind of help from other people\nrelated to using or contributing to an open source project?")
t = ax.set_xlabel("Count of responses")


# ### Thinking of the most recent case where someone helped you, how did you find someone to help you?

# In[92]:


# Number of respondents per FIND.HELPER answer.
counts_df = survey_df['FIND.HELPER'].value_counts().to_frame(name="count")
counts_df


# In[93]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['FIND.HELPER'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[94]:


ax = survey_df['FIND.HELPER'].value_counts().to_frame().plot.barh()
ax.set_xlabel("Count of responses")
plt.title("How did you find someone to help you?")
# NOTE(review): wrapped labels are applied by position and assume the
# value_counts() order shown in the table above -- confirm if data changes.
t = ax.set_yticklabels([
    "I asked for help in a public forum\n(e.g. in a GitHub Issue, project mailing list, etc.)\nand someone responded.",
    "I asked a specific person for help.",
    "Someone offered me unsolicited help.",
    "Other - Please describe",
])


# ### Which best describes your prior relationship with the person who helped you?
# HELPER.PRIOR.RELATIONSHIP

# In[95]:


# Number of respondents per HELPER.PRIOR.RELATIONSHIP answer.
counts_df = survey_df['HELPER.PRIOR.RELATIONSHIP'].value_counts().to_frame(name="count")
counts_df


# In[96]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['HELPER.PRIOR.RELATIONSHIP'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[97]:


# Horizontal bar chart of the answer counts.
ax = survey_df['HELPER.PRIOR.RELATIONSHIP'].value_counts().to_frame().plot.barh()
plt.title("Which best describes your prior\nrelationship with the person who helped you?")
t = ax.set_xlabel("Count of responses")


# ### What kind of problem did they help you with?
# RECEIVED.HELP.TYPE

# In[98]:


# Number of respondents per RECEIVED.HELP.TYPE answer.
counts_df = survey_df['RECEIVED.HELP.TYPE'].value_counts().to_frame(name="count")
counts_df


# In[99]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['RECEIVED.HELP.TYPE'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[100]:


ax = survey_df['RECEIVED.HELP.TYPE'].value_counts().to_frame().plot.barh()
plt.title("What kind of problem did they help you with?")
t = ax.set_xlabel("Count of responses")

# NOTE(review): wrapped labels are applied by position and assume the
# value_counts() order shown in the table above -- confirm if data changes.
t = ax.set_yticklabels([
    "Writing code or otherwise implementing ideas.",
    "Installing or using an application.",
    "Understanding community norms (e.g. how to submit\na contribution, how to communicate effectively).",
    "Other (please describe)",
    "Introductions to other people",
])


# ### Have you ever provided help for another person on an open source project?
# PROVIDED.HELP

# In[101]:


# Number of respondents per PROVIDED.HELP answer.
counts_df = survey_df['PROVIDED.HELP'].value_counts().to_frame(name="count")
counts_df


# In[102]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['PROVIDED.HELP'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[103]:


# Horizontal bar chart of the answer counts.
ax = survey_df['PROVIDED.HELP'].value_counts().to_frame().plot.barh()
plt.title("Have you ever provided help for another\nperson on an open source project?")
t = ax.set_xlabel("Count of responses")


# In[ ]:


# ### Thinking of the most recent case where you helped someone, how did you come to help this person?
# FIND.HELPEE

# In[104]:


# Number of respondents per FIND.HELPEE answer.
counts_df = survey_df['FIND.HELPEE'].value_counts().to_frame(name="count")
counts_df


# In[105]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['FIND.HELPEE'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[106]:


ax = survey_df['FIND.HELPEE'].value_counts().to_frame().plot.barh()
ax.set_xlabel("Count of responses")
t = plt.title("How did you come to help this person?")
# NOTE(review): wrapped labels are applied by position and assume the
# value_counts() order shown in the table above -- confirm if data changes.
t = ax.set_yticklabels([
    "They asked for help in a public forum\n(e.g. in a GitHub Issue, project mailing list, etc.)\n and I responded.",
    "They asked me directly for help.",
    "I reached out to them to offer unsolicited help.",
    "Other (please describe)",
])


# ### Which best describes your prior relationship with the person you helped?
# HELPEE.PRIOR.RELATIONSHIP

# In[107]:


# Number of respondents per HELPEE.PRIOR.RELATIONSHIP answer.
counts_df = survey_df['HELPEE.PRIOR.RELATIONSHIP'].value_counts().to_frame(name="count")
counts_df


# In[108]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['HELPEE.PRIOR.RELATIONSHIP'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[109]:


# Horizontal bar chart of the answer counts.
ax = survey_df['HELPEE.PRIOR.RELATIONSHIP'].value_counts().to_frame().plot.barh()
plt.title("Which best describes your prior\nrelationship with the person you helped?")
t = ax.set_xlabel("Count of responses")


# ### What kind of problem did you help them with?
# PROVIDED.HELP.TYPE

# In[110]:


# Number of respondents per PROVIDED.HELP.TYPE answer.
counts_df = survey_df['PROVIDED.HELP.TYPE'].value_counts().to_frame(name="count")
counts_df


# In[111]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['PROVIDED.HELP.TYPE'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[112]:


ax = survey_df['PROVIDED.HELP.TYPE'].value_counts().to_frame().plot.barh()
plt.title("What kind of problem did you help them with?")
t = ax.set_xlabel("Count of responses")
# NOTE(review): wrapped labels are applied by position and assume the
# value_counts() order shown in the table above -- confirm if data changes.
t = ax.set_yticklabels([
    "Writing code or otherwise implementing ideas.",
    "Installing or using an application.",
    "Other (please describe)",
    "Understanding community norms (e.g. how to\nsubmit a contribution, how to communicate effectively).",
    "Introductions to other people.",
])


# ## Open Source Software in Paid Work

# ### Do you contribute to open source as part of your professional work? 
# 
# OSS.AS.JOB: Do you contribute to open source as part of your professional work? In other words, are you paid for any of your time spent on open source contributions?
# 
# - Yes, indirectly- I contribute to open source in carrying out my work duties, but I am not required or expected to do so.	
# - No.	
# - Yes, directly- some or all of my work duties include contributing to open source projects.

# In[113]:


# Number of respondents per OSS.AS.JOB answer.
counts_df = pd.DataFrame(survey_df['OSS.AS.JOB'].value_counts())
counts_df.columns = ["count"]
counts_df


# In[114]:


# The same breakdown as percentages of non-missing answers.
prop_df = pd.DataFrame((survey_df['OSS.AS.JOB'].value_counts(normalize=True).round(4)*100))
prop_df.columns=["percent"]
prop_df


# In[115]:


oss_as_job_df = pd.DataFrame(survey_df['OSS.AS.JOB'].value_counts())
# Shorten each answer to the part before its explanatory "- ..." clause
# ("Yes, indirectly", "No", "Yes, directly"). Deriving the short label from
# the answer itself, instead of assigning a fixed list by position, keeps
# every bar correctly labelled whatever order value_counts() returns.
oss_as_job_df.index = [
    str(answer).split("-", 1)[0].strip().rstrip(".")
    for answer in oss_as_job_df.index
]
ax = oss_as_job_df.plot(kind='barh')
plt.title("Do you contribute to open source\nas part of your professional work?")
t = ax.set_xlabel("Count of responses")


# ### How often do you use open source software in your professional work?
# OSS.AT.WORK

# In[116]:


# Number of respondents per OSS.AT.WORK answer.
counts_df = survey_df['OSS.AT.WORK'].value_counts().to_frame(name="count")
counts_df


# In[117]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['OSS.AT.WORK'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[118]:


# Horizontal bar chart of the answer counts.
ax = survey_df['OSS.AT.WORK'].value_counts().to_frame().plot.barh()
plt.title("How often do you use open source\nsoftware in your professional work?")
t = ax.set_xlabel("Count of responses")


# ### How does your employer's intellectual property agreement/policy affect your free-time contributions to open source unrelated to your work?
# OSS.IP.POLICY

# In[119]:


# Number of respondents per OSS.IP.POLICY answer.
counts_df = survey_df['OSS.IP.POLICY'].value_counts().to_frame(name="count")
counts_df


# In[120]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['OSS.IP.POLICY'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[121]:


# Horizontal bar chart of the answer counts.
ax = survey_df['OSS.IP.POLICY'].value_counts().to_frame().plot.barh()
plt.title("How does your employer's intellectual property\nagreement/policy affect your free-time contributions\nto open source unrelated to your work?")
t = ax.set_xlabel("Count of responses")


# ### Which is closest to your employer’s policy on using open source software applications?

# In[122]:


# Number of respondents per EMPLOYER.POLICY.APPLICATIONS answer.
counts_df = survey_df['EMPLOYER.POLICY.APPLICATIONS'].value_counts().to_frame(name="count")
counts_df


# In[123]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['EMPLOYER.POLICY.APPLICATIONS'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[124]:


# Horizontal bar chart of the answer counts.
ax = survey_df['EMPLOYER.POLICY.APPLICATIONS'].value_counts().to_frame().plot.barh()
ax.set_xlabel("Count of responses")
t = plt.title("Which is closest to your employer’s policy\non using open source software applications?")


# ### How important do you think your involvement in open source was to getting your current job?
# OSS.HIRING

# In[125]:


# Number of respondents per OSS.HIRING answer.
counts_df = survey_df['OSS.HIRING'].value_counts().to_frame(name="count")
counts_df


# In[126]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['OSS.HIRING'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[127]:


# Horizontal bar chart of the answer counts.
ax = survey_df['OSS.HIRING'].value_counts().to_frame().plot.barh()
ax.set_xlabel("Count of responses")
t = plt.title("How important do you think your involvement\nin open source was to getting your current job?")


# ## Demographics

# ### Do you currently live in a country other than the one in which you were born?
# IMMIGRATION

# In[128]:


# Number of respondents per IMMIGRATION answer.
counts_df = survey_df['IMMIGRATION'].value_counts().to_frame(name="count")
counts_df


# In[129]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['IMMIGRATION'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[130]:


# Horizontal bar chart of the answer counts.
ax = survey_df['IMMIGRATION'].value_counts().to_frame().plot.barh()
ax.set_xlabel("Count of responses")
t = plt.title("Do you currently live in a country other\nthan the one in which you were born?")


# ### Thinking of where you were born, are you a member of an ethnicity or nationality that is a considered a minority in that country?
# MINORITY.HOMECOUNTRY

# In[131]:


# Number of respondents per MINORITY.HOMECOUNTRY answer.
counts_df = survey_df['MINORITY.HOMECOUNTRY'].value_counts().to_frame(name="count")
counts_df


# In[132]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['MINORITY.HOMECOUNTRY'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[133]:


# Horizontal bar chart of the answer counts.
ax = survey_df['MINORITY.HOMECOUNTRY'].value_counts().to_frame().plot.barh()
ax.set_xlabel("Count of responses")
t = plt.title("Thinking of where you were born, are\nyou a member of an ethnicity or nationality that\nis a considered a minority in that country?")


# In[ ]:


# ### Thinking of where you currently live, are you a member of an ethnicity or nationality that is a considered a minority in that country?
# 
# MINORITY.CURRENT.COUNTRY

# In[134]:


# Number of respondents per MINORITY.CURRENT.COUNTRY answer.
counts_df = survey_df['MINORITY.CURRENT.COUNTRY'].value_counts().to_frame(name="count")
counts_df


# In[135]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['MINORITY.CURRENT.COUNTRY'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[136]:


# Horizontal bar chart of the answer counts.
ax = survey_df['MINORITY.CURRENT.COUNTRY'].value_counts().to_frame().plot.barh()
ax.set_xlabel("Count of responses")
t = plt.title("Thinking of where you currently live, are you\na member of an ethnicity or nationality that is a\nconsidered a minority in that country?")


# ### What is your gender?
# GENDER

# In[137]:


# Number of respondents per GENDER answer.
counts_df = survey_df['GENDER'].value_counts().to_frame(name="count")
counts_df


# In[138]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['GENDER'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[139]:


# Horizontal bar chart of the answer counts.
ax = survey_df['GENDER'].value_counts().to_frame().plot.barh()
ax.set_xlabel("Count of responses")
t = plt.title("What is your gender?")


# ### Do you identify as transgender?
# TRANSGENDER.IDENTITY
# 

# In[140]:


# Number of respondents per TRANSGENDER.IDENTITY answer.
counts_df = survey_df['TRANSGENDER.IDENTITY'].value_counts().to_frame(name="count")
counts_df


# In[141]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['TRANSGENDER.IDENTITY'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[142]:


# Horizontal bar chart of the answer counts.
ax = survey_df['TRANSGENDER.IDENTITY'].value_counts().to_frame().plot.barh()
ax.set_xlabel("Count of responses")
t = plt.title("Do you identify as transgender?")


# In[ ]:


# ### Do you identify as gay, lesbian, or bisexual, asexual, or any other minority sexual orientation?
# SEXUAL.ORIENTATION

# In[143]:


# Number of respondents per SEXUAL.ORIENTATION answer.
counts_df = survey_df['SEXUAL.ORIENTATION'].value_counts().to_frame(name="count")
counts_df


# In[144]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['SEXUAL.ORIENTATION'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[145]:


# Horizontal bar chart of the answer counts.
ax = survey_df['SEXUAL.ORIENTATION'].value_counts().to_frame().plot.barh()
ax.set_xlabel("Count of responses")
t = plt.title("Do you identify as gay, lesbian, or bisexual,\nasexual, or any other minority sexual orientation?")


# ### How well can you read and write in English?
# WRITTEN.ENGLISH

# In[146]:


# Number of respondents per WRITTEN.ENGLISH answer.
counts_df = survey_df['WRITTEN.ENGLISH'].value_counts().to_frame(name="count")
counts_df


# In[147]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['WRITTEN.ENGLISH'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[148]:


# Horizontal bar chart of the answer counts.
ax = survey_df['WRITTEN.ENGLISH'].value_counts().to_frame().plot.barh()
ax.set_xlabel("Count of responses")
t = plt.title("How well can you read and write in English?")


# In[ ]:


# ### What is your age?
# AGE

# In[149]:


# Number of respondents per AGE answer, ordered by answer label rather than
# by frequency.
counts_df = survey_df['AGE'].value_counts().sort_index().to_frame(name="count")
counts_df


# In[150]:


# The same breakdown as percentages of non-missing answers, in label order.
prop_df = survey_df['AGE'].value_counts(normalize=True).round(4).mul(100).sort_index().to_frame(name="percent")
prop_df


# In[151]:


# Horizontal bar chart of the answer counts, in label order.
ax = survey_df['AGE'].value_counts().sort_index().to_frame().plot.barh()
ax.set_xlabel("Count of responses")
t = plt.title("What is your age?")


# In[ ]:


# ### What is highest level of formal education that you have completed?
# FORMAL.EDUCATION

# In[152]:


# Number of respondents per FORMAL.EDUCATION answer.
counts_df = pd.DataFrame(survey_df['FORMAL.EDUCATION'].value_counts())
counts_df.columns = ["count"]
counts_df


# In[153]:


# The same breakdown as percentages of non-missing answers.
prop_df = pd.DataFrame((survey_df['FORMAL.EDUCATION'].value_counts(normalize=True).round(4)*100))
prop_df.columns=["percent"]
prop_df


# In[154]:


# Education levels from lowest to highest, used to order the bars.
order = ["Less than secondary (high) school",
         "Secondary (high) school graduate or equivalent",
         "Vocational/trade program or apprenticeship",
         "Some college, no degree",
         "Bachelor's degree",
         "Master's degree",
         "Doctorate (Ph.D.) or other advanced degree (e.g. M.D., J.D.)"]

# reindex() rather than [order]: a level that nobody selected is shown as a
# zero-length bar instead of raising KeyError.
edu_counts_df = survey_df['FORMAL.EDUCATION'].value_counts().reindex(order, fill_value=0)

ax = edu_counts_df.plot(kind='barh')
ax.set_xlabel("Count of responses")
t = plt.title("What is highest level of formal education that you have completed?")


# In[ ]:


# ### What is the highest level of formal education that either of your parents completed?
# PARENTS.FORMAL.EDUCATION

# In[155]:


# Number of respondents per PARENTS.FORMAL.EDUCATION answer.
counts_df = pd.DataFrame(survey_df['PARENTS.FORMAL.EDUCATION'].value_counts())
counts_df.columns = ["count"]
counts_df


# In[156]:


# The same breakdown as percentages of non-missing answers.
prop_df = pd.DataFrame((survey_df['PARENTS.FORMAL.EDUCATION'].value_counts(normalize=True).round(4)*100))
prop_df.columns=["percent"]
prop_df


# In[157]:


# Education levels from lowest to highest, used to order the bars.
order = ["Less than secondary (high) school",
         "Secondary (high) school graduate or equivalent",
         "Vocational/trade program or apprenticeship",
         "Some college, no degree",
         "Bachelor's degree",
         "Master's degree",
         "Doctorate (Ph.D.) or other advanced degree (e.g. M.D., J.D.)"]

# reindex() rather than [order]: a level that nobody selected is shown as a
# zero-length bar instead of raising KeyError.
edu_counts_df = survey_df['PARENTS.FORMAL.EDUCATION'].value_counts().reindex(order, fill_value=0)

ax = edu_counts_df.plot(kind='barh')
ax.set_xlabel("Count of responses")
t = plt.title("What is highest level of formal education that either of your parents completed?")


# In[ ]:


# In[ ]:


# ### How old were you when you first had regular access to a computer with an internet connection?
# AGE.AT.FIRST.COMPUTER.INTERNET

# In[158]:


# Number of respondents per AGE.AT.FIRST.COMPUTER.INTERNET answer.
counts_df = survey_df['AGE.AT.FIRST.COMPUTER.INTERNET'].value_counts().to_frame(name="count")
counts_df


# In[159]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['AGE.AT.FIRST.COMPUTER.INTERNET'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[160]:


# Horizontal bar chart of the answer counts.
ax = survey_df['AGE.AT.FIRST.COMPUTER.INTERNET'].value_counts().to_frame().plot.barh()
ax.set_xlabel("Count of responses")
t = plt.title("How old were you when you first had regular\naccess to a computer with an internet connection?")


# In[ ]:


# ### Where did you first have regular access to a computer with internet connection?
# LOCATION.OF.FIRST.COMPUTER.INTERNET

# In[161]:


# Number of respondents per LOCATION.OF.FIRST.COMPUTER.INTERNET answer.
counts_df = survey_df['LOCATION.OF.FIRST.COMPUTER.INTERNET'].value_counts().to_frame(name="count")
counts_df


# In[162]:


# The same breakdown as percentages of non-missing answers.
prop_df = survey_df['LOCATION.OF.FIRST.COMPUTER.INTERNET'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[163]:


# Horizontal bar chart of the answer counts.
ax = survey_df['LOCATION.OF.FIRST.COMPUTER.INTERNET'].value_counts().to_frame().plot.barh(figsize=[8.5,6])
ax.set_xlabel("Count of responses")
t = plt.title("Where did you first have regular access to a\ncomputer with internet connection?")


# ### Where was the respondent surveyed from?
# POPULATION

# In[164]:


# Number of respondents per POPULATION value.
counts_df = survey_df['POPULATION'].value_counts().to_frame(name="count")
counts_df


# In[165]:


# The same breakdown as percentages of non-missing values.
prop_df = survey_df['POPULATION'].value_counts(normalize=True).round(4).mul(100).to_frame(name="percent")
prop_df


# In[166]:


# Horizontal bar chart of the counts.
ax = survey_df['POPULATION'].value_counts().to_frame().plot.barh(figsize=[8.5,6])
ax.set_xlabel("Count of responses")
t = plt.title("Where was the respondent surveyed from?")


# ## Harassment / Inclusiveness of OSS

# ### Have you ever observed any of the following in the context of an open source project?
# DISCOURAGING.BEHAVIOR.*

# In[167]:


# One row per DISCOURAGING.BEHAVIOR.* variable with its "Yes" and "No" counts.
discouraging_responses = survey_df[discouraging_vars].apply(pd.Series.value_counts).T[["Yes", "No"]]
discouraging_responses


# In[168]:


# "Yes" as a percentage of the Yes + No answers for each variable, ascending.
discouraging_percent = (
    (discouraging_responses["Yes"] / discouraging_responses.sum(axis=1) * 100)
    .to_frame(name="percent_yes")
    .sort_values(by="percent_yes")
)
discouraging_percent.round(2)


# In[169]:


# Stacked Yes/No horizontal bars for the discouraging-behavior panel, with
# rows ordered by their "No" count.
sns.set(style="whitegrid", font_scale=1.75)
fig, ax = plt.subplots()
cmap = matplotlib.cm.Blues_r
discouraging_responses.sort_values(by="No").plot(kind="barh", stacked=True, ax=ax, figsize=[12,6], cmap=cmap, edgecolor='black', linewidth=1)

# Drop the first 22 characters of each tick label (the length of the
# "DISCOURAGING.BEHAVIOR." prefix) and turn the remaining dots into spaces.
labels = [tick.get_text()[22:].replace(".", " ") for tick in ax.get_yticklabels()]
ax.set_yticklabels(labels)

ax.set_title("Have you ever observed any of the following in the context of an open source project?")
ax.set_xlabel("Number of responses")

# Legend below the plot on a white box with a blue edge.
legend = ax.legend(fancybox=True, loc='upper center', bbox_to_anchor=(.5, -.13), ncol=4, shadow=True)
legend.get_frame().set_facecolor('white')
legend.get_frame().set_edgecolor('b')


# In[ ]:


# ### Have you ever witnessed any of the following behaviors directed at another person in the context of an open source project? (not including something directed at you)
# 
# NEGATIVE.WITNESS.*

# In[170]:


# One row per NEGATIVE.WITNESS.* variable: count of 1s ("Yes") and 0s ("Blank").
neg_witness_responses = neg_df[neg_witness_vars].apply(pd.Series.value_counts).transpose()[[1,0]]
neg_witness_responses.columns = ["Yes", "Blank"]
neg_witness_responses


# Only 3,664 respondents clicked any boxes in this question, meaning 2,365 did not click the "none of the above" or an option (or even get to this question). We have to adjust the no responses accordingly.

# In[171]:


# Adjust a copy so the raw table above stays intact and this cell can be
# re-run; plain assignment would alias the same DataFrame.
neg_witness_responses_adj = neg_witness_responses.copy()
# Remove the 2,365 respondents who never answered from the blank count.
neg_witness_responses_adj["Blank"] = neg_witness_responses_adj["Blank"] - 2365
neg_witness_responses_adj_df = pd.DataFrame(neg_witness_responses_adj["Yes"] / (neg_witness_responses_adj["Yes"] + neg_witness_responses_adj["Blank"]) * 100, columns=["percent_yes"])


# In[172]:


# After the adjustment the remaining blanks are treated as "No" answers.
neg_witness_responses_adj.columns = ["Yes", "No"]
# The last row is left out of the display (presumably the "none of the
# above" option -- confirm against neg_witness_vars).
neg_witness_responses_adj[:-1]


# In[173]:


neg_witness_responses_adj_df.sort_values(by="percent_yes").round(2)


# In[174]:


# Stacked Yes/No horizontal bars for the witnessed-behavior panel: every row
# except the last, ordered by the "No" count.
sns.set(style="whitegrid", font_scale=1.75)
fig, ax = plt.subplots()
cmap = matplotlib.cm.Blues_r
neg_witness_responses_adj.iloc[:-1].sort_values(by='No').plot(kind="barh", stacked=True, ax=ax, figsize=[12,6], cmap=cmap, edgecolor='black', linewidth=1)

# Drop the first 17 characters of each tick label (the length of the
# "NEGATIVE.WITNESS." prefix) and turn the remaining dots into spaces.
labels = [tick.get_text()[17:].replace(".", " ") for tick in ax.get_yticklabels()]
ax.set_yticklabels(labels)

ax.set_title("Have you ever witnessed any of the following behaviors\ndirected at another person in the context of an open source\nproject? (not including something directed at you)")
ax.set_xlabel("Number of responses")

# Legend below the plot on a white box with a blue edge.
legend = ax.legend(fancybox=True, loc='upper center', bbox_to_anchor=(.5, -.13), ncol=4, shadow=True)
legend.get_frame().set_facecolor('white')
legend.get_frame().set_edgecolor('b')


# ### Have you ever experienced any of the following behaviors directed at you in the context of an open source project?

# In[175]:


# One row per NEGATIVE.EXPERIENCE.* variable: count of 1s ("Yes") and 0s ("Blank").
neg_exp_responses = neg_df[neg_exp_vars].apply(pd.Series.value_counts).transpose()[[1,0]]
neg_exp_responses.columns = ["Yes", "Blank"]
neg_exp_responses


# Only 3,638 respondents clicked any boxes in this question, meaning 2,391 did not click the "none of the above" (or even get to this question). We have to adjust the no responses accordingly.

# In[176]:


# Adjust a copy so the raw table above stays intact and this cell can be
# re-run; plain assignment would alias the same DataFrame.
neg_exp_responses_adj = neg_exp_responses.copy()
# Remove the 2,391 respondents who never answered from the blank count.
neg_exp_responses_adj["Blank"] = neg_exp_responses_adj["Blank"] - 2391
neg_exp_responses_adj_df = pd.DataFrame(neg_exp_responses_adj["Yes"] / (neg_exp_responses_adj["Yes"] + neg_exp_responses_adj["Blank"]) * 100, columns=["percent_yes"])


# In[177]:


# After the adjustment the remaining blanks are treated as "No" answers.
neg_exp_responses_adj.columns = ["Yes", "No"]
# The last row is left out of the display (presumably the "none of the
# above" option -- confirm against neg_exp_vars).
neg_exp_responses_adj[:-1]


# In[178]:


neg_exp_responses_adj_df.sort_values(by="percent_yes").round(2)


# In[179]:


# Stacked Yes/No horizontal bars for the experienced-behavior panel: every
# row except the last, ordered by the "No" count.
sns.set(style="whitegrid", font_scale=1.75)
fig, ax = plt.subplots()
cmap = matplotlib.cm.Blues_r
neg_exp_responses_adj.iloc[:-1].sort_values(by='No').plot(kind="barh", stacked=True, ax=ax, figsize=[12,6], cmap=cmap, edgecolor='black', linewidth=1)

# Drop the first 20 characters of each tick label (the length of the
# "NEGATIVE.EXPERIENCE." prefix) and turn the remaining dots into spaces.
labels = [tick.get_text()[20:].replace(".", " ") for tick in ax.get_yticklabels()]
ax.set_yticklabels(labels)

ax.set_title("Have you ever experienced any of the following behaviors\ndirected at you in the context of an open source project?")
ax.set_xlabel("Number of responses")

# Legend below the plot on a white box with a blue edge.
legend = ax.legend(fancybox=True, loc='upper center', bbox_to_anchor=(.5, -.13), ncol=4, shadow=True)
legend.get_frame().set_facecolor('white')
legend.get_frame().set_edgecolor('b')


# ### Thinking of the last time you experienced harassment, how did you respond? 
# NEGATIVE.RESPONSE.*

# In[180]:


# One row per NEGATIVE.RESPONSE.* variable: count of 1s ("Yes") and 0s ("Blank").
neg_resp_responses = neg_df[neg_resp_vars].apply(pd.Series.value_counts).transpose()[[1,0]]
neg_resp_responses.columns = ["Yes", "Blank"]
neg_resp_responses


# Only 719 respondents clicked any boxes in this question, meaning 5,310 did not click on "I did not react / ignored the incident" or any response (or even get to this question). We have to adjust the no responses accordingly.

# In[181]:


# Adjust a copy so the raw table above stays intact and this cell can be
# re-run; plain assignment would alias the same DataFrame.
neg_resp_responses_adj = neg_resp_responses.copy()
# Remove the 5,310 respondents who never answered from the blank count.
neg_resp_responses_adj["Blank"] = neg_resp_responses_adj["Blank"] - 5310
neg_resp_responses_adj_df = pd.DataFrame(neg_resp_responses_adj["Yes"] / (neg_resp_responses_adj["Yes"] + neg_resp_responses_adj["Blank"]) * 100, columns=["percent_yes"])


# In[182]:


# After the adjustment the remaining blanks are treated as "No" answers.
neg_resp_responses_adj.columns = ["Yes", "No"]
# The last row is left out of the display (presumably the "did not react /
# ignored" option -- confirm against neg_resp_vars).
neg_resp_responses_adj[:-1]


# In[183]:


neg_resp_responses_adj_df.sort_values(by="percent_yes").round(2)


# In[184]:


# Stacked Yes/No horizontal bars for the harassment-response panel: every
# row except the last, ordered by the "No" count.
sns.set(style="whitegrid", font_scale=1.75)
fig, ax = plt.subplots()
cmap = matplotlib.cm.Blues_r
neg_resp_responses_adj.iloc[:-1].sort_values(by='No').plot(kind="barh", stacked=True, ax=ax, figsize=[12,6], cmap=cmap, edgecolor='black', linewidth=1)

# Drop the first 18 characters of each tick label (the length of the
# "NEGATIVE.RESPONSE." prefix) and turn the remaining dots into spaces.
labels = [tick.get_text()[18:].replace(".", " ") for tick in ax.get_yticklabels()]
ax.set_yticklabels(labels)

ax.set_title("Thinking of the last time you experienced\nharassment, how did you respond?")
ax.set_xlabel("Number of responses")

# Legend below the plot on a white box with a blue edge.
legend = ax.legend(fancybox=True, loc='upper center', bbox_to_anchor=(.5, -.13), ncol=4, shadow=True)
legend.get_frame().set_facecolor('white')
legend.get_frame().set_edgecolor('b')


# In[ ]:


# ### How effective were the following responses?
# RESPONSE.EFFECTIVENESS.*

# In[185]:


# One row per RESPONSE.EFFECTIVENESS.* variable and one column per rating.
# Missing combinations become 0, rows are sorted by their "Mostly effective"
# count, and columns are put in order from least to most effective.
neg_effect_responses = (
    neg_df[neg_effect_vars]
    .apply(pd.Series.value_counts)
    .T
    .replace(np.nan, 0)
    .sort_values(by="Mostly effective")
    [["Not at all effective", "A little effective", "Somewhat effective", "Mostly effective", "Completely effective"]]
)


# In[186]:


# Drop the first 23 characters of each row label (the length of the
# "RESPONSE.EFFECTIVENESS." prefix) and turn the remaining dots into spaces.
idx = [name[23:].replace(".", " ") for name in neg_effect_responses.index]
neg_effect_responses.index = idx
neg_effect_responses.astype(int)


# In[187]:


# Stacked horizontal bars of effectiveness ratings (counts) per response type.
sns.set(style="whitegrid", font_scale=1.75)
fig, ax = plt.subplots()
cmap = matplotlib.cm.Blues
neg_effect_responses.plot(kind="barh", stacked=True, ax=ax, figsize=[12,6], cmap=cmap, edgecolor='black', linewidth=1)

ax.set_title("How effective were the following responses?\n(counting number of responses)")
ax.set_xlabel("Number of responses")

# Legend below the plot on a white box with a blue edge.
legend = ax.legend(fancybox=True, loc='upper center', bbox_to_anchor=(.5, -.13), ncol=4, shadow=True)
legend.get_frame().set_facecolor('white')
legend.get_frame().set_edgecolor('b')


# In[188]:


# Same tabulation as the counts table, but as shares: normalize=True makes
# value_counts return fractions per column, rounded to four decimals.
rating_order = [
    "Not at all effective",
    "A little effective",
    "Somewhat effective",
    "Mostly effective",
    "Completely effective",
]
neg_effect_responses_prop = (
    neg_df[neg_effect_vars]
    .apply(pd.Series.value_counts, normalize=True)
    .round(4)
    .T
    .fillna(0)  # a rating that never occurs for a variable has share zero
    .sort_values(by="Completely effective")
)
# Order the rating columns from least to most effective and convert the
# fractions to percentages.
neg_effect_responses_prop = neg_effect_responses_prop[rating_order] * 100


# In[189]:


# Build readable row labels: drop the first 23 characters of each variable
# name (the length of "RESPONSE.EFFECTIVENESS." -- presumably the shared
# prefix; confirm) and replace "." with spaces.
# The slice is unconditional, so running this cell again trims the labels further.
idx = [name[23:].replace(".", " ") for name in neg_effect_responses_prop.index]
neg_effect_responses_prop.index = idx

# Display the percentage table.
neg_effect_responses_prop


# In[190]:


# Stacked horizontal bar chart of the rating percentages per response type.
sns.set(style="whitegrid", font_scale=1.75)
fig, ax = plt.subplots()
cmap = matplotlib.cm.Blues
neg_effect_responses_prop.plot.barh(
    stacked=True,
    ax=ax,
    figsize=[12,6],
    cmap=cmap,
    edgecolor='black',
    linewidth=1,
)

# `ax` is the only axes on the freshly created figure, so these Axes methods
# address the same axes that the pyplot state machine would.
ax.set_title("How effective were the following responses?\n(proportion of responses)")
ax.set_xlabel("Proportion of responses")

# Legend sits below the plot area, centered, with a white box and blue border.
legend = ax.legend(
    fancybox=True,
    loc='upper center',
    bbox_to_anchor=(.5, -.13),
    ncol=4,
    shadow=True,
)
legend_frame = legend.get_frame()
legend_frame.set_edgecolor('b')
legend_frame.set_facecolor('white')


# ### As a result of experiencing or witnessing harassment, which, if any, of the following have you done?
# NEGATIVE.CONSEQUENCES.*

# In[ ]:


# In[191]:


# Tabulate the values of every column in neg_conseq_vars, one row per survey
# variable, keeping the counts for value 1 first and value 0 second.
neg_conseq_responses = (
    neg_df[neg_conseq_vars]
    .apply(pd.Series.value_counts)
    .T[[1, 0]]
)
# Value 1 is labelled "Yes"; value 0 is labelled "Blank".
neg_conseq_responses.columns = ["Yes", "Blank"]

# Display the raw counts table.
neg_conseq_responses


# Only 1,953 respondents checked any box in this question, meaning 4,076 selected neither a response nor the "none of the above" option (or never reached the question). We have to subtract those 4,076 non-respondents from the blank ("No") counts accordingly.

# In[192]:


# Work on an independent copy. A plain assignment would only alias
# neg_conseq_responses, so the subtraction below would also change the raw
# counts table and would be applied again every time this cell is re-run.
neg_conseq_responses_adj = neg_conseq_responses.copy()

# Remove the 4,076 respondents who did not answer this question at all from
# the "Blank" counts.
neg_conseq_responses_adj["Blank"] = neg_conseq_responses_adj["Blank"] - 4076

# Percentage of "Yes" among the remaining (Yes + adjusted Blank) responses,
# one row per survey variable.
neg_conseq_responses_adj_df = pd.DataFrame(
    neg_conseq_responses_adj["Yes"]
    / (neg_conseq_responses_adj["Yes"] + neg_conseq_responses_adj["Blank"])
    * 100,
    columns=["percent_yes"],
)


# In[193]:


# Relabel the two columns as "Yes" and "No" for display and plotting.
neg_conseq_responses_adj.columns = ["Yes", "No"]

# Display the table without its last row.
neg_conseq_responses_adj.iloc[:-1]


# In[194]:


# Display the "percent_yes" table in ascending order, rounded to two decimals.
(
    neg_conseq_responses_adj_df
    .sort_values("percent_yes")
    .round(2)
)


# In[195]:


# Stacked horizontal bar chart of the Yes/No counts, using the reversed
# Blues colormap.
sns.set(style="whitegrid", font_scale=1.75)
fig, ax = plt.subplots()
cmap = matplotlib.cm.Blues_r
# The [:-1] slice leaves the last row of the table out of the chart; bars are
# ordered by the "No" column.
neg_conseq_responses_adj[:-1].sort_values(by='No').plot.barh(
    stacked=True,
    ax=ax,
    figsize=[12,6],
    cmap=cmap,
    edgecolor='black',
    linewidth=1,
)

# Shorten every tick label: drop its first 22 characters (the length of
# "NEGATIVE.CONSEQUENCES." -- presumably the shared prefix; confirm) and
# replace "." with spaces.
labels = [tick.get_text()[22:].replace(".", " ") for tick in ax.get_yticklabels()]
ax.set_yticklabels(labels)

# `ax` is the only axes on the freshly created figure, so these Axes methods
# address the same axes that the pyplot state machine would.
ax.set_title("As a result of experiencing or witnessing\nharassment, which, if any, of the following have you done?")
ax.set_xlabel("Number of responses")

# Legend sits below the plot area, centered, with a white box and blue border.
legend = ax.legend(
    fancybox=True,
    loc='upper center',
    bbox_to_anchor=(.5, -.13),
    ncol=4,
    shadow=True,
)
legend_frame = legend.get_frame()
legend_frame.set_edgecolor('b')
legend_frame.set_facecolor('white')


# In[ ]: