#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.offline import iplot, init_notebook_mode
# Ask the notebook front end to render matplotlib figures inline.
# NOTE(review): `get_ipython` is not defined or imported in this file; it is
# presumably injected by IPython when the notebook runs -- running this file
# with a plain interpreter would fail here. TODO confirm.
get_ipython().run_line_magic('matplotlib', 'inline')


# In[2]:


# Plot defaults shared by every figure below: seaborn's dark grid theme and a
# wide default figure size (individual plots override it where needed).
sns.set_style('darkgrid')
plt.rcParams['figure.figsize'] = (20, 6)


# ## 问题
# 
# - 使用 Python 2 和 Python 3 的开发者的比例？
# - 做数据分析和机器学习的人中分别有多少人使用的是 Python 3？
# - 常用框架中使用 Python 2 和 Python 3 的比例？
# - 做数据分析和机器学习的人常用的框架？
# - 公司规模大小和是否使用 Python 3 的关系？
# - 开发者年龄和是否使用 Python 3 的关系？
# - 使用 Python 3 和 Python 2 的开发者的国别分布？
# - 开发者中使用 IDE 的情况？

# ## 读取数据集

# In[3]:


# Read the raw survey export and normalise every column header to lower case
# so that later keyword look-ups do not depend on the original capitalisation.
survey_df = pd.read_csv('pythondevsurvey2017_raw_data.csv').rename(columns=str.lower)
survey_df.head()


# In[4]:


# (rows, columns) of the loaded table.
survey_df.shape


# In[5]:


def find_cols(df, kws):
    """Return the column names of ``df`` that contain every keyword in ``kws``.

    Args:
        df: Object exposing a ``columns`` iterable of string column names.
        kws: Iterable of substrings that must all occur in a column name.
            A single string is treated as one keyword rather than being
            split into its individual characters.

    Returns:
        A list of matching column names, in the order they appear in
        ``df.columns``. An empty ``kws`` matches every column.
    """
    # A bare string would otherwise be iterated character by character, which
    # matches any column that merely contains all of those characters.
    if isinstance(kws, str):
        kws = [kws]
    return [name for name in df.columns if all(kw in name for kw in kws)]


# In[6]:


# List the columns that mention both "python" and "version".
find_cols(df=survey_df, kws=['python', 'version'])


# ## What share of developers use Python 2 vs. Python 3?

# In[7]:


python_version = survey_df['which version of python do you use the most?']
python_version.describe()


# In[8]:


# Shares with missing answers kept as their own category.
python_version.value_counts(normalize=True, dropna=False)


# In[9]:


# Same breakdown as raw counts.
python_version.value_counts(normalize=False, dropna=False)


# In[10]:


# Shares among respondents who actually answered.
python_version.value_counts(normalize=True, dropna=True)


# In[11]:


# Pie chart of the answered shares, saved for the write-up.
answered_share = python_version.value_counts(normalize=True, dropna=True)
pie_options = dict(
    figsize=(5, 5),
    startangle=90,
    autopct='%.0f%%',
    fontsize=14,
    colors=sns.color_palette('rainbow')[:2],
)
answered_share.plot(kind='pie', **pie_options)
plt.title('Python 2 VS Python 3', fontsize=18)
plt.ylabel('')
plt.tight_layout()
plt.savefig('python-version.png')


# 在使用 Python 的开发者中，大概有 75% 的人已经在使用 Python 3 了。

# ## 做数据分析和机器学习的人中分别有多少人使用的是 Python 3？

# In[12]:


# The three questions this section works with: the two "what do you use
# python for" options and the most-used Python version.
ml_question = 'machine learning:\xa0what do you use python for?'
da_question = 'data analysis:\xa0what do you use python for?'
version_question = 'which version of python do you use the most?'
python_da_ml = survey_df[[ml_question, da_question, version_question]]


# In[13]:


python_da_ml.dtypes


# In[14]:


# Python version vs. the data-analysis answer, normalised over the whole table.
python_da = pd.crosstab(
    python_da_ml[version_question],
    python_da_ml[da_question],
    normalize=True,
)


# In[15]:


python_da


# In[16]:


# Same table for the machine-learning answer.
python_ml = pd.crosstab(
    python_da_ml[version_question],
    python_da_ml[ml_question],
    normalize=True,
)


# In[17]:


da_ml_by_version = pd.concat([python_da, python_ml], axis=1)
da_ml_by_version


# In[18]:


# One bar group per use case, one bar per Python version.
da_ml_by_version.T.plot(kind='bar', figsize=(10, 5), color=sns.color_palette('rainbow'))
plt.xticks(rotation=0, fontsize=14)
plt.title('Data Analysis and Machine Learning VS Python version', fontsize=18)
plt.legend(title=None)
plt.tight_layout()
plt.savefig('data-analysis-machine-learning-vs-python-version.png')


# ---

# In[19]:


# Pass the question text as a single keyword. A bare string would be iterated
# character by character by find_cols and could match unrelated columns that
# merely contain the same characters.
cols = find_cols(survey_df, ['what framework(s) do you use in addition to python?'])
cols


# In[20]:


# Drop the first matching column.
# NOTE(review): presumably the "none" option -- verify against the CSV header.
frameworks = survey_df[cols[1:]]
frameworks.head()


# In[21]:


# Non-null answers per framework, most used first; index labels are reduced to
# the part before the ':' (the framework name).
count_df = frameworks.count().sort_values(ascending=False)
count_df.index = [item.split(':')[0] for item in count_df.index]

count_df.plot(kind='bar', color=sns.color_palette('rainbow', frameworks.shape[1]))
plt.xticks(fontsize=14)


# In[22]:


# Horizontal version of the same chart. Reuse the sorted counts computed above
# instead of recounting, and size the palette to the number of bars rather
# than a hard-coded 24.
values = count_df.values
labels = list(count_df.index)

plt.figure(figsize=(20, 17))
sns.barplot(x=values, y=labels, orient='h', palette=sns.color_palette("rainbow", len(labels)))
plt.xticks(fontsize=14)
plt.yticks(fontsize=18)
plt.tight_layout()
plt.savefig('frameworks.png')


# ## 常用框架中使用 Python 2 和 Python 3 的比例

# In[23]:


# Most-used Python version per respondent; shared by the cross-tabs below.
python_ver = survey_df['which version of python do you use the most?']


# In[24]:


def process_col(col):
    """Count the answers in ``col`` per Python version.

    Cross-tabulates the module-level ``python_ver`` series against ``col`` and
    returns the first column of that table, i.e. the per-version counts for
    the first distinct value found in ``col``.
    """
    version_by_answer = pd.crosstab(index=python_ver, columns=col)
    first_answer_counts = version_by_answer.iloc[:, 0]
    return first_answer_counts


# In[25]:


# process_col(frameworks['django:what framework(s) do you use in addition to python?'])


# In[26]:


# Per-framework answer counts split by Python version (rows: version,
# columns: framework). process_col performs the cross-tab for one column.
frameworks_pyver = frameworks.apply(process_col)
frameworks_pyver.columns = [name.split(':')[0] for name in frameworks.columns]
frameworks_pyver


# In[27]:


# Turn counts into shares: each framework column is divided by its own total.
framework_totals = frameworks_pyver.sum(axis=0)
frameworks_pyver_ratio = frameworks_pyver.div(framework_totals, axis=1)


# In[28]:


# Grouped bar chart: one group per framework, one bar per Python version.
frameworks_pyver_ratio.T.plot(kind='bar', color=sns.color_palette('rainbow'))
plt.xticks(rotation=90, fontsize=14)


# In[29]:


# Reshape the version x framework ratio table into long form
# (one row per version/framework pair) for the seaborn plots below.
df = frameworks_pyver_ratio.stack().reset_index()
df.columns=['pyver', 'framework', 'value']
df.head()


# In[30]:


# Horizontal grouped bars of the same ratios.
plt.figure(figsize=(20, 17))
sns.barplot(x='value', y='framework', hue='pyver', data=df, orient='h', palette=sns.color_palette('rainbow'))
plt.yticks(fontsize=18)


# In[31]:


# Distribution of the ratios in the first row of the table across frameworks.
# NOTE(review): the first row is presumably 'Python 2' -- confirm row order.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
sns.distplot(frameworks_pyver_ratio.iloc[0, :], bins=5, color='b')


# In[32]:


# Strip plot with frameworks on the x axis.
sns.stripplot(x='framework', y='value', hue='pyver', data=df, size=5, palette=sns.color_palette('rainbow'))
plt.xticks(rotation=90, fontsize=14);
plt.legend(title=None)


# In[33]:


# Same strip plot, horizontal orientation.
plt.figure(figsize=(7, 13))
sns.stripplot(x='value', y='framework', hue='pyver', data=df, orient='h', size=5, palette=sns.color_palette('rainbow'))
plt.yticks(fontsize=14)
plt.legend(title=None)


# In[34]:


# Final version saved to disk: frameworks ordered by their Python 3 share,
# highest first.
plt.figure(figsize=(13, 13))
sns.stripplot(x='value', y='framework', hue='pyver', 
              data=df, 
              order=frameworks_pyver_ratio.T['Python 3'].sort_values(ascending=False).index, 
              orient='h', 
              size=7, 
              palette=sns.color_palette('rainbow'))
plt.yticks(fontsize=14)
plt.xlabel('')
plt.ylabel('')
plt.legend(title=None, loc='upper center')
plt.tight_layout()
plt.savefig('frameworks-python-version.png')


# ## 做数据分析和机器学习的人常用的框架？

# In[35]:


# Locate the "what do you use python for the most" question.
cols = find_cols(survey_df, ['use', 'python', 'most'])
cols


# In[36]:


uses = survey_df['what do you use python for the most?']
uses.head()


# In[37]:


def _counts_by_primary_use(col):
    """Cross-tabulate primary use against one framework column; keep the first column."""
    return pd.crosstab(index=uses, columns=col).iloc[:, 0]


# Rows: primary use, columns: framework (labelled by the text before ':').
frameworks_uses = frameworks.apply(_counts_by_primary_use)
frameworks_uses = frameworks_uses.rename(columns=lambda name: name.split(':')[0])
frameworks_uses.head()


# In[38]:


# Keep only the two uses this section compares.
da_ml_frameworks_uses = frameworks_uses.loc[['Data analysis', 'Machine learning']]
da_ml_frameworks_uses.head()


# In[39]:


# Overlaid area chart, frameworks ordered by their data-analysis count
# (ascending); the tick labels follow the same ordering.
by_da_ascending = da_ml_frameworks_uses.T.sort_values(by='Data analysis')
by_da_ascending.plot.area(
    stacked=False,
    alpha=0.5,
    figsize=(20, 15),
    color=sns.color_palette('rainbow')[:2],
)
plt.xticks(range(len(by_da_ascending)), by_da_ascending.index, rotation=90, fontsize=18)
plt.yticks(fontsize=14)
plt.legend(fontsize=16)
plt.tight_layout()
plt.savefig('frameworks-data-analysis-machine-learning.png')


# In[40]:


# Long-form copy of the same table for a grouped horizontal bar chart,
# frameworks ordered by data-analysis count (descending).
plt.figure(figsize=(10, 15))
df = da_ml_frameworks_uses.stack().reset_index()
df.columns = ['use', 'framework', 'value']
by_da_descending = da_ml_frameworks_uses.T.sort_values(by='Data analysis', ascending=False)
sns.barplot(
    x='value',
    y='framework',
    hue='use',
    data=df,
    orient='h',
    order=by_da_descending.index,
    palette=sns.color_palette('rainbow'),
)
plt.yticks(fontsize=16)


# 可以看到数据分析和机器学习从业者使用的框架大致差不多，只是在 keras、theano、tensorflow 和 scikit-learn 等机器学习库上差别较大，当然这也理所当然。
# 
# 只是让我想不到的是 web 框架 Django 和 Flask 能够排的这么前。

# ## 公司规模大小和是否使用 Python 3 的关系？

# In[41]:


# Locate the team-size question.
cols = find_cols(survey_df, ['how', 'many', 'people', 'project'])
cols


# In[42]:


team_scale = survey_df[cols[0]]
team_scale.head()


# In[43]:


team_scale.describe()


# In[44]:


team_scale.isnull().sum()


# In[45]:


# Team size x Python version counts, rows forced into ascending team size.
team_size_order = [
    '2-7 people',
    '8-12 people',
    '13-20 people',
    '21-40 people',
    'More than 40 people',
]
team_pyver = pd.crosstab(team_scale, python_ver).reindex(team_size_order)
team_pyver


# In[46]:


# Row-wise shares, then ordered by Python 3 share (highest first).
# NOTE(review): this ordering is by share, not by team size, so the line drawn
# below is non-increasing by construction.
team_row_totals = team_pyver.sum(axis=1)
team_pyver_sorted = team_pyver.div(team_row_totals, axis=0).sort_values(by='Python 3', ascending=False)
team_pyver_sorted


# In[47]:


line_style = dict(label='Python 3', marker='o', markersize=10, color='b', linewidth=3)
team_pyver_sorted['Python 3'].plot(**line_style)
plt.xticks(range(len(team_size_order)), team_pyver_sorted.index, fontsize=14)
plt.yticks(fontsize=14)
plt.xlabel('team scale', fontsize=16)
plt.ylabel('use ratio of python 3', fontsize=16)
plt.legend(fontsize=14)
plt.title('Team scale VS Use ratio of Python 3', fontsize=18)
plt.tight_layout()
plt.savefig('team-scale-python-3.png')


# 可以看到团队越小，使用 Python 3 的比例越高，可能这也是因为重构代价吧。

# ## 开发者年龄和是否使用 Python 3 的关系？

# In[48]:


# Locate the age-range question.
cols = find_cols(survey_df, ['age', 'range'])
cols


# In[49]:


age = survey_df[cols[0]]
age.head()


# In[50]:


age.describe()


# In[51]:


age.isnull().sum()


# In[52]:


# Inspect the respondents who left the age question blank.
survey_df.loc[age.isnull()]


# In[53]:


age.unique()


# In[54]:


# Age range x Python version counts.
age_pyver = pd.crosstab(index=age, columns=python_ver)
age_pyver


# In[55]:


# Python 3 share within each age range, drawn as a line across the ranges.
age_pyver_ratio = age_pyver.div(age_pyver.sum(axis=1), axis=0)
age_line_style = dict(label='Python 3', marker='o', markersize=10, color='b', linewidth=3)
age_pyver_ratio['Python 3'].plot(**age_line_style)
plt.xticks(range(len(age_pyver.index)), age_pyver.index, fontsize=14)
plt.yticks(fontsize=14)
plt.xlabel('age', fontsize=16)
plt.ylabel('use ratio of python 3', fontsize=16)
plt.legend(fontsize=14)
plt.title("The developers' age VS The use ratio of Python 3", fontsize=18)
plt.tight_layout()
plt.savefig('age-python-3.png')


# 可以看到，随着年龄段的增长，Python 3 的使用比例在降低，但是到了 60 岁以上的时候反而提高了。。。

# In[56]:


# Respondent subsets for two countries of interest.
china = survey_df.loc[survey_df['what country do you live in?'] == 'China']
usa = survey_df.loc[survey_df['what country do you live in?'] == 'United States']
china.head()


# In[57]:


# Counts per age range, indexed by (country, Python version).
country_age = pd.crosstab([survey_df['what country do you live in?'], survey_df['which version of python do you use the most?']], survey_df['could you tell us your age range?'])
country_age.index.names = ['country', 'pyver']
country_age.head()


# In[58]:


# Collapse the Python-version level so each country has one row of age counts.
# `DataFrame.sum(level=...)` was deprecated and later removed from pandas;
# grouping on the index level is the supported equivalent.
country_age_total = country_age.groupby(level=0).sum()
country_age_total.head()


# In[59]:


# Total number of respondents aged 60 or older across all countries.
country_age_total['60 or older'].sum()


# In[60]:


# Ten countries with the most respondents aged 60 or older.
top_n = 10
oldest_by_country = country_age_total['60 or older'].sort_values(ascending=False).head(top_n)
oldest_by_country.plot(kind='bar', color=sns.color_palette('rainbow', top_n))
plt.xticks(fontsize=14, rotation=0)
plt.xlabel('')
plt.ylabel('# of the developers whose age are 60+', fontsize=14)
plt.title('Top 10 countries of # of the developers whose age are 60+', fontsize=18)
plt.tight_layout()
plt.savefig('top10-60-or-older.png')


# In[ ]:


# In[61]:


# Age counts for the three countries compared below.
compared_countries = ['United States', 'India', 'China']
three_countries = country_age_total.loc[compared_countries]
three_countries


# In[62]:


# Share of each age range within each country.
three_countries_ratio = three_countries.div(three_countries.sum(axis=1), axis=0)
three_countries_ratio


# In[63]:


three_countries_ratio.plot(kind='bar', figsize=(12, 6), color=sns.color_palette('rainbow', 7))
plt.xticks(range(len(compared_countries)), compared_countries, rotation=0, fontsize=14)
plt.xlabel('')
plt.legend(title=None)
plt.title("Age distribution of the developers who're from USA, India and China", fontsize=18)
plt.tight_layout()
plt.savefig('age-distribution.png')


# In[64]:


# Age counts per Python version for the United States only.
country_age.loc['United States']


# In[65]:


# NOTE(review): hand-computed ratio, presumably read off the table above
# (e.g. Python 3 users among US respondents aged 60+) -- TODO confirm.
41 / 56


# ## 使用 Python 3 和 Python 2 的开发者的国别分布？

# In[66]:


# Locate the country-of-residence question.
cols = find_cols(survey_df, ['country', 'live'])
cols


# In[67]:


countries = survey_df[cols[0]]
countries.head()


# In[68]:


countries.describe()


# In[69]:


countries.isnull().sum()


# In[70]:


# Inspect the respondents who left the country question blank.
survey_df.loc[countries.isnull()]


# In[71]:


# Respondents per country, most common first. The series is passed as `x=`
# because recent seaborn releases accept only `data` positionally; the keyword
# form works on both old and new versions.
sns.countplot(x=countries, order=countries.value_counts().index, palette=sns.color_palette('rainbow', 16))
# Clip the x axis so that only the first 16 bars are visible.
plt.xlim(xmax=15.5)
plt.xticks(fontsize=14, rotation=15)
plt.xlabel('')
plt.ylabel('# of developers', fontsize=14)
plt.tight_layout()
plt.savefig('country-counts.png')


# In[72]:


# Country x Python version counts.
countries_pyver = pd.crosstab(index=countries, columns=python_ver)
countries_pyver.head()


# In[73]:


# Version shares for the ten countries with the most respondents.
top10_names = countries.value_counts().head(10).index
top10_counts = countries_pyver.loc[top10_names]
top10_countries = top10_counts.div(top10_counts.sum(axis=1), axis=0)
top10_countries


# In[74]:


# Long form: one row per (country, version) pair.
df = (
    top10_countries.stack()
    .rename_axis(['country', 'pyver'])
    .reset_index(name='value')
)
df.head()


# In[75]:


sns.barplot(x='country', y='value', hue='pyver', data=df, palette=sns.color_palette('rainbow'))
plt.xticks(fontsize=14)
plt.xlabel('')
plt.ylabel('use ratio', fontsize=14)
plt.legend(title=None, fontsize=12)
plt.tight_layout()
plt.savefig('country-pyver.png')


# In[76]:


# Per-country share of each Python version (every row sums to 1).
countries_pyver_ratio = countries_pyver.div(countries_pyver.sum(axis=1), axis=0)
countries_pyver_ratio.head()


# In[77]:


# Distribution of the Python 3 share across countries, with a rug marking
# each individual country.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
plt.figure(figsize=(12, 6))
sns.distplot(countries_pyver_ratio['Python 3'], rug=True, color='b')
plt.xlabel('use ratio', fontsize=14)
plt.xticks(fontsize=12)
plt.title('Use ratio of Python 3 in the world', fontsize=18)
plt.tight_layout()
plt.savefig('use-ratio-of-python3-world.png')


# In[78]:


# The same per-country values as a one-dimensional strip plot.
plt.figure(figsize=(10, 3))
sns.stripplot(x='Python 3', data=countries_pyver_ratio)


# In[79]:


# Line plot of each country's Python 3 share with a horizontal line at the
# mean, and the mean value written next to the y axis.
import matplotlib.transforms as transforms
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(countries_pyver_ratio['Python 3'], c='b')
ax.axhline(countries_pyver_ratio['Python 3'].mean(), c='k', label='Mean')
# Hide the x tick labels (one per country).
plt.xticks([]);
# NOTE(review): xmin is not used by the rest of this cell.
xmin, _ = plt.xlim()
# Blended transform: x follows the transform of the first y tick label, y is in
# data coordinates, so the text sits at the height of the mean line.
ax.text(0, countries_pyver_ratio['Python 3'].mean(), round(countries_pyver_ratio['Python 3'].mean(), 2), 
        transform=transforms.blended_transform_factory(ax.get_yticklabels()[0].get_transform(), ax.transData), 
        rotation='horizontal', 
        horizontalalignment='right', 
        verticalalignment='center');
ax.legend()


# In[80]:


# Violin plot of the per-country Python 3 share. The series is passed as `x=`
# because recent seaborn releases accept only `data` positionally; the keyword
# form keeps the horizontal layout on both old and new versions.
plt.figure(figsize=(10, 5))
sns.violinplot(x=countries_pyver_ratio['Python 3'], palette=sns.color_palette('rainbow'))


# In[81]:


# Summary statistics of the per-country Python 3 share.
countries_pyver_ratio['Python 3'].describe()


# In[82]:


# Summary statistics of the per-country Python 2 share.
countries_pyver_ratio['Python 2'].describe()


# In[83]:


# Enable plotly's offline notebook rendering.
init_notebook_mode(connected=True)


# In[84]:


# World choropleth of the Python 3 share per country, in percent.
# Countries are matched by name (locationmode='country names'), using the
# index of countries_pyver_ratio as both location and hover text.
data = [ dict(
        type = 'choropleth',
        locations = countries_pyver_ratio.index,
        locationmode = 'country names', 
        z = countries_pyver_ratio['Python 3'] * 100,
        text = countries_pyver_ratio.index,
        colorscale = 'Set3',
        autocolorscale = False,
        reversescale = True,
        marker = dict(
            line = dict (
                color = 'rgb(180,180,180)',
                width = 0.5
            ) ),
        # NOTE(review): 'autotick' may not be a recognised colorbar attribute
        # in newer plotly versions; it passes here because validation is
        # disabled in the iplot call below -- TODO confirm.
        colorbar = dict(
            autotick = False,
            ticksuffix = '%',
            title = 'Percent'),
      ) ]

# Map layout: no frame or coastlines, Mercator projection.
layout = dict(
    title = 'Python 3 in the world',
    geo = dict(
        showframe = False,
        showcoastlines = False,
        projection = dict(
            type = 'Mercator'
        )
    )
)

# The figure is a plain dict and is rendered with validate=False, so the
# attributes above are not checked by plotly before plotting.
fig = dict( data=data, layout=layout )
iplot(fig, validate=False, filename='Python 3 in the world', image_height=1080, image_width=1920, show_link=False)


# ## 开发者中使用 IDE 的情况？

# In[7]:


# Locate the multi-select "editor(s)/IDE(s)" question columns.
cols = find_cols(survey_df, ['what', 'editor(s)/ide(s)'])
cols


# In[8]:


# One column per editor, labelled by the text before the ':'.
editors = survey_df[cols]
editors.columns = [item.split(':')[0] for item in editors.columns]
editors.head()


# In[11]:


# Non-null answers per editor, most popular first. Values and labels are both
# taken from the same sorted series so every bar carries the name of the
# editor it actually counts; taking the labels from the unsorted
# `editors.columns` would pair the sorted counts with the wrong names.
editor_counts = editors.count().sort_values(ascending=False)
values = editor_counts.values
labels = list(editor_counts.index)

plt.figure(figsize=(20, 17))
sns.barplot(x=values, y=labels, orient='h', palette=sns.color_palette("rainbow", len(labels)))
plt.xticks(fontsize=14)
plt.yticks(fontsize=18)
plt.tight_layout()
plt.title('What editor(s)/IDE(s) have you considered for use in your Python development?', fontsize=20)
# Saving stays disabled: this file name is already used by the frameworks chart.
# plt.savefig('frameworks.png')


# In[12]:


# Locate the "main editor" question.
col = find_cols(survey_df, ['what', 'main', 'editor'])
col


# In[16]:


main_editor = survey_df[col[0]]
main_editor.head()


# In[38]:


# Respondents per main editor, most common first, as horizontal bars.
main_editor_sorted = main_editor.value_counts(ascending=False)
values, labels = main_editor_sorted.values, main_editor_sorted.index
bar_colors = sns.color_palette("rainbow", len(labels))

plt.figure(figsize=(20, 17))
sns.barplot(x=values, y=labels, orient='h', palette=bar_colors)
plt.xticks(fontsize=14)
plt.yticks(fontsize=18)
plt.tight_layout()
plt.title('What is the main editor you use for your current python development?', fontsize=20)


# In[ ]: