import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.offline import iplot, init_notebook_mode
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Apply seaborn's 'darkgrid' theme to subsequent plots.
sns.set_style('darkgrid')
# Default matplotlib figure size as (width, height).
plt.rcParams['figure.figsize'] = (20, 6)
# Load the raw survey export from the working directory.
survey_df = pd.read_csv('pythondevsurvey2017_raw_data.csv')
# Normalise every header to lower case so later lookups can use
# lower-case keys consistently.
survey_df.columns = list(map(str.lower, survey_df.columns))
# Preview the first rows (notebook display).
survey_df.head()
is python the main language you use for your current projects? | none:what other language(s) do you use? | java:what other language(s) do you use? | javascript:what other language(s) do you use? | c/c++:what other language(s) do you use? | php:what other language(s) do you use? | c#:what other language(s) do you use? | ruby:what other language(s) do you use? | bash / shell:what other language(s) do you use? | objective-c:what other language(s) do you use? | ... | technical support:which of the following best describes your job role(s)? | data analyst:which of the following best describes your job role(s)? | business analyst:which of the following best describes your job role(s)? | team lead:which of the following best describes your job role(s)? | product manager:which of the following best describes your job role(s)? | cio / ceo / cto:which of the following best describes your job role(s)? | systems analyst:which of the following best describes your job role(s)? | other - write in::which of the following best describes your job role(s)? | could you tell us your age range? | what country do you live in? | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Yes | NaN | NaN | JavaScript | NaN | PHP | NaN | NaN | Bash / Shell | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 60 or older | Italy |
1 | Yes | NaN | NaN | JavaScript | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | Team lead | NaN | NaN | NaN | NaN | 40-49 | United Kingdom |
2 | Yes | NaN | NaN | JavaScript | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40-49 | France |
3 | No, I don’t use Python for my current projects | NaN | NaN | NaN | NaN | NaN | C# | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 17 or younger | Spain |
4 | Yes | NaN | Java | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 18-20 | Israel |
5 rows × 162 columns
# Dimensions of the loaded table as (rows, columns).
survey_df.shape
(9506, 162)
def find_cols(df, kws):
    """Return the columns of ``df`` whose name contains every keyword.

    Args:
        df: Object exposing a ``columns`` iterable of string names
            (e.g. a pandas DataFrame).
        kws: Either a single keyword string or an iterable of keyword
            strings. A bare string is treated as ONE keyword.

    Returns:
        list: Column names, in their original order, that contain all
        keywords as (case-sensitive) substrings. With no keywords every
        column is returned.
    """
    if isinstance(kws, str):
        # Iterating a str yields single characters, which would match any
        # column that merely contains those letters somewhere.
        kws = [kws]
    else:
        # Materialise once so a generator is not exhausted after the
        # first column has been checked.
        kws = list(kws)
    return [name for name in df.columns if all(kw in name for kw in kws)]
# List every column whose name contains both 'python' and 'version'.
find_cols(df=survey_df, kws=['python', 'version'])
['which version of python do you use the most?', 'installer from python.org:what do you typically use to upgrade your python version?', 'build from source:what do you typically use to upgrade your python version?', 'automatic upgrade via cloud provider:what do you typically use to upgrade your python version?', 'enthought:what do you typically use to upgrade your python version?', 'anaconda:what do you typically use to upgrade your python version?', 'activepython:what do you typically use to upgrade your python version?', 'intel distribution for python:what do you typically use to upgrade your python version?', 'os-provided python (via apt-get, yum, homebrew, etc.):what do you typically use to upgrade your python version?', 'pyenv:what do you typically use to upgrade your python version?', 'pythonz:what do you typically use to upgrade your python version?', 'other - write in::what do you typically use to upgrade your python version?']
# Answers to the "which version of python do you use the most?" question.
python_version = survey_df['which version of python do you use the most?']
# Summary of the answers (notebook display).
python_version.describe()
count 8112 unique 2 top Python 3 freq 6046 Name: which version of python do you use the most?, dtype: object
# Share of each answer, with missing answers (NaN) kept as their own category.
python_version.value_counts(normalize=True, dropna=False)
Python 3 0.636019 Python 2 0.217336 NaN 0.146644 Name: which version of python do you use the most?, dtype: float64
# Absolute number of respondents per answer, missing answers (NaN) included.
python_version.value_counts(normalize=False, dropna=False)
Python 3 6046 Python 2 2066 NaN 1394 Name: which version of python do you use the most?, dtype: int64
# Share of each answer among respondents who answered (NaN dropped).
python_version.value_counts(normalize=True, dropna=True)
Python 3 0.745316 Python 2 0.254684 Name: which version of python do you use the most?, dtype: float64
# Pie chart of Python 2 vs Python 3 among respondents who answered.
version_share = python_version.value_counts(normalize=True, dropna=True)
pie_colors = sns.color_palette('rainbow')[:2]  # one colour per slice
version_share.plot(
    kind='pie',
    figsize=(5, 5),
    startangle=90,
    autopct='%.0f%%',
    fontsize=14,
    colors=pie_colors,
)
plt.title('Python 2 VS Python 3', fontsize=18)
plt.ylabel('')  # blank out the y-axis label
plt.tight_layout()
plt.savefig('python-version.png')
在回答了该问题的受访者中(不计未作答者),大约有 75% 的人最常使用的是 Python 3。
# Keep only the two "what do you use python for" options of interest plus
# the Python-version answer. The '\xa0' (non-breaking space) is part of the
# column names used here.
python_da_ml = survey_df[[
    'machine learning:\xa0what do you use python for?',
    'data analysis:\xa0what do you use python for?',
    'which version of python do you use the most?',
]]
# Column dtypes of the selection (notebook display).
python_da_ml.dtypes
machine learning: what do you use python for? object data analysis: what do you use python for? object which version of python do you use the most? object dtype: object
# Cross-tabulate Python version against the "data analysis" answer,
# normalised over the whole table.
python_da = pd.crosstab(
    index=python_da_ml['which version of python do you use the most?'],
    columns=python_da_ml['data analysis:\xa0what do you use python for?'],
    normalize=True,
)
python_da
data analysis: what do you use python for? | Data analysis |
---|---|
which version of python do you use the most? | |
Python 2 | 0.233177 |
Python 3 | 0.766823 |
# Cross-tabulate Python version against the "machine learning" answer,
# normalised over the whole table.
python_ml = pd.crosstab(
    index=python_da_ml['which version of python do you use the most?'],
    columns=python_da_ml['machine learning:\xa0what do you use python for?'],
    normalize=True,
)
# Show both cross-tabulations side by side (notebook display).
pd.concat([python_da, python_ml], axis=1)
Data analysis | Machine learning | |
---|---|---|
which version of python do you use the most? | ||
Python 2 | 0.233177 | 0.193548 |
Python 3 | 0.766823 | 0.806452 |
# Grouped bar chart: one group per usage (data analysis / machine learning),
# one bar per Python version.
usage_by_version = pd.concat([python_da, python_ml], axis=1)
usage_by_version.T.plot(
    kind='bar',
    figsize=(10, 5),
    color=sns.color_palette('rainbow'),
)
plt.xticks(rotation=0, fontsize=14)
plt.title('Data Analysis and Machine Learning VS Python version', fontsize=18)
plt.legend(title=None)
plt.tight_layout()
plt.savefig('data-analysis-machine-learning-vs-python-version.png')
# Pass the question text as ONE keyword inside a list. A bare string would be
# iterated character by character and match any column that merely contains
# the same letters (e.g. the "cloud platforms" and editor/IDE questions).
cols = find_cols(survey_df, ['what framework(s) do you use in addition to python?'])
cols
['none:what framework(s) do you use in addition to python?', 'django:what framework(s) do you use in addition to python?', 'flask:what framework(s) do you use in addition to python?', 'tornado:what framework(s) do you use in addition to python?', 'bottle:what framework(s) do you use in addition to python?', 'web2py:what framework(s) do you use in addition to python?', 'numpy / pandas / matplotlib / scipy and similar:what framework(s) do you use in addition to python?', 'keras / theano / tensorflow / scikit-learn and similar:what framework(s) do you use in addition to python?', 'pillow:what framework(s) do you use in addition to python?', 'pyqt / pygtk / wxpython:what framework(s) do you use in addition to python?', 'tkinter:what framework(s) do you use in addition to python?', 'pygame:what framework(s) do you use in addition to python?', 'cherrypy:what framework(s) do you use in addition to python?', 'twisted:what framework(s) do you use in addition to python?', 'pyramid:what framework(s) do you use in addition to python?', 'requests:what framework(s) do you use in addition to python?', 'asyncio:what framework(s) do you use in addition to python?', 'kivy:what framework(s) do you use in addition to python?', 'six:what framework(s) do you use in addition to python?', 'aiohttp:what framework(s) do you use in addition to python?', 'other - write in::what framework(s) do you use in addition to python?', 'cloud platforms (google app engine, aws, rackspace, heroku and similar):what additional technology(s) do you use in addition to python?', 'jupyter notebook:what editor(s)/ide(s) have you considered for use in your python development?', 'komodo editor:what editor(s)/ide(s) have you considered for use in your python development?', 'komodo ide:what editor(s)/ide(s) have you considered for use in your python development?']
# Drop the first matched column by position; in the listing above that is the
# 'none:' answer. NOTE(review): this relies on column order — confirm.
frameworks = survey_df[cols[1:]]
# Preview the selected columns (notebook display).
frameworks.head()
django:what framework(s) do you use in addition to python? | flask:what framework(s) do you use in addition to python? | tornado:what framework(s) do you use in addition to python? | bottle:what framework(s) do you use in addition to python? | web2py:what framework(s) do you use in addition to python? | numpy / pandas / matplotlib / scipy and similar:what framework(s) do you use in addition to python? | keras / theano / tensorflow / scikit-learn and similar:what framework(s) do you use in addition to python? | pillow:what framework(s) do you use in addition to python? | pyqt / pygtk / wxpython:what framework(s) do you use in addition to python? | tkinter:what framework(s) do you use in addition to python? | ... | requests:what framework(s) do you use in addition to python? | asyncio:what framework(s) do you use in addition to python? | kivy:what framework(s) do you use in addition to python? | six:what framework(s) do you use in addition to python? | aiohttp:what framework(s) do you use in addition to python? | other - write in::what framework(s) do you use in addition to python? | cloud platforms (google app engine, aws, rackspace, heroku and similar):what additional technology(s) do you use in addition to python? | jupyter notebook:what editor(s)/ide(s) have you considered for use in your python development? | komodo editor:what editor(s)/ide(s) have you considered for use in your python development? | komodo ide:what editor(s)/ide(s) have you considered for use in your python development? | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | Django | Flask | Tornado | NaN | NaN | NumPy / pandas / Matplotlib / scipy and similar | NaN | Pillow | NaN | NaN | ... | Requests | NaN | NaN | six | NaN | Other - Write In: | NaN | NaN | NaN | NaN |
2 | Django | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | Requests | NaN | NaN | six | NaN | NaN | NaN | NaN | NaN | Komodo IDE |
3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | NaN | NaN | NaN | NaN | NaN | NumPy / pandas / Matplotlib / scipy and similar | Keras / Theano / TensorFlow / scikit-learn and... | Pillow | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 24 columns
# Non-null answers per column, most frequent first.
count_df = frameworks.count().sort_values(ascending=False)
# Keep only the option name, i.e. the text before the first ':'.
count_df.index = [name.partition(':')[0] for name in count_df.index]
# One colour per column of `frameworks`.
bar_colors = sns.color_palette('rainbow', frameworks.shape[1])
count_df.plot(kind='bar', color=bar_colors)
plt.xticks(fontsize=14)
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]), <a list of 24 Text xticklabel objects>)
# Count non-null answers per column once, most frequent first, and derive
# both the bar lengths and their labels from that single result.
framework_counts = frameworks.count().sort_values(ascending=False)
values = framework_counts.values
# Keep only the option name, i.e. the text before the first ':'.
labels = [item.split(':')[0] for item in framework_counts.index]
plt.figure(figsize=(20, 17))
# Size the palette from the data (one colour per bar) rather than a
# hard-coded 24, so it stays correct if the column selection changes.
sns.barplot(x=values, y=labels, orient='h', palette=sns.color_palette("rainbow", len(labels)))
plt.xticks(fontsize=14)
plt.yticks(fontsize=18)
plt.tight_layout()
plt.savefig('frameworks.png')
# Python-version answers; used as the row variable of the cross-tabulations
# that follow.
python_ver = survey_df['which version of python do you use the most?']
def process_col(col, versions=None):
    """Cross-tabulate Python version against ``col`` and return the first column.

    Args:
        col: Series of answers for one survey option.
            NOTE(review): presumably holds a single distinct non-null value
            (the option's label), so the first cross-tab column is the
            number of respondents who picked it — confirm.
        versions: Series of Python-version answers aligned with ``col``.
            Defaults to the module-level ``python_ver``.

    Returns:
        Series indexed by Python version, taken from the first column of
        ``pd.crosstab(index=versions, columns=col)``.

    Example:
        process_col(frameworks['django:what framework(s) do you use in addition to python?'])
    """
    if versions is None:
        # Backward-compatible default: the notebook-level version column.
        versions = python_ver
    return pd.crosstab(index=versions, columns=col).iloc[:, 0]
# Per column, count respondents by Python version. Reuse `process_col`
# instead of repeating its body in an inline lambda.
frameworks_pyver = frameworks.apply(process_col)
# Keep only the option name, i.e. the text before the first ':'.
frameworks_pyver.columns = [item.split(':')[0] for item in frameworks.columns]
frameworks_pyver
django | flask | tornado | bottle | web2py | numpy / pandas / matplotlib / scipy and similar | keras / theano / tensorflow / scikit-learn and similar | pillow | pyqt / pygtk / wxpython | tkinter | ... | requests | asyncio | kivy | six | aiohttp | other - write in | cloud platforms (google app engine, aws, rackspace, heroku and similar) | jupyter notebook | komodo editor | komodo ide | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
which version of python do you use the most? | |||||||||||||||||||||
Python 2 | 841 | 678 | 144 | 83 | 97 | 727 | 264 | 333 | 299 | 175 | ... | 763 | 95 | 70 | 237 | 44 | 223 | 551 | 346 | 43 | 59 |
Python 3 | 2522 | 1929 | 366 | 199 | 235 | 2436 | 1096 | 924 | 830 | 763 | ... | 2006 | 664 | 319 | 389 | 395 | 426 | 1409 | 1394 | 121 | 126 |
2 rows × 24 columns
# Turn the per-version counts into shares: each column is divided by its
# own total.
column_totals = frameworks_pyver.sum(axis=0)
frameworks_pyver_ratio = frameworks_pyver.div(column_totals, axis='columns')
# One bar group per column, one bar per Python version.
frameworks_pyver_ratio.T.plot(kind='bar', color=sns.color_palette('rainbow'))
plt.xticks(rotation=90, fontsize=14)
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]), <a list of 24 Text xticklabel objects>)
# Reshape the ratio table to long form: one row per (version, column) pair.
stacked_ratio = frameworks_pyver_ratio.stack()
df = stacked_ratio.reset_index()
df.columns = ['pyver', 'framework', 'value']
# Preview the long-form table (notebook display).
df.head()
pyver | framework | value | |
---|---|---|---|
0 | Python 2 | django | 0.250074 |
1 | Python 2 | flask | 0.260069 |
2 | Python 2 | tornado | 0.282353 |
3 | Python 2 | bottle | 0.294326 |
4 | Python 2 | web2py | 0.292169 |
# Horizontal grouped bars: one row per 'framework' value, one bar per
# 'pyver' value.
plt.figure(figsize=(20, 17))
version_palette = sns.color_palette('rainbow')
sns.barplot(
    data=df,
    x='value',
    y='framework',
    hue='pyver',
    orient='h',
    palette=version_palette,
)
plt.yticks(fontsize=18)
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]), <a list of 24 Text yticklabel objects>)