#!/usr/bin/env python
# coding: utf-8

# In[55]:

import plotly.graph_objs as go
# NOTE(review): plotly 4+ moved this module to the separate chart-studio
# package, so this import presumably fails there -- confirm against the
# installed plotly version. `py` is only referenced in commented-out code.
import plotly.plotly as py
# from plotly import tools
from plotly.offline import init_notebook_mode
import plotly.offline as offline
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import rgb2hex


# In[56]:

init_notebook_mode(connected=False)


# In[57]:

# tools.set_credentials_file(username='your_username', api_key='your_apikey')


# In[58]:

survey_df = pd.read_csv('pythondevsurvey2017_raw_data.csv')
# Lower-case every header so that column lookups are case-insensitive.
survey_df.columns = [c.lower() for c in survey_df.columns]
survey_df.head()


# In[59]:

def find_cols(df, kws):
    """Return the columns of ``df`` whose name contains every keyword.

    Args:
        df: DataFrame whose column names are searched.
        kws: An iterable of substrings, or a single string. A single string
            is treated as one keyword rather than being matched character
            by character.

    Returns:
        A list of the matching column names, in ``df.columns`` order.
    """
    if isinstance(kws, str):
        # Iterating a bare string would test each character separately.
        kws = [kws]
    return [item for item in df.columns if all(w in item for w in kws)]


# ## Scatter / line charts

# In[60]:

# `mode` can be any combination of 'lines', 'markers' and 'text'.
# Colors may be given in rgb or hexadecimal form.
# For large data sets go.Scattergl can be used instead.
trace1 = go.Scatter(
    x=np.arange(100),
    y=np.random.rand(100),
    mode='lines+markers',
    marker={'color': 'red'},
)
data = [trace1]
layout = {'title': '散点+折线'}
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
# py.iplot(fig, show_link=False)
offline.iplot(fig, show_link=False)


# ## What share of developers use Python 2 vs. Python 3?

# In[61]:

python_version = survey_df['which version of python do you use the most?']
counts = python_version.value_counts()
counts


# In[62]:

labels = counts.index
values = counts.values
# First two colors of the palette, converted to hex strings for plotly.
colors = [rgb2hex(i) for i in sns.color_palette('rainbow')[:2]]
trace = go.Pie(
    labels=labels,
    values=values,
    marker={'colors': colors},
    rotation=0,
    hoverinfo='label+value',
)
data = [trace]
layout = {'title': 'Python 2 VS Python 3'}
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# ## Among people doing data analysis and machine learning, how many use Python 3?
# In[63]:

# Survey questions used by this section (headers were lower-cased on load).
_ml_question = 'machine learning:\xa0what do you use python for?'
_da_question = 'data analysis:\xa0what do you use python for?'
_version_question = 'which version of python do you use the most?'

python_da_ml = survey_df[[_ml_question, _da_question, _version_question]]

# Cross-tabulate the Python version against each use case; normalize=True
# turns the counts into fractions of the whole table.
python_da = pd.crosstab(
    python_da_ml[_version_question],
    python_da_ml[_da_question],
    normalize=True,
)
python_ml = pd.crosstab(
    python_da_ml[_version_question],
    python_da_ml[_ml_question],
    normalize=True,
)


# In[64]:

# Put both tables side by side: one row per Python version.
da_ml = pd.concat([python_da, python_ml], axis=1)
da_ml


# In[65]:

colors = [rgb2hex(i) for i in sns.color_palette('rainbow')[:2]]
# One bar series per Python version, each with its own palette color.
python2, python3 = [
    go.Bar(
        x=da_ml.columns,
        y=da_ml.loc[version],
        name=version,
        marker={'color': color},
    )
    for version, color in zip(['Python 2', 'Python 3'], colors)
]
data = [python2, python3]
# go.FigureWidget(data)
fig = go.Figure(data=data)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# ## Which frameworks do people doing data analysis and machine learning commonly use?
# In[66]:

# Pass the question as a one-element list: find_cols iterates its keywords,
# so a bare string would be matched character by character.
cols = find_cols(survey_df, ['what framework(s) do you use in addition to python?'])
# The first matching column is skipped -- presumably the "None" option;
# verify against the CSV header.
frameworks = survey_df[cols[1:]]
count_df = frameworks.count().sort_values(ascending=False)
# Keep only the text before the first ':' of each column name as the label.
count_df.index = [item.split(':')[0] for item in count_df.index]
count_df.head()


# In[67]:

colors = [rgb2hex(i) for i in sns.color_palette('rainbow', len(count_df))]
trace = go.Bar(x=count_df.index, y=count_df.values, marker={'color': colors})
data = [trace]
layout = {'title': 'Framework Usage'}
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# In[68]:

# When the Y-axis tick labels are too long, set `automargin` to True on the
# layout's yaxis; a custom margin can also be supplied.
trace = go.Bar(
    y=count_df.index[::-1],
    x=count_df.values[::-1],
    marker={'color': colors[::-1]},
    orientation='h',
)
data = [trace]
layout = go.Layout(
    title='Framework Usage',
    margin={'r': 10},
    height=1000,
    yaxis={'automargin': True},
)
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# In[69]:

# For every framework column, count respondents per Python version, then
# divide by the column total to get each version's share.
frameworks_pyver = frameworks.apply(
    lambda col: pd.crosstab(index=python_version, columns=col).iloc[:, 0]
)
frameworks_pyver = frameworks_pyver / frameworks_pyver.sum(axis=0)
frameworks_pyver.columns = [item.split(':')[0] for item in frameworks.columns]
frameworks_pyver


# In[70]:

colors = [rgb2hex(i) for i in sns.color_palette('rainbow')[:2]]
py2 = go.Bar(
    x=frameworks_pyver.columns,
    y=frameworks_pyver.loc['Python 2'],
    marker={'color': colors[0]},
    name='Python 2',
)
py3 = go.Bar(
    x=frameworks_pyver.columns,
    y=frameworks_pyver.loc['Python 3'],
    marker={'color': colors[1]},
    name='Python 3',
)
data = [py2, py3]
layout = go.Layout(title='Python 2 and Python 3 Usage among Frameworks')
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# In[71]:

colors = [rgb2hex(i) for i in sns.color_palette('rainbow')[:2]]
py2 = go.Bar(
    y=frameworks_pyver.columns,
    x=frameworks_pyver.loc['Python 2'],
    marker={'color': colors[0]},
    orientation='h',
    name='Python 2',
)
py3 = go.Bar(
    y=frameworks_pyver.columns,
    x=frameworks_pyver.loc['Python 3'],
    marker={'color': colors[1]},
    orientation='h',
    name='Python 3',
)
data = [py2, py3]
layout = go.Layout(
    title='Python 2 and Python 3 Usage among Frameworks',
    height=1000,
    yaxis={'automargin': True},
)
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# In[72]:

import plotly.figure_factory as ff


# In[73]:

colors = [rgb2hex(i) for i in sns.color_palette('rainbow')[:2]]
fig = ff.create_distplot(
    hist_data=[frameworks_pyver.loc['Python 2'], frameworks_pyver.loc['Python 3']],
    group_labels=['Python 2', 'Python 3'],
    bin_size=0.05,
    colors=colors,
)
# go.FigureWidget(fig)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# In[74]:

colors = [rgb2hex(i) for i in sns.color_palette('rainbow')[:2]]
py2 = go.Scatter(
    x=frameworks_pyver.columns,
    y=frameworks_pyver.loc['Python 2'],
    mode='markers',
    marker={'color': colors[0]},
    name='Python 2',
)
py3 = go.Scatter(
    x=frameworks_pyver.columns,
    y=frameworks_pyver.loc['Python 3'],
    mode='markers',
    marker={'color': colors[1]},
    name='Python 3',
)
data = [py2, py3]
layout = go.Layout(title='Python 2 and Python 3 Usage among Frameworks')
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# In[75]:

colors = [rgb2hex(i) for i in sns.color_palette('rainbow')[:2]]
py2 = go.Scatter(
    y=frameworks_pyver.columns[::-1],
    x=frameworks_pyver.loc['Python 2'][::-1],
    mode='markers',
    marker={'color': colors[0]},
    orientation='h',
    name='Python 2',
)
py3 = go.Scatter(
    y=frameworks_pyver.columns[::-1],
    x=frameworks_pyver.loc['Python 3'][::-1],
    mode='markers',
    marker={'color': colors[1]},
    orientation='h',
    name='Python 3',
)
data = [py2, py3]
layout = go.Layout(
    title='Python 2 and Python 3 Usage among Frameworks',
    margin={'r': 10},
    height=1000,
    yaxis={'automargin': True},
)
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# In[76]:

frameworks_pyver


# In[77]:

# Reorder the framework columns by their Python 3 share, largest first.
sorted_frameworks_pyver = frameworks_pyver.sort_values(by='Python 3', axis=1, ascending=False)
sorted_frameworks_pyver


# In[78]:

colors = [rgb2hex(i) for i in sns.color_palette('rainbow')[:2]]
py2 = go.Scatter(
    y=sorted_frameworks_pyver.columns[::-1],
    x=sorted_frameworks_pyver.loc['Python 2'][::-1],
    mode='markers',
    marker={'color': colors[0]},
    orientation='h',
    name='Python 2',
)
py3 = go.Scatter(
    y=sorted_frameworks_pyver.columns[::-1],
    x=sorted_frameworks_pyver.loc['Python 3'][::-1],
    mode='markers',
    marker={'color': colors[1]},
    orientation='h',
    name='Python 3',
)
data = [py2, py3]
layout = go.Layout(
    title='Python 2 and Python 3 Usage among Frameworks',
    margin={'r': 10},
    height=1000,
    yaxis={'automargin': True},
)
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# ## Which frameworks do people doing data analysis and machine learning commonly use?
# In[79]:

# The column is named explicitly: a keyword search for 'use', 'python' and
# 'most' would also match the "which version of python do you use the most?"
# question.
uses = survey_df['what do you use python for the most?']
# For every framework column, count respondents per main use of Python.
frameworks_uses = frameworks.apply(
    lambda col: pd.crosstab(index=uses, columns=col).iloc[:, 0]
)
frameworks_uses.columns = [item.split(':')[0] for item in frameworks_uses.columns]
frameworks_uses.head()


# In[80]:

da_ml_frameworks_uses = frameworks_uses.loc[['Data analysis', 'Machine learning']]


# In[81]:

# Possible values for `fill`:
# ['none', 'tozeroy', 'tozerox', 'tonexty', 'tonextx', 'toself', 'tonext']
colors = [rgb2hex(i) for i in sns.color_palette('rainbow')[:2]]
# The legend names match the rows being plotted (use cases, not Python
# versions).
da = go.Scatter(
    x=da_ml_frameworks_uses.columns,
    y=da_ml_frameworks_uses.loc['Data analysis'],
    fill='tozeroy',
    marker={'color': colors[0]},
    name='Data analysis',
)
ml = go.Scatter(
    x=da_ml_frameworks_uses.columns,
    y=da_ml_frameworks_uses.loc['Machine learning'],
    fill='tozeroy',
    marker={'color': colors[1]},
    name='Machine learning',
)
data = [da, ml]
layout = go.Layout(title='Frameworks Usage among Data Analysis and Machine Learning Developers')
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# In[82]:

# Order the frameworks by their data-analysis count, smallest first.
sorted_da_ml_frameworks_uses = da_ml_frameworks_uses.sort_values(
    by='Data analysis', axis=1, ascending=True
)
colors = [rgb2hex(i) for i in sns.color_palette('rainbow')[:2]]
da = go.Scatter(
    x=sorted_da_ml_frameworks_uses.columns,
    y=sorted_da_ml_frameworks_uses.loc['Data analysis'],
    fill='tozeroy',
    marker={'color': colors[0]},
    name='Data analysis',
)
ml = go.Scatter(
    x=sorted_da_ml_frameworks_uses.columns,
    y=sorted_da_ml_frameworks_uses.loc['Machine learning'],
    fill='tozeroy',
    marker={'color': colors[1]},
    name='Machine learning',
)
data = [da, ml]
layout = go.Layout(title='Frameworks Usage among Data Analysis and Machine Learning Developers')
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# In[83]:

colors = [rgb2hex(i) for i in sns.color_palette('rainbow')[:2]]
da = go.Bar(
    y=sorted_da_ml_frameworks_uses.columns,
    x=sorted_da_ml_frameworks_uses.loc['Data analysis'],
    marker={'color': colors[0]},
    orientation='h',
    name='Data analysis',
)
ml = go.Bar(
    y=sorted_da_ml_frameworks_uses.columns,
    x=sorted_da_ml_frameworks_uses.loc['Machine learning'],
    marker={'color': colors[1]},
    orientation='h',
    name='Machine learning',
)
data = [da, ml]
layout = go.Layout(
    title='Frameworks Usage among Data Analysis and Machine Learning Developers',
    height=1000,
    yaxis={'automargin': True},
)
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# ## How does company size relate to whether Python 3 is used?

# In[84]:

cols = find_cols(survey_df, ['how', 'many', 'people', 'project'])
team_scale = survey_df[cols[0]]
team_pyver = pd.crosstab(team_scale, python_version)
# Put the team-size buckets into ascending order.
team_pyver = team_pyver.reindex([
    '2-7 people',
    '8-12 people',
    '13-20 people',
    '21-40 people',
    'More than 40 people',
])
# Convert each row to shares, then order the rows by Python 3 share
# (largest first).
team_pyver_sorted = team_pyver.div(team_pyver.sum(axis=1), axis=0).sort_values(
    by='Python 3', ascending=False
)
team_pyver_sorted


# In[85]:

# `colors` is the two-color palette defined in In[83].
trace = go.Scatter(
    x=team_pyver_sorted.index,
    y=team_pyver_sorted['Python 3'],
    marker={'color': colors[0]},
    mode='lines+markers',
    line={'width': 2},
)
data = [trace]
layout = go.Layout(title='Team scale VS Use ratio of Python 3')
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# ## How does developer age relate to whether Python 3 is used?
# In[86]:

cols = find_cols(survey_df, ['age', 'range'])
age = survey_df[cols[0]]
age_pyver = pd.crosstab(index=age, columns=python_version)
# Convert each age row from counts to shares of that row.
age_pyver = age_pyver.div(age_pyver.sum(axis=1), axis=0)
age_pyver


# In[87]:

# Define the palette here instead of relying on the value left behind by an
# earlier cell.
colors = [rgb2hex(i) for i in sns.color_palette('rainbow')[:2]]
trace = go.Scatter(
    x=age_pyver.index,
    y=age_pyver['Python 3'],
    marker={'color': colors[0]},
    mode='lines+markers',
    line={'width': 2},
    name='Python 3',
)
data = [trace]
layout = go.Layout(title="The developers' age VS The use ratio of Python 3")
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# In[88]:

country_age = pd.crosstab(
    [
        survey_df['what country do you live in?'],
        survey_df['which version of python do you use the most?'],
    ],
    survey_df['could you tell us your age range?'],
)
# Sum over the Python-version index level to get per-country totals.
# DataFrame.sum(level=...) was removed in pandas 2.0; groupby(level=...) is
# the supported spelling.
country_age_total = country_age.groupby(level=0).sum()
country_age_total.head()


# In[89]:

sorted_country_age_total = country_age_total.sort_values(by='60 or older', ascending=False)
colors = [rgb2hex(i) for i in sns.color_palette('rainbow', 10)]
# Select the column by name (the same key used for sorting) rather than by
# position.
trace = go.Bar(
    x=sorted_country_age_total.index[:10],
    y=sorted_country_age_total['60 or older'].iloc[:10],
    marker={'color': colors},
)
data = [trace]
layout = {'title': 'Top 10 countries of # of the developers whose age are 60+'}
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# In[90]:

three_countries = country_age_total.loc[['United States', 'India', 'China']]
three_countries = three_countries.div(three_countries.sum(axis=1), axis=0)
three_countries


# In[91]:

# One color per age-range column, however many there are.
colors = [rgb2hex(i) for i in sns.color_palette('rainbow', len(three_countries.columns))]
data = [
    go.Bar(
        x=three_countries.index,
        y=three_countries[c],
        marker={'color': colors[i]},
        name=c,
    )
    for i, c in enumerate(three_countries.columns)
]
layout = go.Layout(title="Age distribution of the developers who're from USA, India and China")
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# ## How are Python 3 and Python 2 developers distributed across countries?

# In[92]:

cols = find_cols(survey_df, ['country', 'live'])
countries = survey_df[cols[0]]
count_countries = countries.value_counts(ascending=False)
countries.head()


# In[93]:

# Equivalent to sns.countplot
colors = [rgb2hex(i) for i in sns.color_palette('rainbow', 10)]
trace = go.Bar(x=count_countries.index[:10], y=count_countries[:10], marker={'color': colors})
data = [trace]
layout = go.Layout(title='Top 10 countries of # of the developers')
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# In[94]:

countries_pyver = pd.crosstab(index=countries, columns=python_version)
top10_countries = countries_pyver.loc[countries.value_counts()[:10].index]
top10_countries = top10_countries.div(top10_countries.sum(axis=1), axis=0)
top10_countries.head()


# In[95]:

colors = [rgb2hex(i) for i in sns.color_palette('rainbow')[:2]]
py2 = go.Bar(
    x=top10_countries.index,
    y=top10_countries['Python 2'],
    marker={'color': colors[0]},
    name='Python 2',
)
py3 = go.Bar(
    x=top10_countries.index,
    y=top10_countries['Python 3'],
    marker={'color': colors[1]},
    name='Python 3',
)
data = [py2, py3]
layout = go.Layout(title='Python 2 and Python 3 Usage among Different Countries')
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# In[96]:

countries_pyver_ratio = countries_pyver.div(countries_pyver.sum(axis=1), axis=0)


# In[97]:

# Equivalent to sns.distplot
# Note that hist_data, group_labels and colors must all be lists, with one
# element per data set.
colors = [rgb2hex(i) for i in sns.color_palette('rainbow')[:2]]
fig = ff.create_distplot(
    hist_data=[countries_pyver_ratio['Python 3']],
    group_labels=['Python 3'],
    bin_size=0.05,
    colors=[colors[0]],
)
fig['layout'].update(title='Use ratio of Python 3 in the world')
# The distplot figure is plotted directly; rebuilding `fig` from the previous
# cell's `data`/`layout` would show the top-10 bar chart again instead.
# go.FigureWidget(fig)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# In[98]:

# Equivalent to sns.stripplot
trace = go.Scatter(
    x=countries_pyver_ratio['Python 3'],
    y=['Python 3'] * len(countries_pyver_ratio),
    mode='markers',
    marker={'color': colors[0]},
)
data = [trace]
# go.FigureWidget(data)
fig = go.Figure(data=data)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# In[99]:

fig = {
    "data": [{
        "type": 'violin',
        "y": countries_pyver_ratio['Python 3'],
        "box": {"visible": True},
        "line": {"color": 'black'},
        "meanline": {"visible": True},
        "fillcolor": colors[0],
        "opacity": 0.6,
        # Category label for the single violin; it names the plotted series.
        "x0": 'Python 3',
    }],
    "layout": {
        "title": "Use ratio of Python 3 in the world",
        "yaxis": {
            "zeroline": False,
        },
    },
}
# go.FigureWidget(fig)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# In[100]:

colors = [rgb2hex(i) for i in sns.color_palette('rainbow')[:2]]
trace = go.Violin(
    x=countries_pyver_ratio['Python 3'],
    meanline={'visible': True},
    box={'visible': True},
    fillcolor=colors[0],
    opacity=0.6,
    line={'color': 'black'},
)
layout = go.Layout(title="Use ratio of Python 3 in the world", xaxis={'zeroline': False})
data = [trace]
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# In[101]:

# Projection types:
# ['equirectangular', 'mercator', 'orthographic', 'natural earth',
#  'kavrayskiy7', 'miller', 'robinson', 'eckert4', 'azimuthal equal area',
#  'azimuthal equidistant', 'conic equal area', 'conic conformal',
#  'conic equidistant', 'gnomonic', 'stereographic', 'mollweide', 'hammer',
#  'transverse mercator', 'albers usa', 'winkel tripel', 'aitoff',
#  'sinusoidal']
data = [
    dict(
        type='choropleth',
        locations=countries_pyver_ratio.index,
        locationmode='country names',
        # Shares are scaled to percentages to match the colorbar suffix.
        z=countries_pyver_ratio['Python 3'] * 100,
        text=countries_pyver_ratio.index,
        colorscale='Bluered',
        autocolorscale=False,
        reversescale=True,
        marker=dict(
            line=dict(
                color='rgb(180,180,180)',
                width=0.5,
            ),
        ),
        colorbar=dict(
            # autotick=False,
            ticksuffix='%',
            title='Percent',
        ),
    )
]
layout = dict(
    title='Python 3 in the world',
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection=dict(
            type='equirectangular',
        ),
    ),
)
# fig = go.Figure()
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
# py.iplot(fig)
offline.iplot(fig, show_link=False)


# ## How do developers use IDEs?

# In[102]:

cols = find_cols(survey_df, ['what', 'editor(s)/ide(s)'])
editors = survey_df[cols]
# Keep only the text before the first ':' of each column name as the label.
editors.columns = [item.split(':')[0] for item in editors.columns]
editors.head()


# In[103]:

count_editors = editors.count().sort_values(ascending=False)
count_editors.head()


# In[104]:

colors = [rgb2hex(i) for i in sns.color_palette('rainbow', len(count_editors))]
trace = go.Bar(
    y=count_editors.index[::-1],
    x=count_editors.values[::-1],
    marker={'color': colors[::-1]},
    orientation='h',
)
data = [trace]
layout = go.Layout(
    title="What editor(s)/IDE(s) have you considered for use in your Python development?",
    margin={'r': 10},
    height=1000,
    yaxis={'automargin': True},
)
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
offline.iplot(fig, show_link=False)


# In[105]:

colors = [rgb2hex(i) for i in sns.color_palette('rainbow', len(count_editors))]
trace = go.Bar(
    x=count_editors.index,
    y=count_editors.values,
    marker={'color': colors},
    orientation='v',
)
data = [trace]
layout = go.Layout(
    title="What editor(s)/IDE(s) have you considered for use in your Python development?",
)
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
offline.iplot(fig, show_link=False)


# In[106]:

col = find_cols(survey_df, ['what', 'main', 'editor'])
main_editor = survey_df[col[0]]
count_main_editor = main_editor.value_counts(ascending=False)
count_main_editor.head()


# In[107]:

colors = [rgb2hex(i) for i in sns.color_palette('rainbow', len(count_main_editor))]
trace = go.Bar(
    y=count_main_editor.index[::-1],
    x=count_main_editor.values[::-1],
    marker={'color': colors[::-1]},
    orientation='h',
)
data = [trace]
layout = go.Layout(
    title="What is the main editor you use for your current python development?",
    margin={'r': 10},
    height=1000,
    yaxis={'automargin': True},
)
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
offline.iplot(fig, show_link=False)


# In[108]:

# Size the palette from the series being plotted so every bar gets a color.
colors = [rgb2hex(i) for i in sns.color_palette('rainbow', len(count_main_editor))]
trace = go.Bar(
    x=count_main_editor.index,
    y=count_main_editor.values,
    marker={'color': colors},
    orientation='v',
)
data = [trace]
layout = go.Layout(
    title="What is the main editor you use for your current python development?",
)
# go.FigureWidget(data=data, layout=layout)
fig = go.Figure(data=data, layout=layout)
offline.iplot(fig, show_link=False)


# In[ ]: