%load_ext sql %sql postgresql://:@/pgo %sql SELECT * FROM yob2010 LIMIT 5 %%sql SELECT * FROM yob2010 LIMIT 5 names = %sql SELECT * FROM yob2010 names[2] names[2].sex import pandas as pd !head sample_data/yob2000.txt yob2000 = pd.read_csv('sample_data/yob2000.txt', names=['name','sex','n_births']) yob2000.head() pd.read_excel yob2000.head(6).T yob2000.head(6).T.sum(axis=1) ls sample_data/yob*.txt all_names = pd.DataFrame() for yr in range(2000, 2011): one_yr = pd.read_csv('sample_data/yob%d.txt' % yr, names=['name','sex','n_births']) one_yr['year'] = yr all_names = all_names.append(one_yr, ignore_index=True) all_names.head() all_names.tail() lesleys = all_names[all_names.name.str.startswith('Lesl')] lesleys lesleys.pivot_table('n_births', rows='year', cols='sex', aggfunc=sum) gender = all_names.pivot_table('n_births', rows='name', cols='sex', aggfunc=sum) gender gender['total'] = gender.F + gender.M gender['ratio'] = gender.F / gender.total gender.head() gender['andro'] = -1 * (abs(0.5 - gender.ratio)) gender.sort('andro', ascending=False)[ gender.total > 10000].head() states = pd.read_csv('sample_data/states.csv') states states.drop_duplicates(['state','capitol']) states.drop_duplicates(['state','capitol'],take_last=True) %sql SELECT count(*) FROM yob2010 import sqlalchemy as sa db_engine = sa.create_engine('postgresql://:@/pgo') allpeople = pd.read_sql_table('yob2010', db_engine) xmen = allpeople[allpeople.name.str.startswith('X')] xmen.to_sql('xmen', db_engine) %sql SELECT * FROM xmen LIMIT 5 chris = %sql SELECT * FROM yob2010 WHERE name LIKE 'Chris%' chris[:5] chrises = chris.DataFrame() chrises.head() %config SqlMagic.autopandas = True erics = %sql SELECT * FROM yob2010 WHERE name LIKE 'Eric%' erics.head() %sql SELECT count(*) FROM erics %sql PERSIST erics %sql SELECT * FROM erics LIMIT 5 cat sample_data/animals.csv !ddlgenerator postgresql sample_data/animals.csv !ddlgenerator --inserts postgresql sample_data/animals.csv | psql pgo %sql SELECT * FROM animals !ddlgenerator postgresql --inserts http://en.wikipedia.org/wiki/List_of_cities_in_Ohio | psql pgo > /dev/null %sql SELECT * FROM list_of_cities_in_ohio LIMIT 5; !cat sample_data/knights.yaml !ddlgenerator postgresql sample_data/knights.yaml !ddlgenerator --inserts postgresql sample_data/knights.yaml | psql pgo %sql select * from knights %sql select * from followers import matplotlib matplotlib.use('nbagg') lesleys.head() lesley_summary = lesleys.pivot_table('n_births', rows='year', cols='sex', aggfunc=sum) lesley_summary lesley_summary.plot() lesleys.pivot_table('n_births', rows='name', cols='sex', aggfunc='sum') lesleys.pivot_table('n_births', rows='name', cols='sex', aggfunc='sum' ).plot(kind='barh', stacked=True) yob2000.head() yob2000[yob2000.n_births > 1000].hist(bins=10) gsum = gender[(gender.total > 10000) & (gender.andro > -0.3)] \ .sort('andro', ascending=False) gsum.head() gsum['len'] = gsum.index.map(len) gsum.plot(kind='scatter', x='len', y='ratio', s=gsum.total/100, alpha=0.3) import mpld3 mpld3.enable_notebook() gsum.plot(kind='scatter', x='len', y='ratio', s=gsum.total/100, alpha=0.3) import matplotlib.pyplot as plt fig, ax = plt.subplots() points = ax.scatter(gsum.len, gsum.ratio, s=gsum.total/100, alpha=0.3) tooltip = mpld3.plugins.PointLabelTooltip(points, list(gsum.index)) mpld3.plugins.connect(fig, tooltip) yob2000_common = yob2000[yob2000.n_births > 5000] hacknied_gender_color_map = {'F': 'red', 'M': 'blue'} hgcm = hacknied_gender_color_map yob2000_common.sex.map(hgcm) import bokeh.plotting as bp bp.scatter(yob2000_common.name.str.len(), yob2000_common.n_births, color=yob2000_common.sex.map(hgcm), fill_alpha=0.2, size=10) bp.output_notebook() bp.curplot().title = "Name length vs. number of births" bp.show()