%matplotlib inline
from datetime import datetime, time

import numpy as np
import pandas as pd
import pandas.io.sql as sql
import psycopg2 as pg

CRIME_DATE_FORMAT = '%m/%d/%Y %H:%M:%S %p'

from sqlalchemy import create_engine
con = create_engine('postgresql://wesm:foo@localhost:5432/strata_2015')
con

table = pd.read_sql("select * from crimes limit 10;", con)

crimes = pd.read_csv('sf_crimes.csv')
pd.to_datetime(crimes.Date[:1000])

# Passing an explicit format string makes datetime parsing much faster than
# letting pandas infer the format value by value
%timeit pd.to_datetime(crimes.Date[:1000])
%timeit pd.to_datetime(crimes.Date[:1000], format=CRIME_DATE_FORMAT)

def null_on_error(series, converter):
    """Apply converter to each value, mapping conversion failures to NaN."""
    def f(x):
        try:
            return converter(x)
        except (TypeError, ValueError):
            return np.nan
    return series.map(f)

def number_with_comma(x):
    try:
        return float(x.replace(',', ''))
    except (ValueError, AttributeError):
        return np.nan

def to_time(x):
    h, m = x.split(':')
    return time(int(h), int(m))

crimes = pd.read_csv('sf_crimes.csv')
crimes.columns = crimes.columns.map(str.lower)
crimes.date = pd.to_datetime(crimes.date, format=CRIME_DATE_FORMAT)
crimes.time = crimes.time.map(to_time)

companies = pd.read_csv('companies.csv',
                        parse_dates=['founded_at', 'first_funding_at'])
# The raw column name carries stray whitespace; pop it out and clean the values
companies['funding_total_usd'] = null_on_error(
    companies.pop(' funding_total_usd '), number_with_comma)

investments = pd.read_csv('investments.csv')
acquisitions = pd.read_csv('acquisitions.csv')
rounds = pd.read_csv('rounds.csv')

companies.columns = companies.columns.map(str.strip)
rounds.columns = rounds.columns.map(str.strip)
investments.columns = investments.columns.map(str.strip)
acquisitions.columns = acquisitions.columns.map(str.strip)

acquisitions['price_amount'] = null_on_error(acquisitions.price_amount,
                                             number_with_comma)

crimes.loc[0]

# Series.order() was removed in later pandas; sort_values() is the equivalent
crimes.groupby('category').size().sort_values(ascending=False)

query = """
select category, count(*) as occs
from crimes
group by 1
order by occs desc;
"""
pd.read_sql(query, con)[:10]

def top(x, k=10):
    return x.sort_values(ascending=False)[:k]

crimes.groupby([crimes.date.dt.year,
                crimes.time.map(lambda x: x.hour)]).size()

query = """
select extract(year from date) as year,
       extract(hour from time) as hourofday,
       count(*) as occs
from crimes
group by 1, 2;
"""
sql.read_sql(query, con)

crimes[:1].T

crimes.groupby('category').size().sort_values(ascending=False)

category_mapping = {
    'LARCENY/THEFT': 'THEFT',
    'ROBBERY': 'THEFT',
    'VEHICLE THEFT': 'THEFT'
}
# dict.get(x, x) leaves categories without a mapping unchanged
coarse_cat = crimes.category.map(lambda x: category_mapping.get(x, x))
top(crimes.groupby(coarse_cat).size(), 20)

# crimes.category.map(category_mapping.__getitem__)
# Unlike the .get version above, this raises KeyError for any category
# missing from the mapping
crimes.category.map(lambda x: category_mapping[x])

query = """
select
  case when category in ('LARCENY/THEFT', 'ROBBERY', 'VEHICLE THEFT')
       then 'THEFT'
       else category
  end as coarse_cat,
  count(*) as occs
from crimes
group by 1
order by occs desc;
"""
sql.read_sql(query, con)[:10]

keys = [crimes.date.dt.year, crimes.time.map(lambda x: x.hour)]
counts = crimes.groupby(keys).size()
counts = counts.unstack(0)
share = counts / counts.sum()
share.plot(kind='bar')

crimes['coarse_cat'] = coarse_cat
thefts = crimes[crimes.coarse_cat == 'THEFT']
thefts.groupby(thefts.date.dt.weekday).size()

day_names = pd.Series(['Weekday'] * 5 + ['Weekend'] * 2)
day_names
day_type = day_names.take(thefts.date.dt.weekday)
thefts.groupby(day_type.values).size()
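# An alternative labeling sketch (not from the original notebook): derive the
# weekday/weekend labels directly with numpy.where instead of the take() trick
# above; weekday() returns 0-4 for Monday-Friday
day_type2 = np.where(thefts.date.dt.weekday < 5, 'Weekday', 'Weekend')
thefts.groupby(day_type2).size()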
query = """
with t0 as (
  select descript, count(*) as occs
  from crimes
  group by 1
  order by occs desc
  limit 5
)
select descript, dayofweek, count(*) as occs
from crimes
where descript in (select descript from t0)
group by 1, 2
order by descript, dayofweek;
"""
sql.read_sql(query, con)

query = """
select c.descript, c.dayofweek, count(*) as occs
from crimes c
inner join (
  select descript, count(*) as occs
  from crimes
  group by 1
  order by occs desc
  limit 5
) c2
  on c.descript = c2.descript
group by 1, 2;
"""
sql.read_sql(query, con)

counts = crimes.groupby('descript').size()
counts
counts.sort_values(ascending=False)[:5].index

# this could be very large
K = 5
counts = crimes.groupby('descript').size()
top_descripts = counts.sort_values(ascending=False)[:K].index
top_descripts

filtered = crimes[crimes.descript.isin(top_descripts)]
result = (filtered
          .groupby(['descript', filtered.time.map(lambda x: x.hour)])
          .size())
# result
result.unstack('descript')

acquisitions.price_amount.isnull().value_counts()
acqs_known = acquisitions[acquisitions.price_amount.notnull()]
acqs_known[acqs_known.price_amount > acqs_known.price_amount.mean()]

query = """
select *
from acquisitions a
where price_amount > (
  select avg(price_amount) from acquisitions
);
"""
sql.read_sql(query, con)

companies.loc[0]
rounds.loc[0]
len(companies)

query = """
select status, count(*) as count
from companies c
where not exists (
  select 1
  from rounds r
  where c.permalink = r.company_permalink
    and r.funding_round_type = 'venture'
)
and c.founded_at > '2010-01-01'
group by 1;
"""
sql.read_sql(query, con)

rounds[:20]

companies_with_round = rounds.company_permalink.unique()
companies.permalink.isin(companies_with_round)

query = """
select year, hour, occs,
       -- cast to avoid bigint integer division, which would truncate
       -- every share to zero
       occs::float / sum(occs) over (partition by year) as share
from (
  select extract(year from date) as year,
         extract(hour from time) as hour,
         count(*) as occs
  from crimes
  group by 1, 2
) t0;
"""
results = sql.read_sql(query, con)
results

keys = [crimes.date.dt.year, crimes.time.map(lambda x: x.hour)]
counts = crimes.groupby(keys).size()
counts

counts.name = 'count'
counts.index.names = ['year', 'hour']
counts2 = counts.reset_index()
counts2

# add_share mutates each group frame in place and returns it
def add_share(x):
    x['share'] = x['count'] / x['count'].sum()
    return x

counts2.groupby('year').apply(add_share)

rounds[:1].T

query = """
select company_permalink, funded_at,
       sum(case when raised_amount_usd is null then 0
                else raised_amount_usd end)
         over (partition by company_permalink order by funded_at)
         as cumulative_raised
from rounds
order by company_permalink;
"""
sql.read_sql(query, con)
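# A pandas sketch of the same running total (not in the original notebook),
# assuming rounds.csv carries the same raised_amount_usd column as the rounds
# table: parse funded_at, sort within each company, fill missing amounts with
# zero, and take a per-company cumulative sum
r = rounds.copy()
r['funded_at'] = pd.to_datetime(r['funded_at'], errors='coerce')
r = r.sort_values(['company_permalink', 'funded_at'])
r['cumulative_raised'] = (r['raised_amount_usd'].fillna(0)
                          .groupby(r['company_permalink']).cumsum())
r[['company_permalink', 'funded_at', 'cumulative_raised']]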
acqs_known.price_amount.describe()

# Renamed from `top` to avoid shadowing the top() helper defined earlier
price_cap = 1e9
acqs = acqs_known[(acqs_known.price_amount <= price_cap) &
                  (acqs_known.price_amount > 0)]

pd.cut(acqs.price_amount, 4)

bins = pd.cut(acqs.price_amount,
              [0, price_cap / 4, price_cap / 2, 3 * price_cap / 4, price_cap])
acqs.groupby(bins).size()

# Get quantile numbers
(pd.qcut(acqs.price_amount, 4, labels=False) + 1).value_counts()

quartiles = pd.qcut(acqs.price_amount, 4, labels=False)
grouped = acqs.groupby(quartiles)

def f(x):
    return pd.Series({
        'lb': x.price_amount.min(),
        'ub': x.price_amount.max(),
        'num': len(x)
    })

grouped.apply(f)

query = """
with t0 as (
  select company_permalink, price_amount
  from acquisitions a
  where price_amount is not null
    and price_amount <= 1000000000
)
select t2.bucket, sum(t2.price_amount), count(*)
from (
  select t0.company_permalink, t0.price_amount,
         ceil((t0.price_amount - lb) / ((ub - lb) / 4)) as bucket
  from t0
  cross join (
    select min(price_amount) - 1e-14 as lb,
           max(price_amount) as ub
    from t0
  ) t1
) t2
group by 1;
"""
sql.read_sql(query, con)

query = """
select bucket,
       min(price_amount) as lb,
       max(price_amount) as ub,
       count(*)
from (
  select ntile(4) over (order by price_amount) as bucket,
         price_amount
  from acquisitions a
  where price_amount is not null
    and price_amount < 1000000000
) t0
group by 1
order by bucket;
"""
sql.read_sql(query, con)
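# Sanity-check sketch (not in the original): ntile(4) and pd.qcut should
# yield approximately matching quartile boundaries, since both split the
# sorted prices into four roughly equal-count groups (they can differ at
# ties). Note the SQL above filters with price_amount < 1e9 and no lower
# bound, so rebuild the subset to match before comparing
acqs2 = acqs_known[acqs_known.price_amount < 1e9]
buckets = pd.qcut(acqs2.price_amount, 4, labels=False) + 1
acqs2.groupby(buckets).price_amount.agg(['min', 'max', 'count'])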