%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('max_columns', 50)
# this would be a huge pain to load into a database
mo = pd.read_csv('data/mariano-rivera.csv')
mo.tail()
pandas can fetch data from a URL ...
clean = lambda s: s.replace('$', '')[:-1] if '.' in s else s.replace('$', '') # strip the '$' and, when there's a decimal, the trailing period
url = 'https://raw.github.com/gjreda/best-sandwiches/master/data/best-sandwiches-geocode.tsv'
sandwiches = pd.read_table(url, sep='\t', converters={'price': lambda s: float(clean(s))})
sandwiches.head(3)
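A quick sanity check on that converter - the prices appear to be strings like '$10' or '$9.50.' (with a trailing period after decimals); the examples below are made up to illustrate.
for s in ['$10', '$9.50.']:
    print(s, '->', float(clean(s)))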
gh = pd.read_json('https://api.github.com/repos/pydata/pandas/issues?per_page=3')
gh[['body', 'created_at', 'title', 'url']].head(3)
You'll likely need to do some parsing, though - pandas' read_json doesn't do well with nested JSON yet.
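If you do hit nested JSON, one option (sketched here) is to pull the raw JSON down yourself and flatten it with json_normalize - it lives at the top level in newer pandas (pandas.io.json.json_normalize in older versions).
import requests
raw = requests.get('https://api.github.com/repos/pydata/pandas/issues?per_page=3').json()
issues = pd.json_normalize(raw)  # nested fields become dotted columns, e.g. 'user.login'
issues[['number', 'title', 'user.login']].head(3)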
Possibly my favorite way to read data into pandas ...
clip = pd.read_clipboard()
clip.head()
# All of this is basically the same as it would be with Postgres, MySQL, or any other database
# Just pass pandas a connection object and it'll take care of the rest.
import sqlite3
conn = sqlite3.connect('data/towed.db')
query = "SELECT * FROM towed"
towed = pd.read_sql(query, con=conn, parse_dates={'date': '%m/%d/%Y'})
towed.head()
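Writing back out is just as painless - a quick sketch using a throwaway table name.
towed.to_sql('towed_backup', conn, if_exists='replace', index=False)  # 'towed_backup' is just for illustration
pd.read_sql("SELECT COUNT(*) FROM towed_backup", con=conn)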
We're going to work with the towed dataset for a bit. We've already been using .head(), but there's also .tail(). You can also use standard Python slicing.
towed[100:105]
towed.info() # empty string showing up as non-null value
mo.info() # note the nulls in the Awards column
mo.describe() # basic stats for any numeric column
towed.set_index('date', inplace=True)
# SELECT *
# FROM towed
# WHERE date = '2014-11-04'
# LIMIT 5;
towed.loc['2014-11-04']
towed.loc['2014-11-04', 'make'] # get a Series back (or individual values, if unique)
# SELECT *
# FROM towed
# WHERE date = '2014-11-04';
towed.reset_index(inplace=True)
towed[towed['date'] == '2014-11-04']
towed.set_index('date', inplace=True)
cols = ['make', 'color', 'state'] # just passing a list
towed[cols][:5]
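And since date is the index here, .loc takes rows and columns in one shot - roughly SELECT make, color, state FROM towed WHERE date = '2014-11-04'.
towed.loc['2014-11-04', cols].head()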
# SELECT *
# FROM towed
# WHERE make = 'FORD'
# ORDER BY state DESC
# LIMIT 5;
towed[towed['make'] == 'FORD'].sort_values('state', ascending=False).head()
# SELECT *
# FROM towed
# WHERE color = 'BLK'
# AND state = 'MI';
towed[(towed.color == 'BLK') & (towed.state == 'MI')]
# SELECT *
# FROM towed
# WHERE make = 'PORS'
# OR state = 'NY'
# ORDER BY make, color DESC;
towed[(towed.make == 'PORS') | (towed.state == 'NY')].sort_values(['make', 'color'], ascending=[True, False]) # note the red Porsche
m = (towed.style == '4D') & (towed.state == 'IL') # create a filter mask
m.value_counts() # it's just a boolean series
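Because True and False behave like 1 and 0, you get counts (and the complement) for free.
m.sum()       # number of rows passing the filter
(~m).sum()    # ~ negates the mask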
# SELECT make, model, color
# FROM towed
# WHERE style = '4D' AND state = 'IL'
# ORDER BY date
# LIMIT 5;
towed[m][['make', 'model', 'color']].sort_index().head() # date is the index, so this matches ORDER BY date
towed[towed.duplicated()] # this would suck in SQL ... but that's also what primary keys are for
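Dropping them is one call - keep='first' (the default) keeps the first occurrence of each duplicated row.
towed.drop_duplicates().info()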
# thousands-separator commas in the tripduration field ... that'd be really annoying to load into a DB
!egrep "[0-9],[0-9]+" data/divvy/Divvy_Trips_2013.csv | head -n 5
! head -n 5 data/divvy/Divvy_Stations_2013.csv
! head -n 5 data/weather.csv
from datetime import datetime
to_datetime = lambda d: datetime.strptime(d, '%m/%d/%Y %H:%M')
# if you know the datetime format, it's much, much faster to explicitly parse it
# see here: https://gist.github.com/gjreda/7433f5f70299610d9b6b
trips = pd.read_csv('data/divvy/Divvy_Trips_2013.csv',
converters={'starttime': to_datetime, 'stoptime': to_datetime},
thousands=',')
stations = pd.read_csv('data/divvy/Divvy_Stations_2013.csv')
trips.info()
stations.info()
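A rough illustration of why the explicit format matters - generate some throwaway timestamp strings in the same format and time both approaches (exact numbers will vary by machine and pandas version).
rng = pd.date_range('2013-01-01', periods=10000, freq='T')
sample = pd.Series(rng.strftime('%m/%d/%Y %H:%M'))
%timeit pd.to_datetime(sample)                            # pandas has to infer the format
%timeit pd.to_datetime(sample, format='%m/%d/%Y %H:%M')   # explicit format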
# CREATE TABLE divvy AS (
# SELECT *
# FROM trips
# LEFT JOIN stations
# ON trips.from_station_name = stations.name
# );
divvy = pd.merge(trips, stations, how='left', left_on='from_station_name', right_on='name')
divvy.info()
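A LEFT JOIN leaves NaNs where a trip's station didn't match, so it's worth a quick sanity check before moving on.
divvy[divvy['name'].isnull()].shape[0]  # unmatched trips - ideally 0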
# SELECT trip_id, ... divvy.name AS name_origin ... stations.name AS name_dest
# FROM divvy
# INNER JOIN stations
# ON divvy.to_station_name = stations.name
divvy = pd.merge(divvy, stations, how='inner', left_on='to_station_name', right_on='name', suffixes=['_origin', '_dest'])
divvy.info()
df1 = pd.DataFrame({'a': np.random.randn(3), 'b': np.square(range(100,103))})
df2 = pd.DataFrame({'a': [np.nan, 11, 99], 'b': np.random.randn(3)})
# SELECT a, b
# FROM df1
# UNION
# SELECT a, b
# FROM df2;
# takes a list, so you can pass 2+ dataframes
# note that the indexes are the same
pd.concat([df1, df2])
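One caveat: pd.concat is really UNION ALL - SQL's UNION de-duplicates. Chain .drop_duplicates() to mimic that, and pass ignore_index=True if you don't want the repeated 0-1-2 index.
pd.concat([df1, df2], ignore_index=True).drop_duplicates()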
# the data is precise to the minute - we want counts by the nearest hour
divvy.starttime.value_counts().sort_values()[:3]
# I don't want to just extract the hour from the datetime ... NEAREST hour
from datetime import timedelta
def round_time(dt, mins=60):
"""
Rounds a datetime object to its nearest hour (by default).
(e.g. 2014-02-01 10:45:13 becomes 2014-02-01 11:00:00)
"""
dt += timedelta(minutes=mins/2.)
dt -= timedelta(minutes=dt.minute % mins, seconds=dt.second)
return dt
divvy['starthour'] = divvy.starttime.apply(round_time)
divvy['stophour'] = divvy.stoptime.apply(round_time)
divvy[['starttime', 'starthour']].head(3)
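Newer pandas (0.18+) also has a vectorized Series.dt.round, which should agree with round_time up to tie-breaking at exactly half past the hour - a sketch (pd.to_datetime first, since our converter left plain Python datetimes behind).
rounded = pd.to_datetime(divvy.starttime).dt.round('H')
rounded.head(3)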
pandas' groupby draws largely from Hadley Wickham's "The Split-Apply-Combine Strategy for Data Analysis" (it's a good paper; you should read it).
Count vs Size
.count() tallies non-null values per column, while .size() counts rows, nulls included - a quick demo follows below.
# SELECT starthour, count(1)
# FROM divvy
# GROUP BY starthour
divvy.groupby('starthour').size()
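The demo - .size() and .count() diverge wherever a column has missing data (gender is blank for many rows in this dataset).
grouped = divvy.groupby('usertype')
pd.DataFrame({'rows': grouped.size(), 'non_null_gender': grouped['gender'].count()})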
# SELECT starthour, avg(tripduration)
# FROM divvy
# GROUP BY starthour
# ORDER BY avg(tripduration) DESC
# LIMIT 5
divvy.groupby('starthour')['tripduration'].mean().sort_values(ascending=False)[:5]
divvy.groupby('usertype').agg({'birthyear': pd.Series.nunique, 'tripduration': [np.mean, np.median]})
# number of trips started by starthour
divvy.groupby('starthour').size().plot(figsize=(16,8))
# distribution of tripduration
divvy.tripduration.hist(figsize=(16,8), bins=1000)
plt.xlim(0, 10000);
Sounds like we need to plot the cumulative distribution of trip duration.
(If someone knows of a better way of doing this, I'd love to hear it).
duration_counts = divvy.tripduration.value_counts()
duration_counts.index.name = 'seconds'
duration_counts.name = 'trips'
duration_counts.head()
df = duration_counts.reset_index()
df['minutes'] = df.seconds/60.
df.set_index('minutes', inplace=True)
df.sort_index(inplace=True)
(df.trips.cumsum() / df.trips.sum()).plot(figsize=(16,8))
plt.xlim(0, 60)
plt.yticks(np.arange(0, 1.1, 0.1));
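One alternative sketch: sort the durations and plot rank / n directly, which skips the value_counts dance entirely.
d = divvy.tripduration.sort_values() / 60.  # minutes
plt.figure(figsize=(16, 8))
plt.plot(d.values, np.arange(1, len(d) + 1) / float(len(d)))
plt.xlim(0, 60)
plt.yticks(np.arange(0, 1.1, 0.1));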
plt.figure(figsize=(9, 18))
divvy.groupby('birthyear').size().sort_values().plot(kind='barh')
divvy.groupby(divvy['starttime'].apply(lambda d: d.dayofweek))['trip_id'].count().plot(kind='bar')
plt.title('Divvy trips by weekday') # 0 = Monday ...
plt.xlabel('Weekday')
plt.ylabel('# of trips started');
Plotting multiple lines and subplotting.
divvy['startdate'] = divvy.starthour.apply(lambda d: d.date())
by_gender = divvy.groupby(['startdate', 'gender']).size()
by_gender.head()
by_gender.unstack(1).head()
pandas wins here.
# SELECT startdate
# , COUNT(IF(gender = 'Female', 1, NULL))
# , COUNT(IF(gender = 'Male', 1, NULL))
# FROM divvy
# GROUP BY startdate
# LIMIT 5;
divvy.groupby(['startdate', 'gender']).size().unstack(1).head()
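pivot_table gets you the same table in one call - a sketch, assuming trip_id is never null (so count matches size).
pd.pivot_table(divvy, index='startdate', columns='gender',
               values='trip_id', aggfunc='count').head()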
by_gender.unstack(1).plot(figsize=(16,8))
divvy.groupby(['startdate', 'usertype']).size().unstack(1).plot(figsize=(16,8), subplots=True);
weekdays = divvy['starttime'].apply(lambda d: d.dayofweek)
hours = divvy['starttime'].apply(lambda d: d.hour)
by_weekday_hour = divvy.groupby([weekdays, hours])['trip_id'].count()
by_weekday_hour.index.names = ['weekday', 'hour'] # rename MultiIndex
by_weekday_hour.unstack(0).plot(figsize=(16,8))
plt.title('Trips by weekday hour')
plt.ylabel('# of trips started')
plt.xlabel('Hour of day')
plt.xticks(range(24))
plt.xlim(0, 23);