Will your plane be late tomorrow? We'll use Pearson Correlation over a 70 million flights dataset.
Watch these queries live at http://youtu.be/tqS4vZ2Rxlo
import pandas as pd
import numpy
First let's learn what is Pearson Correlation.
# temperature vs ice_cream, fill the series
temperatures = pd.Series([21,30,16,18])
icecreams_sold = pd.Series([70,90,35,40])
temperatures.plot()
icecreams_sold.plot()
<matplotlib.axes.AxesSubplot at 0x43be250>
# x,y plot
plt.scatter(temperatures, icecreams_sold)
<matplotlib.collections.PathCollection at 0x452e690>
# calc correlation
temperatures.corr(icecreams_sold)
0.9479496280955706
# negative correlation? fill the series
coats_sold = pd.Series([25, 10, 47, 40])
plt.scatter(temperatures, coats_sold)
<matplotlib.collections.PathCollection at 0x4cf9510>
# calc corr
coats_sold.corr(temperatures)
-0.96746634288595723
#bigquery imports
import bq
import datetime
import pandas as pd
client = bq.Client.Get()
We have a cool web UI too! You can run all the following queries at https://bigquery.cloud.google.com/.
# first 10 days of NY
query = """
SELECT date, departure_state, AVG(departure_delay) avg , COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
WHERE departure_state = 'NY'
GROUP BY 1,2 HAVING c > 5
ORDER BY 1
LIMIT 10
"""
fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000)
data[0:10]
Waiting on bqjob_r797746b2216394db_00000140f53399a7_1 ... (0s) Current status: DONE
[[u'2002-01-01', u'NY', u'6.125506072874494', u'494'], [u'2002-01-02', u'NY', u'11.340782122905027', u'537'], [u'2002-01-03', u'NY', u'16.863636363636363', u'528'], [u'2002-01-04', u'NY', u'8.84981684981685', u'546'], [u'2002-01-05', u'NY', u'5.15483870967742', u'465'], [u'2002-01-06', u'NY', u'12.728880157170924', u'509'], [u'2002-01-07', u'NY', u'9.858757062146893', u'531'], [u'2002-01-08', u'NY', u'3.478181818181818', u'550'], [u'2002-01-09', u'NY', u'3.996389891696751', u'554'], [u'2002-01-10', u'NY', u'1.3776978417266188', u'556']]
# last 10 days of NY
query = """
SELECT date, departure_state, AVG(departure_delay) avg , COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
WHERE departure_state = 'NY'
GROUP BY 1,2 HAVING c > 5
ORDER BY 1 DESC
LIMIT 10
"""
fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000)
data[0:10]
Waiting on bqjob_r54972aee4745abad_00000140f534278a_2 ... (0s) Current status: DONE
[[u'2012-12-31', u'NY', u'7.752791068580542', u'627'], [u'2012-12-30', u'NY', u'22.181286549707604', u'684'], [u'2012-12-29', u'NY', u'23.13673805601318', u'607'], [u'2012-12-28', u'NY', u'17.09874826147427', u'719'], [u'2012-12-27', u'NY', u'40.883156297420335', u'659'], [u'2012-12-26', u'NY', u'40.486486486486484', u'666'], [u'2012-12-25', u'NY', u'5.826860841423948', u'618'], [u'2012-12-24', u'NY', u'7.156704361873991', u'619'], [u'2012-12-23', u'NY', u'16.78826895565093', u'699'], [u'2012-12-22', u'NY', u'22.182527301092044', u'641']]
# plot NY by day
query = """
SELECT date, departure_state, AVG(departure_delay) avg , COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
WHERE departure_state = 'NY'
GROUP BY 1,2 HAVING c > 5
ORDER BY 1
"""
fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000)
for x in data:
x[2] = float(x[2])
frame = pd.DataFrame(data)
xx=frame.plot()
xx.set_xticklabels([frame[0][min(int(x), len(frame[0])-1)][0:7] for x in xx.get_xticks()])
Waiting on bqjob_r8588c34d5409c7d_00000140f53457fd_3 ... (0s) Current status: DONE
[<matplotlib.text.Text at 0x387ed50>, <matplotlib.text.Text at 0x388f050>, <matplotlib.text.Text at 0x3866090>, <matplotlib.text.Text at 0x3866710>, <matplotlib.text.Text at 0x3866c50>, <matplotlib.text.Text at 0x39ce310>]
# too much data, plot NY by month
query = """
SELECT left(date, 7) date, departure_state, AVG(departure_delay) avg , COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
WHERE departure_state = 'NY'
GROUP BY 1,2 HAVING c > 5
ORDER BY 1
"""
fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000)
for x in data:
x[2] = float(x[2])
frame = pd.DataFrame(data)
xx=frame.plot()
xx.set_xticklabels([frame[0][min(int(x), len(frame[0])-1)][0:7] for x in xx.get_xticks()])
""
Waiting on bqjob_r258ba2c835b8073d_00000140f5347c23_4 ... (0s) Current status: DONE
''
#comparing NY to NJ
query = """
SELECT a.date, a.departure_state, a.avg, b.departure_state, b.avg FROM
(
SELECT left(date, 7) date, departure_state, AVG(departure_delay) avg , COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
WHERE departure_state = 'NY'
GROUP BY 1,2 HAVING c > 5) a
JOIN
(
SELECT left(date, 7) date, departure_state, AVG(departure_delay) avg , COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
WHERE departure_state = 'NJ'
GROUP BY 1,2 HAVING c > 5) b
ON a.date=b.date
ORDER BY 1
"""
fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000)
for x in data:
x[2] = float(x[2])
x[4] = float(x[4])
data[0:30]
Waiting on bqjob_r6576de1edb3d4e37_00000140f584a723_17 ... (0s) Current status: DONE
[[u'2002-01', u'NY', 4.224489795918367, u'NJ', 2.66523400191022], [u'2002-02', u'NY', 2.6769026254569623, u'NJ', 2.0702388637830857], [u'2002-03', u'NY', 5.243683589138135, u'NJ', 3.5450068712780576], [u'2002-04', u'NY', 4.026859869234847, u'NJ', 3.697752548656163], [u'2002-05', u'NY', 4.3853169432957575, u'NJ', 2.080332409972299], [u'2002-06', u'NY', 7.976488302061617, u'NJ', 6.910594439117929], [u'2002-07', u'NY', 7.509252120277564, u'NJ', 5.460073428178063], [u'2002-08', u'NY', 5.807095465262927, u'NJ', 3.749971869022167], [u'2002-09', u'NY', 1.3849529046978295, u'NJ', 0.6082779009608278], [u'2002-10', u'NY', 2.0596776854958474, u'NJ', 2.9180081065431382], [u'2002-11', u'NY', 3.709320788003528, u'NJ', 2.44229328003997], [u'2002-12', u'NY', 10.089354760753988, u'NJ', 8.9136109384711], [u'2003-01', u'NY', 5.021275671920455, u'NJ', 4.340754431655885], [u'2003-02', u'NY', 8.710468502528993, u'NJ', 5.20531164249314], [u'2003-03', u'NY', 4.462194599228461, u'NJ', 5.039822658538516], [u'2003-04', u'NY', 2.7360550999139064, u'NJ', 2.393997498957899], [u'2003-05', u'NY', 2.407901907356948, u'NJ', 4.340288924558587], [u'2003-06', u'NY', 4.321551843158912, u'NJ', 5.14180790960452], [u'2003-07', u'NY', 6.881477544512357, u'NJ', 6.968390579766841], [u'2003-08', u'NY', 12.440087743350698, u'NJ', 14.168986083499005], [u'2003-09', u'NY', 1.8673290749940856, u'NJ', 7.626663210644548], [u'2003-10', u'NY', 2.1025048854148163, u'NJ', 2.703413885692367], [u'2003-11', u'NY', 5.279374471682164, u'NJ', 5.883586525759577], [u'2003-12', u'NY', 10.662271710771524, u'NJ', 11.625710285761343], [u'2004-01', u'NY', 9.86022422617597, u'NJ', 7.841423684639192], [u'2004-02', u'NY', 4.871669700479894, u'NJ', 4.293862874887636], [u'2004-03', u'NY', 5.9521935188015895, u'NJ', 7.606765012665773], [u'2004-04', u'NY', 5.019260106788711, u'NJ', 7.0949304552709584], [u'2004-05', u'NY', 7.749483510597598, u'NJ', 10.789101203113942], [u'2004-06', u'NY', 8.488849516619805, u'NJ', 11.050073637702504]]
#plot NJ vs NY
frame = pd.DataFrame(data)
xx=frame.plot()
xx.set_xticklabels([frame[0][min(int(x), len(frame[0])-1)][0:7] for x in xx.get_xticks()])
[<matplotlib.text.Text at 0x3876090>, <matplotlib.text.Text at 0x43c8490>, <matplotlib.text.Text at 0x4677c50>, <matplotlib.text.Text at 0x4657310>, <matplotlib.text.Text at 0x4657850>, <matplotlib.text.Text at 0x4657ed0>, <matplotlib.text.Text at 0x4658890>, <matplotlib.text.Text at 0x4658f50>]
#scatter plot NY vs NJ
plt.scatter(frame[2],frame[4])
plt.xlabel('avg NY delay in minutes')
plt.ylabel('avg NJ delay in minutes')
frame[2].corr(frame[4])
0.86342001714668237
# repeat NJ vs HI
query = """
SELECT a.date, a.departure_state, b.departure_state, a.avg, b.avg corr
FROM
(SELECT LEFT(date, 7) date, departure_state , AVG(departure_delay) avg , COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
WHERE departure_state = 'NY'
GROUP BY 1,2 HAVING c > 5 ) a
JOIN
(SELECT LEFT(date, 7) date, departure_state, AVG(departure_delay) avg, COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
WHERE departure_state = 'HI'
GROUP BY 1,2 HAVING c > 5 ) b
ON a.date=b.date
ORDER BY a.date;
"""
fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000)
for x in data:
x[3] = float(x[3])
x[4] = float(x[4])
Waiting on bqjob_r44518860fd483d1_00000140f5859af8_18 ... (0s) Current status: DONE
#NY vs HI
frame = pd.DataFrame(data)
xx=frame.plot()
xx.set_xticklabels([frame[0][min(int(x), len(frame[0])-1)][0:7] for x in xx.get_xticks()])
frame[3].corr(frame[4])
0.13278480069610055
plt.scatter(frame[3],frame[4])
plt.xlabel('avg NY delay in minutes')
plt.ylabel('avg Hawaii delay in minutes')
print "Correlation: %s" % frame[3].corr(frame[4])
Correlation: 0.132784800696
# The best correlations
query = """SELECT a.departure_state, b.departure_state, corr(a.avg, b.avg) corr, COUNT(*) c
FROM
(SELECT date, departure_state, AVG(departure_delay) avg , COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
GROUP BY 1,2 HAVING c > 5
) a
JOIN
(SELECT date, departure_state ,
AVG(departure_delay) avg, COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights]
GROUP BY 1,2 HAVING c > 5 ) b
ON a.date=b.date
WHERE a.departure_state < b.departure_state
GROUP EACH BY 1, 2
HAVING c > 5
ORDER BY corr DESC;"""
fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000)
data[0:30]
Waiting on bqjob_r4c1bdc992d6e95dc_00000140f5362253_7 ... (0s) Current status: DONE
[[u'NJ', u'NY', u'0.8338301570594732', u'3984'], [u'NC', u'VA', u'0.8245167702974038', u'3987'], [u'IL', u'WI', u'0.815321475058135', u'3987'], [u'IN', u'OH', u'0.8000256762959715', u'3987'], [u'KY', u'OH', u'0.7996098888549095', u'3987'], [u'MD', u'VA', u'0.7994796773926516', u'3982'], [u'FL', u'NC', u'0.7894417732293801', u'3987'], [u'IN', u'KY', u'0.7884110703532603', u'3987'], [u'PA', u'VA', u'0.7873618693153164', u'3987'], [u'IA', u'IL', u'0.7870630148407264', u'3987'], [u'NY', u'VA', u'0.7829532870030526', u'3987'], [u'IL', u'NE', u'0.7804910533256546', u'3987'], [u'NY', u'PA', u'0.7723228041983351', u'3987'], [u'IN', u'TN', u'0.7720173161040389', u'3987'], [u'MO', u'TN', u'0.7712146971483247', u'3987'], [u'AL', u'TN', u'0.7699968918541833', u'3987'], [u'AL', u'GA', u'0.769926850668077', u'3987'], [u'NH', u'RI', u'0.7685121873582897', u'3982'], [u'LA', u'TX', u'0.7659147383876882', u'3987'], [u'NC', u'PA', u'0.7658885387555227', u'3987'], [u'AZ', u'NV', u'0.7649309739242285', u'3987'], [u'PA', u'RI', u'0.762151254768136', u'3983'], [u'IL', u'IN', u'0.7602999454899928', u'3987'], [u'NC', u'SC', u'0.7599302716906835', u'3987'], [u'IN', u'MI', u'0.7597632943720553', u'3987'], [u'NY', u'RI', u'0.7580197224289437', u'3983'], [u'AL', u'MS', u'0.7561347292591919', u'3986'], [u'MA', u'NY', u'0.7556376488071689', u'3985'], [u'IN', u'WI', u'0.754066361397049', u'3987'], [u'AL', u'SC', u'0.7534529056655519', u'3987']]
# The best predictors
query = """SELECT a.departure_state, b.departure_state, corr(a.avg, b.avg) corr, COUNT(*) c
FROM
(SELECT date, departure_state, AVG(departure_delay) avg , COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
GROUP BY 1,2 HAVING c > 5
) a
JOIN
(SELECT DATE(DATE_ADD(timestamp(date), 1, 'DAY')) date, departure_state ,
AVG(departure_delay) avg, COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights]
GROUP BY 1,2 HAVING c > 5 ) b
ON a.date=b.date
GROUP EACH BY 1, 2
HAVING c > 5
ORDER BY corr DESC;"""
fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000)
data[0:30]
Waiting on bqjob_r32e6eb6e4c9874e9_00000140f53720fe_8 ... (0s) Current status: DONE
[[u'WA', u'WA', u'0.6297898169456775', u'3985'], [u'OR', u'OR', u'0.5680633908435759', u'3985'], [u'UT', u'UT', u'0.5518993347477938', u'3985'], [u'AZ', u'AZ', u'0.5475550858757765', u'3985'], [u'CA', u'CA', u'0.5443100833617516', u'3985'], [u'OR', u'WA', u'0.5302798605673441', u'3985'], [u'CO', u'CO', u'0.5297917560132875', u'3985'], [u'NC', u'TN', u'0.5172774514631352', u'3985'], [u'NC', u'NC', u'0.5108313637543679', u'3985'], [u'AK', u'AK', u'0.5056712860460064', u'3985'], [u'VA', u'VA', u'0.5047323118512793', u'3985'], [u'FL', u'TN', u'0.5034450221256067', u'3985'], [u'FL', u'FL', u'0.502826355644986', u'3985'], [u'TN', u'TN', u'0.5017351620271352', u'3985'], [u'KY', u'KY', u'0.49957910159123026', u'3985'], [u'SC', u'TN', u'0.49543973189673185', u'3985'], [u'VA', u'TN', u'0.492398752072957', u'3985'], [u'MO', u'MO', u'0.4860324717460569', u'3985'], [u'IN', u'MO', u'0.484540858740931', u'3985'], [u'OK', u'OK', u'0.48317636525504104', u'3983'], [u'OH', u'MO', u'0.4826492949616015', u'3985'], [u'TN', u'MO', u'0.48025616219000855', u'3985'], [u'AZ', u'NV', u'0.4778793553143607', u'3985'], [u'OH', u'TN', u'0.47689189919879454', u'3985'], [u'ID', u'ID', u'0.4750537414307511', u'3985'], [u'KY', u'TN', u'0.4718897090994191', u'3985'], [u'VA', u'MO', u'0.47175614392232296', u'3985'], [u'AR', u'AR', u'0.4708215665034345', u'3985'], [u'KY', u'MO', u'0.46981922926980774', u'3985'], [u'LA', u'TX', u'0.4683406852498902', u'3985']]
# The best NY predictors
query = """SELECT a.departure_state, b.departure_state, corr(a.avg, b.avg) corr, COUNT(*) c
FROM
(SELECT date, departure_state, AVG(departure_delay) avg , COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
WHERE departure_state = 'NY'
GROUP BY 1,2 HAVING c > 5
) a
JOIN
(SELECT DATE(DATE_ADD(timestamp(date), 1, 'DAY')) date, departure_state ,
AVG(departure_delay) avg, COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights]
GROUP BY 1,2 HAVING c > 5 ) b
ON a.date=b.date
GROUP EACH BY 1, 2
HAVING c > 5
ORDER BY corr DESC;"""
fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000)
data[0:30]
Waiting on bqjob_r73f81a2da6417c68_00000140f53ae992_13 ... (0s) Current status: DONE
[[u'NY', u'TN', u'0.4604778422876315', u'3985'], [u'NY', u'VA', u'0.4525985794332655', u'3985'], [u'NY', u'MO', u'0.4468373082176345', u'3985'], [u'NY', u'NC', u'0.4437414892121395', u'3985'], [u'NY', u'IN', u'0.42946643366576065', u'3985'], [u'NY', u'OH', u'0.4222869735099268', u'3985'], [u'NY', u'NY', u'0.41533503512414677', u'3985'], [u'NY', u'FL', u'0.41124840488503495', u'3985'], [u'NY', u'MI', u'0.4094548348798478', u'3985'], [u'NY', u'KY', u'0.4067009482378483', u'3985'], [u'NY', u'PA', u'0.404689694730979', u'3985'], [u'NY', u'IL', u'0.39262812794547247', u'3985'], [u'NY', u'CT', u'0.39235729297846395', u'3981'], [u'NY', u'SC', u'0.386602477144127', u'3985'], [u'NY', u'AL', u'0.384011640257245', u'3985'], [u'NY', u'MD', u'0.3810324724938643', u'3980'], [u'NY', u'WI', u'0.3730419992515536', u'3985'], [u'NY', u'RI', u'0.36244171842441797', u'3981'], [u'NY', u'AR', u'0.36115590731660774', u'3985'], [u'NY', u'NE', u'0.3597984334988245', u'3985'], [u'NY', u'NJ', u'0.34369045546341237', u'3982'], [u'NY', u'GA', u'0.33602647196842883', u'3985'], [u'NY', u'LA', u'0.3332696246080804', u'3985'], [u'NY', u'MA', u'0.33093486961307106', u'3983'], [u'NY', u'NH', u'0.3249965562761172', u'3981'], [u'NY', u'IA', u'0.31988290093730404', u'3985'], [u'NY', u'OK', u'0.31878660685572396', u'3984'], [u'NY', u'KS', u'0.31585571149654557', u'3984'], [u'NY', u'VT', u'0.30069202780694454', u'3973'], [u'NY', u'MS', u'0.2993557875886553', u'3984']]
# The best NY predictors, by quarter
query = """SELECT a.q, a.departure_state, b.departure_state, corr(a.avg, b.avg) corr, COUNT(*) c
FROM
(SELECT QUARTER(timestamp(date)) q, date, departure_state , AVG(departure_delay) avg , COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
WHERE departure_state = 'NY'
GROUP BY 1,2,3 HAVING c > 5
) a
JOIN
(SELECT QUARTER(timestamp(date)) q, DATE(DATE_ADD(timestamp(date), 1, 'DAY')) date, departure_state ,
AVG(departure_delay) avg, COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights]
GROUP BY 1,2,3 HAVING c > 5 ) b
ON a.date=b.date AND a.q = b.q
GROUP EACH BY 1, 2, 3
HAVING c > 5
ORDER BY corr DESC;"""
fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000)
data[0:30]
Waiting on bqjob_r68fdf9c52356a2d_00000140f53b417c_14 ... (0s) Current status: DONE
[[u'4', u'NY', u'TN', u'0.53851067476334', u'970'], [u'4', u'NY', u'MO', u'0.5152172520022091', u'970'], [u'4', u'NY', u'IN', u'0.49885726806993796', u'970'], [u'4', u'NY', u'FL', u'0.4928303849234381', u'970'], [u'4', u'NY', u'MI', u'0.48909682769276874', u'970'], [u'4', u'NY', u'NC', u'0.48781253899118443', u'970'], [u'4', u'NY', u'VA', u'0.47846255064825305', u'970'], [u'4', u'NY', u'NE', u'0.4774949028181781', u'970'], [u'4', u'NY', u'AR', u'0.47679508128463516', u'970'], [u'1', u'NY', u'VA', u'0.47457313376436105', u'982'], [u'4', u'NY', u'AL', u'0.47313773278292814', u'970'], [u'4', u'NY', u'SC', u'0.46563316524812737', u'970'], [u'4', u'NY', u'LA', u'0.4646150398714457', u'970'], [u'4', u'NY', u'NY', u'0.4618504486049276', u'970'], [u'4', u'NY', u'KY', u'0.45956516103857753', u'970'], [u'1', u'NY', u'MO', u'0.45927112269889714', u'982'], [u'4', u'NY', u'PA', u'0.4480173413522477', u'970'], [u'2', u'NY', u'TN', u'0.446800533477935', u'990'], [u'3', u'NY', u'OH', u'0.4465535392634808', u'1001'], [u'1', u'NY', u'TN', u'0.44189949064435124', u'982'], [u'4', u'NY', u'OH', u'0.4417318415120308', u'970'], [u'4', u'NY', u'WI', u'0.4408363684332177', u'970'], [u'2', u'NY', u'OH', u'0.4406200062270232', u'990'], [u'4', u'NY', u'IL', u'0.4405578019068183', u'970'], [u'1', u'NY', u'NC', u'0.44051373077047573', u'982'], [u'4', u'NY', u'KS', u'0.4391476663920567', u'970'], [u'2', u'NY', u'MO', u'0.4349768298772989', u'990'], [u'1', u'NY', u'IN', u'0.43422241708881326', u'982'], [u'2', u'NY', u'VA', u'0.43060698481465537', u'990'], [u'4', u'NY', u'GA', u'0.4274920117910521', u'970']]
# Bonus: When is a correlation a good correlation?
nnn = 1100 # Move it from 2 to 4000
s1 = pd.Series(np.random.random(nnn))
s2 = pd.Series(np.random.random(nnn))
plt.scatter(s1, s2)
s1.corr(s2)
-0.023421886876298374
Follow the most interesting BigQuery news at http://www.reddit.com/r/bigquery. Ask your technical questions using the [google-bigquery] tag on Stack Overflow.