We use a single database "cbb" to store various tables
import os
import glob
import datetime as dt
import numpy as np
import pandas as pd
from watchcbb.sql import SQLEngine
sql = SQLEngine('cbb')
df = pd.read_csv('../data/teams.csv')
df.head()
team_id | display_name | flair_name | conference | location | year_start | year_end | color | |
---|---|---|---|---|---|---|---|---|
0 | abilene-christian | Abilene Christian | Abilene Christian Wildcats | Southland | Abilene, Texas | 1971 | 2020 | #4e2583 |
1 | air-force | Air Force | Air Force Falcons | MWC | USAF Academy, Colorado | 1958 | 2020 | #0061aa |
2 | akron | Akron | Akron Zips | MAC | Akron, Ohio | 1902 | 2020 | #a89968 |
3 | alabama-am | Alabama A&M | NaN | SWAC | Normal, Alabama | 2000 | 2020 | #661012 |
4 | alabama | Alabama | Alabama Crimson Tide | SEC | Tuscaloosa, Alabama | 1913 | 2020 | #aa1c37 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 357 entries, 0 to 356 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 team_id 357 non-null object 1 display_name 357 non-null object 2 flair_name 295 non-null object 3 conference 357 non-null object 4 location 357 non-null object 5 year_start 357 non-null int64 6 year_end 357 non-null int64 7 color 354 non-null object dtypes: int64(2), object(6) memory usage: 22.4+ KB
sql.df_to_sql(df, 'teams', if_exists='replace')
df_ap = pd.read_csv('../data/ap_rankings.csv')
# they are read in as strings, so convert to lists
for i in range(1,26):
df_ap['r'+str(i)] = df_ap['r'+str(i)].apply(eval)
df_ap.date = df_ap.date.apply(lambda x:dt.date(*[int(x) for x in x.split('-')]))
print(type(df_ap.r1[0]))
df_ap.date.iloc[160:170]
<class 'list'>
160 2018-12-17 161 2018-12-24 162 2018-12-31 163 2019-01-07 164 2019-01-14 165 2019-01-21 166 2019-01-28 167 2019-02-04 168 2019-02-11 169 2019-02-18 Name: date, dtype: object
df_ap.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 194 entries, 0 to 193 Data columns (total 26 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 194 non-null object 1 r1 194 non-null object 2 r2 194 non-null object 3 r3 194 non-null object 4 r4 194 non-null object 5 r5 194 non-null object 6 r6 194 non-null object 7 r7 194 non-null object 8 r8 194 non-null object 9 r9 194 non-null object 10 r10 194 non-null object 11 r11 194 non-null object 12 r12 194 non-null object 13 r13 194 non-null object 14 r14 194 non-null object 15 r15 194 non-null object 16 r16 194 non-null object 17 r17 194 non-null object 18 r18 194 non-null object 19 r19 194 non-null object 20 r20 194 non-null object 21 r21 194 non-null object 22 r22 194 non-null object 23 r23 194 non-null object 24 r24 194 non-null object 25 r25 194 non-null object dtypes: object(26) memory usage: 39.5+ KB
sql.df_to_sql(df_ap, 'ap_rankings', if_exists='replace')
def get_gid(row):
tid1, tid2 = sorted((row.WTeamID, row.LTeamID))
return '{0}_{1}_{2}'.format(row.Date, tid1, tid2)
def get_ap_ranks(row):
date = row.Date
wid, lid = row.WTeamID, row.LTeamID
idx = np.argmax(date < df_ap.date) - 1
r1, r2 = -1, -1
ranks = df_ap.iloc[idx].values[1:].tolist()
for irank,ts in enumerate(ranks):
if wid in ts:
r1 = irank+1
if lid in ts:
r2 = irank+1
return r1,r2
dfs = []
for fname in glob.glob("../data/game_data/*.csv"):
dfs.append(pd.read_csv(fname))
df = pd.concat(dfs)
df.Date = df.Date.apply(lambda x:dt.date(*[int(x) for x in x.split('-')]))
df = df.sort_values("Date").reset_index(drop=True)
df["poss"] = 0.5*(df["WFGA"] + 0.44*df["WFTA"] - df["WOR"] + df["WTO"] + df["LFGA"] + 0.44*df["LFTA"] - df["LOR"] + df["LTO"])
df.insert(2, "gid", df.apply(get_gid, axis=1))
apranks = df.apply(get_ap_ranks, axis=1).values
df.insert(8, "Wrank", [x[0] for x in apranks])
df.insert(9, "Lrank", [x[1] for x in apranks])
print("Shape:",df.shape)
df.head(10)
Shape: (54760, 39)
Season | Date | gid | Type | WTeamID | WScore | LTeamID | LScore | Wrank | Lrank | ... | LFTM | LFTA | LOR | LDR | LAst | LTO | LStl | LBlk | LPF | poss | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2011 | 2010-11-08 | 2010-11-08_california-irvine_illinois | RG | illinois | 79 | california-irvine | 65 | 13 | -1 | ... | 14 | 22 | 11 | 26 | 12 | 14 | 7 | 1 | 21 | 74.90 |
1 | 2011 | 2010-11-08 | 2010-11-08_pittsburgh_rhode-island | RG | pittsburgh | 83 | rhode-island | 75 | 5 | -1 | ... | 13 | 16 | 9 | 20 | 16 | 16 | 7 | 6 | 25 | 75.62 |
2 | 2011 | 2010-11-08 | 2010-11-08_maryland_seattle | RG | maryland | 105 | seattle | 76 | -1 | -1 | ... | 17 | 22 | 13 | 16 | 11 | 27 | 18 | 1 | 31 | 98.36 |
3 | 2011 | 2010-11-08 | 2010-11-08_navy_texas | RG | texas | 83 | navy | 52 | -1 | -1 | ... | 10 | 18 | 5 | 21 | 8 | 16 | 5 | 5 | 24 | 80.44 |
4 | 2011 | 2010-11-10 | 2010-11-10_college-of-charleston_maryland | RG | maryland | 75 | college-of-charleston | 74 | -1 | -1 | ... | 16 | 19 | 10 | 25 | 12 | 22 | 8 | 8 | 19 | 77.64 |
5 | 2011 | 2010-11-10 | 2010-11-10_illinois-chicago_pittsburgh | RG | pittsburgh | 97 | illinois-chicago | 54 | 5 | -1 | ... | 7 | 9 | 5 | 15 | 9 | 11 | 0 | 3 | 18 | 67.88 |
6 | 2011 | 2010-11-10 | 2010-11-10_illinois_toledo | RG | illinois | 84 | toledo | 45 | 13 | -1 | ... | 10 | 19 | 3 | 16 | 9 | 23 | 6 | 0 | 14 | 72.00 |
7 | 2011 | 2010-11-10 | 2010-11-10_louisiana-tech_texas | RG | texas | 89 | louisiana-tech | 58 | -1 | -1 | ... | 13 | 20 | 13 | 25 | 4 | 22 | 6 | 5 | 21 | 83.74 |
8 | 2011 | 2010-11-12 | 2010-11-12_maryland-eastern-shore_tulane | RG | tulane | 91 | maryland-eastern-shore | 62 | -1 | -1 | ... | 19 | 26 | 9 | 16 | 11 | 21 | 4 | 2 | 28 | 78.04 |
9 | 2011 | 2010-11-12 | 2010-11-12_marquette_prairie-view | RG | marquette | 97 | prairie-view | 58 | -1 | -1 | ... | 7 | 11 | 3 | 13 | 12 | 20 | 11 | 3 | 27 | 79.12 |
10 rows × 39 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 54760 entries, 0 to 54759 Data columns (total 39 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Season 54760 non-null int64 1 Date 54760 non-null object 2 gid 54760 non-null object 3 Type 54760 non-null object 4 WTeamID 54760 non-null object 5 WScore 54760 non-null int64 6 LTeamID 54760 non-null object 7 LScore 54760 non-null int64 8 Wrank 54760 non-null int64 9 Lrank 54760 non-null int64 10 WLoc 54760 non-null object 11 NumOT 54760 non-null int64 12 WFGM 54760 non-null int64 13 WFGA 54760 non-null int64 14 WFGM3 54760 non-null int64 15 WFGA3 54760 non-null int64 16 WFTM 54760 non-null int64 17 WFTA 54760 non-null int64 18 WOR 54760 non-null int64 19 WDR 54760 non-null int64 20 WAst 54760 non-null int64 21 WTO 54760 non-null int64 22 WStl 54760 non-null int64 23 WBlk 54760 non-null int64 24 WPF 54760 non-null int64 25 LFGM 54760 non-null int64 26 LFGA 54760 non-null int64 27 LFGM3 54760 non-null int64 28 LFGA3 54760 non-null int64 29 LFTM 54760 non-null int64 30 LFTA 54760 non-null int64 31 LOR 54760 non-null int64 32 LDR 54760 non-null int64 33 LAst 54760 non-null int64 34 LTO 54760 non-null int64 35 LStl 54760 non-null int64 36 LBlk 54760 non-null int64 37 LPF 54760 non-null int64 38 poss 54760 non-null float64 dtypes: float64(1), int64(32), object(6) memory usage: 16.3+ MB
sql.df_to_sql(df, 'game_data', if_exists='replace')
df = pd.read_pickle('../data/gamethreads/aggregated_cleaned_2017-2020.pkl.gz', compression='gzip')
df.head()
date | timestamp | id | gid | is_postgame | ups | num_comments | title | |
---|---|---|---|---|---|---|---|---|
157 | 2017-11-10 | 2017-11-11 08:58:11 | 7c718s | 2017-11-10_alabama-birmingham_jacksonville | True | 9 | 0 | [Post Game Thread] UAB defeats Jacksonville, 9... |
51 | 2017-11-10 | 2017-11-11 01:40:04 | 7c4wus | 2017-11-10_alabama_memphis | False | 6 | 81 | [Game Thread] Memphis @ Alabama (6:30 PM ET) |
56 | 2017-11-10 | 2017-11-11 01:55:53 | 7c4zzd | 2017-11-10_albany-ny_iona | False | 7 | 8 | [Game Thread] Iona @ Albany (7:00 EST) |
65 | 2017-11-10 | 2017-11-11 02:07:12 | 7c528t | 2017-11-10_arizona_northern-arizona | False | 13 | 82 | [Game Thread] Northern Arizona @ #2 Arizona (8... |
108 | 2017-11-10 | 2017-11-11 04:45:38 | 7c5x2w | 2017-11-10_austin-peay_vanderbilt | False | 4 | 2 | [Game Thread] Austin Peay @ Vanderbilt (8:30 P... |
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 14387 entries, 157 to 19707 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 14387 non-null object 1 timestamp 14387 non-null datetime64[ns] 2 id 14387 non-null object 3 gid 14387 non-null object 4 is_postgame 14387 non-null bool 5 ups 14387 non-null int64 6 num_comments 14387 non-null int64 7 title 14387 non-null object dtypes: bool(1), datetime64[ns](1), int64(2), object(4) memory usage: 913.2+ KB
sql.df_to_sql(df, 'gamethreads', if_exists='replace')
dfs = []
for f in glob.glob('../data/gamethread_comments/*/*.pkl.gz'):
dfs.append(pd.read_pickle(f, compression='gzip'))
dfc = pd.concat(dfs)
dfc = dfc.merge(df[['id','is_postgame','gid']], left_on='post_id', right_on='id')
dfc.drop('id', axis=1, inplace=True)
dfc.author = dfc.author.apply(lambda x: None if x is None else x.name)
dfc.head()
post_id | comment_id | author | author_flair | text | is_postgame | gid | |
---|---|---|---|---|---|---|---|
0 | dsh9ec | f6plj2x | CaffeinationGoat | [Connecticut Huskies, Binghamton Bearcats] | Ah well, the annual NYS private/public ivy sho... | True | 2019-11-06_binghamton_cornell |
1 | dsh9ec | f6pa84d | cheesoidhateself | [Cornell Big Red] | Consolidation of land! | True | 2019-11-06_binghamton_cornell |
2 | dsh9ec | f6qn8tq | PAPA_JOHNS_ZIMBABWE | [NJIT Highlanders] | SUNY Ithaca is bringing on the pain | True | 2019-11-06_binghamton_cornell |
3 | dsh9ec | f6pbzq4 | IsYouWitItYaBish | [Wisconsin Badgers] | Wow what time did this game tip off? | True | 2019-11-06_binghamton_cornell |
4 | dso2yc | f6qrvlq | mrfixit420 | [Wake Forest Demon Deacons] | I can’t believe how dominant BC is. There are ... | False | 2019-11-06_boston-college_wake-forest |
dfc.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 387257 entries, 0 to 387256 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 post_id 387257 non-null object 1 comment_id 387257 non-null object 2 author 368148 non-null object 3 author_flair 387257 non-null object 4 text 387257 non-null object 5 is_postgame 387257 non-null bool 6 gid 387257 non-null object dtypes: bool(1), object(6) memory usage: 21.1+ MB
sql.df_to_sql(dfc, 'gamethread_comments', if_exists='replace')
df = pd.read_csv('../data/preseason_predictions.csv')
df.head()
year | team_id | final_eff | yearm1_eff | yearm2_eff | wshares_lost | wshares_return | wshares_transfer | espn_recruit_rank | rsci_recruit_rank | rsci_recruit_points | yearm1_oeff | yearm1_deff | yearm1_pace | final_pace | recruit_score | pred_eff | pred_pace | pred_oeff | pred_deff | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2013 | air-force | 7.076 | -0.511 | 5.814 | 1.7 | 12.0 | 0.0 | NaN | NaN | 0 | 98.320 | 98.831 | 62.424 | 64.534 | 0 | 4.862 | 65.030 | 101.000 | 96.137 |
1 | 2013 | akron | 11.615 | 11.359 | 3.769 | 6.0 | 15.8 | 0.0 | NaN | NaN | 0 | 105.445 | 94.086 | 68.663 | 67.137 | 0 | 9.664 | 68.779 | 104.549 | 94.885 |
2 | 2013 | alabama | 13.221 | 18.570 | 15.892 | 7.8 | 14.9 | 0.0 | NaN | 25.0 | 79 | 104.865 | 86.295 | 63.449 | 62.245 | 79 | 18.518 | 65.646 | 104.837 | 86.319 |
3 | 2013 | alabama-am | -24.749 | -24.830 | -16.969 | 3.2 | 4.4 | 0.0 | NaN | NaN | 0 | 87.082 | 111.912 | 68.426 | 66.737 | 0 | -20.857 | 68.636 | 88.820 | 109.677 |
4 | 2013 | alabama-birmingham | -0.204 | 4.770 | 13.027 | 7.0 | 7.1 | 0.0 | NaN | NaN | 0 | 98.949 | 94.178 | 62.799 | 71.980 | 0 | 5.359 | 65.256 | 99.250 | 93.891 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2792 entries, 0 to 2791 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 year 2792 non-null int64 1 team_id 2792 non-null object 2 final_eff 2792 non-null float64 3 yearm1_eff 2792 non-null float64 4 yearm2_eff 2792 non-null float64 5 wshares_lost 2792 non-null float64 6 wshares_return 2792 non-null float64 7 wshares_transfer 2792 non-null float64 8 espn_recruit_rank 288 non-null float64 9 rsci_recruit_rank 404 non-null float64 10 rsci_recruit_points 2792 non-null int64 11 yearm1_oeff 2792 non-null float64 12 yearm1_deff 2792 non-null float64 13 yearm1_pace 2792 non-null float64 14 final_pace 2792 non-null float64 15 recruit_score 2792 non-null int64 16 pred_eff 2792 non-null float64 17 pred_pace 2792 non-null float64 18 pred_oeff 2792 non-null float64 19 pred_deff 2792 non-null float64 dtypes: float64(16), int64(3), object(1) memory usage: 436.4+ KB
sql.df_to_sql(df, 'preseason_predictions', if_exists='replace')