Convert CSV/DataFrames to PostgreSQL tables¶

We use a single database "cbb" to store various tables

In [1]:

import os
import glob
import datetime as dt

import numpy as np
import pandas as pd

from watchcbb.sql import SQLEngine

In [2]:

sql = SQLEngine('cbb')

Connect to 'cbb' database¶

Start with simple teams.csv¶

In [3]:

df = pd.read_csv('../data/teams.csv')
df.head()

Out[3]:

	team_id	display_name	flair_name	conference	location	year_start	year_end	color
0	abilene-christian	Abilene Christian	Abilene Christian Wildcats	Southland	Abilene, Texas	1971	2020	#4e2583
1	air-force	Air Force	Air Force Falcons	MWC	USAF Academy, Colorado	1958	2020	#0061aa
2	akron	Akron	Akron Zips	MAC	Akron, Ohio	1902	2020	#a89968
3	alabama-am	Alabama A&M	NaN	SWAC	Normal, Alabama	2000	2020	#661012
4	alabama	Alabama	Alabama Crimson Tide	SEC	Tuscaloosa, Alabama	1913	2020	#aa1c37

In [4]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 357 entries, 0 to 356
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   team_id       357 non-null    object
 1   display_name  357 non-null    object
 2   flair_name    295 non-null    object
 3   conference    357 non-null    object
 4   location      357 non-null    object
 5   year_start    357 non-null    int64 
 6   year_end      357 non-null    int64 
 7   color         354 non-null    object
dtypes: int64(2), object(6)
memory usage: 22.4+ KB

In [5]:

sql.df_to_sql(df, 'teams', if_exists='replace')

AP Rankings¶

In [6]:

df_ap = pd.read_csv('../data/ap_rankings.csv')
# they are read in as strings, so convert to lists
for i in range(1,26):
    df_ap['r'+str(i)] = df_ap['r'+str(i)].apply(eval)
df_ap.date = df_ap.date.apply(lambda x:dt.date(*[int(x) for x in x.split('-')]))
print(type(df_ap.r1[0]))
df_ap.date.iloc[160:170]

<class 'list'>

Out[6]:

160    2018-12-17
161    2018-12-24
162    2018-12-31
163    2019-01-07
164    2019-01-14
165    2019-01-21
166    2019-01-28
167    2019-02-04
168    2019-02-11
169    2019-02-18
Name: date, dtype: object

In [7]:

df_ap.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194 entries, 0 to 193
Data columns (total 26 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    194 non-null    object
 1   r1      194 non-null    object
 2   r2      194 non-null    object
 3   r3      194 non-null    object
 4   r4      194 non-null    object
 5   r5      194 non-null    object
 6   r6      194 non-null    object
 7   r7      194 non-null    object
 8   r8      194 non-null    object
 9   r9      194 non-null    object
 10  r10     194 non-null    object
 11  r11     194 non-null    object
 12  r12     194 non-null    object
 13  r13     194 non-null    object
 14  r14     194 non-null    object
 15  r15     194 non-null    object
 16  r16     194 non-null    object
 17  r17     194 non-null    object
 18  r18     194 non-null    object
 19  r19     194 non-null    object
 20  r20     194 non-null    object
 21  r21     194 non-null    object
 22  r22     194 non-null    object
 23  r23     194 non-null    object
 24  r24     194 non-null    object
 25  r25     194 non-null    object
dtypes: object(26)
memory usage: 39.5+ KB

In [8]:

sql.df_to_sql(df_ap, 'ap_rankings', if_exists='replace')

Game-by-game data¶

In [9]:

def get_gid(row):
    tid1, tid2 = sorted((row.WTeamID, row.LTeamID))
    return '{0}_{1}_{2}'.format(row.Date, tid1, tid2)

def get_ap_ranks(row):
    date = row.Date
    wid, lid = row.WTeamID, row.LTeamID
    idx = np.argmax(date < df_ap.date) - 1
    r1, r2 = -1, -1
    ranks = df_ap.iloc[idx].values[1:].tolist()
    for irank,ts in enumerate(ranks):
        if wid in ts:
            r1 = irank+1
        if lid in ts:
            r2 = irank+1
    return r1,r2

dfs = []
for fname in glob.glob("../data/game_data/*.csv"):
    dfs.append(pd.read_csv(fname))
df = pd.concat(dfs)
df.Date = df.Date.apply(lambda x:dt.date(*[int(x) for x in x.split('-')]))
df = df.sort_values("Date").reset_index(drop=True)
df["poss"] = 0.5*(df["WFGA"] + 0.44*df["WFTA"] - df["WOR"] + df["WTO"] + df["LFGA"] + 0.44*df["LFTA"] - df["LOR"] + df["LTO"])
df.insert(2, "gid", df.apply(get_gid, axis=1))
apranks = df.apply(get_ap_ranks, axis=1).values
df.insert(8, "Wrank", [x[0] for x in apranks])
df.insert(9, "Lrank", [x[1] for x in apranks])
print("Shape:",df.shape)
df.head(10)

Shape: (54760, 39)

Out[9]:

	Season	Date	gid	Type	WTeamID	WScore	LTeamID	LScore	Wrank	Lrank	...	LFTM	LFTA	LOR	LDR	LAst	LTO	LStl	LBlk	LPF	poss
0	2011	2010-11-08	2010-11-08_california-irvine_illinois	RG	illinois	79	california-irvine	65	13	-1	...	14	22	11	26	12	14	7	1	21	74.90
1	2011	2010-11-08	2010-11-08_pittsburgh_rhode-island	RG	pittsburgh	83	rhode-island	75	5	-1	...	13	16	9	20	16	16	7	6	25	75.62
2	2011	2010-11-08	2010-11-08_maryland_seattle	RG	maryland	105	seattle	76	-1	-1	...	17	22	13	16	11	27	18	1	31	98.36
3	2011	2010-11-08	2010-11-08_navy_texas	RG	texas	83	navy	52	-1	-1	...	10	18	5	21	8	16	5	5	24	80.44
4	2011	2010-11-10	2010-11-10_college-of-charleston_maryland	RG	maryland	75	college-of-charleston	74	-1	-1	...	16	19	10	25	12	22	8	8	19	77.64
5	2011	2010-11-10	2010-11-10_illinois-chicago_pittsburgh	RG	pittsburgh	97	illinois-chicago	54	5	-1	...	7	9	5	15	9	11	0	3	18	67.88
6	2011	2010-11-10	2010-11-10_illinois_toledo	RG	illinois	84	toledo	45	13	-1	...	10	19	3	16	9	23	6	0	14	72.00
7	2011	2010-11-10	2010-11-10_louisiana-tech_texas	RG	texas	89	louisiana-tech	58	-1	-1	...	13	20	13	25	4	22	6	5	21	83.74
8	2011	2010-11-12	2010-11-12_maryland-eastern-shore_tulane	RG	tulane	91	maryland-eastern-shore	62	-1	-1	...	19	26	9	16	11	21	4	2	28	78.04
9	2011	2010-11-12	2010-11-12_marquette_prairie-view	RG	marquette	97	prairie-view	58	-1	-1	...	7	11	3	13	12	20	11	3	27	79.12

10 rows × 39 columns

In [10]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54760 entries, 0 to 54759
Data columns (total 39 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Season   54760 non-null  int64  
 1   Date     54760 non-null  object 
 2   gid      54760 non-null  object 
 3   Type     54760 non-null  object 
 4   WTeamID  54760 non-null  object 
 5   WScore   54760 non-null  int64  
 6   LTeamID  54760 non-null  object 
 7   LScore   54760 non-null  int64  
 8   Wrank    54760 non-null  int64  
 9   Lrank    54760 non-null  int64  
 10  WLoc     54760 non-null  object 
 11  NumOT    54760 non-null  int64  
 12  WFGM     54760 non-null  int64  
 13  WFGA     54760 non-null  int64  
 14  WFGM3    54760 non-null  int64  
 15  WFGA3    54760 non-null  int64  
 16  WFTM     54760 non-null  int64  
 17  WFTA     54760 non-null  int64  
 18  WOR      54760 non-null  int64  
 19  WDR      54760 non-null  int64  
 20  WAst     54760 non-null  int64  
 21  WTO      54760 non-null  int64  
 22  WStl     54760 non-null  int64  
 23  WBlk     54760 non-null  int64  
 24  WPF      54760 non-null  int64  
 25  LFGM     54760 non-null  int64  
 26  LFGA     54760 non-null  int64  
 27  LFGM3    54760 non-null  int64  
 28  LFGA3    54760 non-null  int64  
 29  LFTM     54760 non-null  int64  
 30  LFTA     54760 non-null  int64  
 31  LOR      54760 non-null  int64  
 32  LDR      54760 non-null  int64  
 33  LAst     54760 non-null  int64  
 34  LTO      54760 non-null  int64  
 35  LStl     54760 non-null  int64  
 36  LBlk     54760 non-null  int64  
 37  LPF      54760 non-null  int64  
 38  poss     54760 non-null  float64
dtypes: float64(1), int64(32), object(6)
memory usage: 16.3+ MB

In [11]:

sql.df_to_sql(df, 'game_data', if_exists='replace')

Reddit submissions¶

In [12]:

df = pd.read_pickle('../data/gamethreads/aggregated_cleaned_2017-2020.pkl.gz', compression='gzip')
df.head()

Out[12]:

	date	timestamp	id	gid	is_postgame	ups	num_comments	title
157	2017-11-10	2017-11-11 08:58:11	7c718s	2017-11-10_alabama-birmingham_jacksonville	True	9	0	[Post Game Thread] UAB defeats Jacksonville, 9...
51	2017-11-10	2017-11-11 01:40:04	7c4wus	2017-11-10_alabama_memphis	False	6	81	[Game Thread] Memphis @ Alabama (6:30 PM ET)
56	2017-11-10	2017-11-11 01:55:53	7c4zzd	2017-11-10_albany-ny_iona	False	7	8	[Game Thread] Iona @ Albany (7:00 EST)
65	2017-11-10	2017-11-11 02:07:12	7c528t	2017-11-10_arizona_northern-arizona	False	13	82	[Game Thread] Northern Arizona @ #2 Arizona (8...
108	2017-11-10	2017-11-11 04:45:38	7c5x2w	2017-11-10_austin-peay_vanderbilt	False	4	2	[Game Thread] Austin Peay @ Vanderbilt (8:30 P...

In [13]:

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14387 entries, 157 to 19707
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          14387 non-null  object        
 1   timestamp     14387 non-null  datetime64[ns]
 2   id            14387 non-null  object        
 3   gid           14387 non-null  object        
 4   is_postgame   14387 non-null  bool          
 5   ups           14387 non-null  int64         
 6   num_comments  14387 non-null  int64         
 7   title         14387 non-null  object        
dtypes: bool(1), datetime64[ns](1), int64(2), object(4)
memory usage: 913.2+ KB

In [14]:

sql.df_to_sql(df, 'gamethreads', if_exists='replace')

Reddit gamethread comments¶

In [15]:

dfs = []
for f in glob.glob('../data/gamethread_comments/*/*.pkl.gz'):
    dfs.append(pd.read_pickle(f, compression='gzip'))
dfc = pd.concat(dfs)
dfc = dfc.merge(df[['id','is_postgame','gid']], left_on='post_id', right_on='id')
dfc.drop('id', axis=1, inplace=True)
dfc.author = dfc.author.apply(lambda x: None if x is None else x.name)
dfc.head()

Out[15]:

	post_id	comment_id	author	author_flair	text	is_postgame	gid
0	dsh9ec	f6plj2x	CaffeinationGoat	[Connecticut Huskies, Binghamton Bearcats]	Ah well, the annual NYS private/public ivy sho...	True	2019-11-06_binghamton_cornell
1	dsh9ec	f6pa84d	cheesoidhateself	[Cornell Big Red]	Consolidation of land!	True	2019-11-06_binghamton_cornell
2	dsh9ec	f6qn8tq	PAPA_JOHNS_ZIMBABWE	[NJIT Highlanders]	SUNY Ithaca is bringing on the pain	True	2019-11-06_binghamton_cornell
3	dsh9ec	f6pbzq4	IsYouWitItYaBish	[Wisconsin Badgers]	Wow what time did this game tip off?	True	2019-11-06_binghamton_cornell
4	dso2yc	f6qrvlq	mrfixit420	[Wake Forest Demon Deacons]	I can’t believe how dominant BC is. There are ...	False	2019-11-06_boston-college_wake-forest

In [16]:

dfc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 387257 entries, 0 to 387256
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   post_id       387257 non-null  object
 1   comment_id    387257 non-null  object
 2   author        368148 non-null  object
 3   author_flair  387257 non-null  object
 4   text          387257 non-null  object
 5   is_postgame   387257 non-null  bool  
 6   gid           387257 non-null  object
dtypes: bool(1), object(6)
memory usage: 21.1+ MB

In [17]:

sql.df_to_sql(dfc, 'gamethread_comments', if_exists='replace')

Preseason predictions¶

In [18]:

df = pd.read_csv('../data/preseason_predictions.csv')
df.head()

Out[18]:

	year	team_id	final_eff	yearm1_eff	yearm2_eff	wshares_lost	wshares_return	espn_recruit_rank	rsci_recruit_rank	rsci_recruit_points	yearm1_oeff	yearm1_deff	yearm1_pace	final_pace	recruit_score	pred_eff	pred_pace	pred_oeff	pred_deff
0	2013	air-force	7.076	-0.511	5.814	1.7	12.0	NaN	NaN	0	98.320	98.831	62.424	64.534	0	4.862	65.030	101.000	96.137
1	2013	akron	11.615	11.359	3.769	6.0	15.8	NaN	NaN	0	105.445	94.086	68.663	67.137	0	9.664	68.779	104.549	94.885
2	2013	alabama	13.221	18.570	15.892	7.8	14.9	NaN	25.0	79	104.865	86.295	63.449	62.245	79	18.518	65.646	104.837	86.319
3	2013	alabama-am	-24.749	-24.830	-16.969	3.2	4.4	NaN	NaN	0	87.082	111.912	68.426	66.737	0	-20.857	68.636	88.820	109.677
4	2013	alabama-birmingham	-0.204	4.770	13.027	7.0	7.1	NaN	NaN	0	98.949	94.178	62.799	71.980	0	5.359	65.256	99.250	93.891

In [19]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2792 entries, 0 to 2791
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   year                 2792 non-null   int64  
 1   team_id              2792 non-null   object 
 2   final_eff            2792 non-null   float64
 3   yearm1_eff           2792 non-null   float64
 4   yearm2_eff           2792 non-null   float64
 5   wshares_lost         2792 non-null   float64
 6   wshares_return       2792 non-null   float64
 7   wshares_transfer     2792 non-null   float64
 8   espn_recruit_rank    288 non-null    float64
 9   rsci_recruit_rank    404 non-null    float64
 10  rsci_recruit_points  2792 non-null   int64  
 11  yearm1_oeff          2792 non-null   float64
 12  yearm1_deff          2792 non-null   float64
 13  yearm1_pace          2792 non-null   float64
 14  final_pace           2792 non-null   float64
 15  recruit_score        2792 non-null   int64  
 16  pred_eff             2792 non-null   float64
 17  pred_pace            2792 non-null   float64
 18  pred_oeff            2792 non-null   float64
 19  pred_deff            2792 non-null   float64
dtypes: float64(16), int64(3), object(1)
memory usage: 436.4+ KB

In [20]:

sql.df_to_sql(df, 'preseason_predictions', if_exists='replace')