from fastai.gen_doc.nbdoc import *
from fastai.tabular import *


# Fetch the Adult sample dataset and read its CSV into a dataframe.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path / 'adult.csv')

# Rows 0-799 form the training frame and rows 800-999 the validation frame.
# Each slice is copied so it is an independent frame rather than a view of `df`.
train_df = df.iloc[:800].copy()
valid_df = df.iloc[800:1000].copy()
train_df.head()

# Column names handed to the transforms below, split into categorical and
# continuous groups.
cat_names = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]
cont_names = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

# Show the generated documentation for `TabularProc` and for its `__call__`,
# `apply_train` and `apply_test` methods.
show_doc(TabularProc)

show_doc(TabularProc.__call__)

show_doc(TabularProc.apply_train)

show_doc(TabularProc.apply_test)

# Callout about the two `apply_*` methods documented just above.
jekyll_important("Those two functions must be implemented in a subclass. `apply_test` defaults to `apply_train`.")

# Show the generated documentation for `Categorify` and its two apply methods.
show_doc(Categorify)

show_doc(Categorify.apply_train)

show_doc(Categorify.apply_test)

# Build the transform from the column lists, run it on the training frame,
# then run it on the validation frame with `test=True`.
# NOTE(review): presumably `test=True` makes the call use `apply_test`, so the
# validation frame reuses what was learned on the training frame; confirm
# against `TabularProc.__call__`.
tfm = Categorify(cat_names, cont_names)
tfm(train_df)
tfm(valid_df, test=True)

# The return values above are discarded, so the `.cat` accessor used here
# relies on the transform having modified the frames in place.
# Display the categories of `workclass` in each frame for comparison.
train_df['workclass'].cat.categories

valid_df['workclass'].cat.categories

# Show the generated documentation for `FillMissing` and its two apply methods.
show_doc(FillMissing)

show_doc(FillMissing.apply_train)

show_doc(FillMissing.apply_test)

# Continuous columns of the training frame before the transform is run.
train_df[cont_names].head()

# Run the transform on the training frame, then on the validation frame with
# `test=True`; the return values are discarded, so any change is in place.
# NOTE(review): presumably the validation frame is filled using values
# determined from the training frame; confirm against
# `FillMissing.apply_test`.
tfm = FillMissing(cat_names, cont_names)
tfm(train_df)
tfm(valid_df, test=True)
# Same columns after the transform, for comparison with the display above.
train_df[cont_names].head()

valid_df[cont_names].head()

# Document the `FillStrategy` enum. The summary sentence and the per-member
# explanations are passed in explicitly through `alt_doc_string` and
# `arg_comments`.
show_doc(FillStrategy, alt_doc_string='Enum flag that determines how `FillMissing` should handle missing/nan values', arg_comments={
    'MEDIAN':'nans are replaced by the median value of the column',
    'COMMON': 'nans are replaced by the most common value of the column',
    'CONSTANT': 'nans are replaced by `fill_val`'
})

# Show the generated documentation for `Normalize`.
show_doc(Normalize)

# One instance is shared by the `apply_train` and `apply_test` calls below.
norm = Normalize(cat_names, cont_names)

show_doc(Normalize.apply_train)

# Call `apply_train` directly on the training frame and display the continuous
# columns afterwards; the return value is discarded, so any change is in place.
norm.apply_train(train_df)
train_df[cont_names].head()

show_doc(Normalize.apply_test)

# Call `apply_test` on the validation frame with the same `norm` instance.
# NOTE(review): presumably this reuses the statistics computed by
# `apply_train` above; confirm against `Normalize.apply_test`.
norm.apply_test(valid_df)
valid_df[cont_names].head()

# Show the generated documentation for `add_datepart`.
show_doc(add_datepart)

# Three-row example frame: `col1` holds date strings, `col2` holds labels.
# Note that this rebinds the module-level name `df`.
df = pd.DataFrame({'col1': ['02/03/2017', '02/04/2017', '02/05/2017'], 'col2': ['a', 'b', 'a']})
# The return value is not used here; `df` is displayed directly afterwards.
add_datepart(df, 'col1') # inplace
df.head()

# Show the generated documentation for `add_cyclic_datepart`.
show_doc(add_cyclic_datepart)

# Rebuild the same three-row example frame so the demo starts from raw date strings.
df = pd.DataFrame({'col1': ['02/03/2017', '02/04/2017', '02/05/2017'], 'col2': ['a', 'b', 'a']})
# Unlike the `add_datepart` call earlier in this script, the result is assigned
# back to `df` here.
df = add_cyclic_datepart(df, 'col1') # returns a dataframe
df.head()

# Show the generated documentation for `cont_cat_split`.
show_doc(cont_cat_split)

# Example frame mixing integer, string and float columns; displayed as-is.
df = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'a'], 'col3': [0.5, 1.2, 7.5], 'col4': ['ab', 'o', 'o']})
df

# The call returns two values, unpacked here as the continuous and categorical
# column lists; `col4` is passed as the dependent variable.
# NOTE(review): presumably `dep_var` is left out of both lists and `max_card`
# is the cardinality threshold for treating a numeric column as categorical;
# confirm against the `cont_cat_split` docs.
cont_list, cat_list = cont_cat_split(df=df, max_card=20, dep_var='col4')
cont_list, cat_list