# Walkthrough script for the tabular transform documentation: each section
# renders the API docs for a transform with `show_doc` and then runs a small
# example on the adult sample data.
# NOTE(review): bare expressions such as `train_df.head()` look like notebook
# cell outputs; they have no visible effect when this runs as a plain script.
from fastai.gen_doc.nbdoc import *
from fastai.tabular import *

# --- Sample data -----------------------------------------------------------
# Rows 0-799 become the training frame and rows 800-999 the validation frame.
# Both are copied so the transforms below do not write into `df`.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
train_df, valid_df = df.iloc[:800].copy(), df.iloc[800:1000].copy()
train_df.head()

# Column groups handed to every transform below.
cat_names = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]
cont_names = [
    'age', 'fnlwgt', 'education-num',
    'capital-gain', 'capital-loss', 'hours-per-week',
]

# --- TabularProc (base class) ----------------------------------------------
show_doc(TabularProc)
show_doc(TabularProc.__call__)
show_doc(TabularProc.apply_train)
show_doc(TabularProc.apply_test)
jekyll_important("Those two functions must be implemented in a subclass. `apply_test` defaults to `apply_train`.")

# --- Categorify ------------------------------------------------------------
show_doc(Categorify)
show_doc(Categorify.apply_train)
show_doc(Categorify.apply_test)

# Call the transform on the training frame first, then on the validation
# frame with `test=True`, and compare the resulting category lists.
tfm = Categorify(cat_names, cont_names)
tfm(train_df)
tfm(valid_df, test=True)
train_df['workclass'].cat.categories
valid_df['workclass'].cat.categories

# --- FillMissing -----------------------------------------------------------
show_doc(FillMissing)
show_doc(FillMissing.apply_train)
show_doc(FillMissing.apply_test)

# Show the continuous columns before and after the transform is applied.
train_df[cont_names].head()
tfm = FillMissing(cat_names, cont_names)
tfm(train_df)
tfm(valid_df, test=True)
train_df[cont_names].head()
valid_df[cont_names].head()

# The alternate doc string previously read "Enum flag represents determines
# how ..."; it is corrected here because it is rendered as user-facing text.
show_doc(
    FillStrategy,
    alt_doc_string='Enum flag that determines how `FillMissing` should handle missing/nan values',
    arg_comments={
        'MEDIAN': 'nans are replaced by the median value of the column',
        'COMMON': 'nans are replaced by the most common value of the column',
        'CONSTANT': 'nans are replaced by `fill_val`',
    },
)

# --- Normalize -------------------------------------------------------------
show_doc(Normalize)
norm = Normalize(cat_names, cont_names)

# Here the train/test methods are called directly instead of going through
# the transform's `__call__`.
show_doc(Normalize.apply_train)
norm.apply_train(train_df)
train_df[cont_names].head()

show_doc(Normalize.apply_test)
norm.apply_test(valid_df)
valid_df[cont_names].head()

# --- Date helpers ----------------------------------------------------------
show_doc(add_datepart)
df = pd.DataFrame({
    'col1': ['02/03/2017', '02/04/2017', '02/05/2017'],
    'col2': ['a', 'b', 'a'],
})
add_datepart(df, 'col1')  # inplace
df.head()

show_doc(add_cyclic_datepart)
df = pd.DataFrame({
    'col1': ['02/03/2017', '02/04/2017', '02/05/2017'],
    'col2': ['a', 'b', 'a'],
})
df = add_cyclic_datepart(df, 'col1')  # returns a dataframe
df.head()

# --- cont_cat_split --------------------------------------------------------
show_doc(cont_cat_split)
df = pd.DataFrame({
    'col1': [1, 2, 3],
    'col2': ['a', 'b', 'a'],
    'col3': [0.5, 1.2, 7.5],
    'col4': ['ab', 'o', 'o'],
})
df
cont_list, cat_list = cont_cat_split(df=df, max_card=20, dep_var='col4')
cont_list, cat_list