#!/usr/bin/env python
# coding: utf-8

# # Full deep learning model

# In[1]:

get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


# In[2]:

from fastai import *
from fastai.tabular import *
from fastai.text import *
from fastai.metrics import accuracy


# In[3]:

valid_sz = 10000
PATH = Path('~/data/').expanduser()


# In[5]:

df = pd.read_feather(PATH/'listings-df')


# # 1. Input models

# ## 1.1. Tabular model

# ### Data

# In[6]:

df_tab = df.drop('title', axis=1)


# In[7]:

cont_cols = ['col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8',
             'col9', 'col10', 'col11', 'col12',
             'title_isnew_prob']  # real column names were replaced
cat_cols = sorted(list(set(df_tab.columns) - set(cont_cols) - {'condition'}))
valid_idx = range(len(df)-valid_sz, len(df))
procs = [FillMissing, Categorify, Normalize]


# In[8]:

data_tab = (TabularList.from_df(df_tab, cat_cols, cont_cols, procs=procs, path=PATH)
            .split_by_idx(valid_idx)
            .label_from_df(cols='condition')
            .databunch())


# ### Model

# In[9]:

learn_tab = tabular_learner(data_tab, layers=[64], ps=[0.5], emb_drop=0.05, metrics=accuracy)
learn_tab.load('tabular-model');


# In[10]:

# Chop off the classification head so the tabular model outputs its 64-d
# penultimate features instead of class scores.
learn_tab.model.layers = learn_tab.model.layers[:-3]
learn_tab.model


# ## 1.2. NLP model

# ### Data

# In[11]:

df_nlp = df[['title', 'condition']]


# In[12]:

vocab = pickle.load(open(PATH/'itos', 'rb'))  # vocabulary saved during language-model training
data_nlp = TextClasDataBunch.from_df(PATH, df_nlp[:-valid_sz], df_nlp[-valid_sz:],
                                     tokenizer=Tokenizer(lang='es'), vocab=vocab,
                                     text_cols='title', label_cols='condition')


# ### Model

# In[13]:

learn_nlp = text_classifier_learner(data_nlp, drop_mult=0.5)
learn_nlp.load('nlp-final');


# In[14]:

# Likewise, drop the final layers of the pooling classifier so the text model
# emits its 50-d hidden features instead of class scores.
learn_nlp.model[-1].layers = learn_nlp.model[-1].layers[:-3]
learn_nlp.model


# # 2. Concat model

# ### Data

# In[15]:

class ConcatDataset(Dataset):
    "Pairs the tabular and text inputs that share the same target."
    def __init__(self, x1, x2, y): self.x1,self.x2,self.y = x1,x2,y
    def __len__(self): return len(self.y)
    def __getitem__(self, i): return (self.x1[i], self.x2[i]), self.y[i]

train_ds = ConcatDataset(data_tab.train_ds.x, data_nlp.train_ds.x, data_tab.train_ds.y)
valid_ds = ConcatDataset(data_tab.valid_ds.x, data_nlp.valid_ds.x, data_tab.valid_ds.y)


# In[16]:

def my_collate(batch):
    "Stack the fixed-size tabular parts; pad the variable-length text parts."
    x,y = list(zip(*batch))
    x1,x2 = list(zip(*x))
    x1 = to_data(x1)
    x1 = list(zip(*x1))
    x1 = torch.stack(x1[0]), torch.stack(x1[1])  # (categorical, continuous)
    x2, y = pad_collate(list(zip(x2, y)), pad_idx=1, pad_first=True)
    return (x1, x2), y


# In[17]:

# Sort by text length (roughly, for training) so each batch needs little padding.
bs = 64
train_sampler = SortishSampler(data_nlp.train_ds.x, key=lambda t: len(data_nlp.train_ds[t][0].data), bs=bs//2)
valid_sampler = SortSampler(data_nlp.valid_ds.x, key=lambda t: len(data_nlp.valid_ds[t][0].data))


# In[18]:

train_dl = DataLoader(train_ds, bs//2, sampler=train_sampler)
valid_dl = DataLoader(valid_ds, bs, sampler=valid_sampler)
data = DataBunch(train_dl, valid_dl, device=defaults.device, collate_fn=my_collate, path=PATH)


# In[19]:

(x1,x2),y = next(iter(data.train_dl))
print(f'Shape tabular batch (cats/cont): {x1[0].shape} / {x1[1].shape}')
print(f'Shape nlp batch: {x2.shape}')
print(f'Shape dependent var: {y.shape}')


# ### Model

# In[20]:

class ConcatModel(nn.Module):
    "Runs both trunks, concatenates their features and classifies the result."
    def __init__(self, mod_tab, mod_nlp, layers, drops):
        super().__init__()
        self.mod_tab = mod_tab
        self.mod_nlp = mod_nlp
        lst_layers = []
        activs = [nn.ReLU(inplace=True),] * (len(layers)-2) + [None]
        for n_in,n_out,p,actn in zip(layers[:-1], layers[1:], drops, activs):
            lst_layers += bn_drop_lin(n_in, n_out, p=p, actn=actn)
        self.layers = nn.Sequential(*lst_layers)

    def forward(self, *x):
        x_tab = self.mod_tab(*x[0])
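        x_nlp = self.mod_nlp(x[1])[0]
        x = torch.cat([x_tab, x_nlp], dim=1)
        return self.layers(x)


# Before wiring the head, it is worth confirming that the two truncated trunks
# really emit the widths the head expects: 64 from the tabular trunk
# (`layers=[64]` above) and 50 from fastai's default text-classifier hidden
# layer. A minimal sanity check on the batch from In[19] (a sketch added here,
# not part of the original run):

# In[ ]:

learn_tab.model.eval(); learn_nlp.model.eval()
learn_nlp.model.reset()  # reset the RNN hidden state before a manual forward pass
with torch.no_grad():
    f_tab = learn_tab.model(*x1)    # expected: [bs, 64]
    f_nlp = learn_nlp.model(x2)[0]  # expected: [bs, 50]
print(f_tab.shape, f_nlp.shape)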
# In[21]:

# 64 features from the tabular trunk + 50 from the text trunk, mapped to the
# 2 condition classes.
lin_layers = [64+50, 2]
ps = [0.8]
model = ConcatModel(learn_tab.model, learn_nlp.model, lin_layers, ps)
model


# ### Learner

# In[22]:

# Keep the four lower ULMFiT groups of the text model as-is for gradual
# unfreezing; the last group bundles the text head, the whole tabular model
# and the new concat head, so it is the first part to be trained.
loss_func = nn.CrossEntropyLoss()
layer_groups = [nn.Sequential(*flatten_model(learn_nlp.layer_groups[0])),
                nn.Sequential(*flatten_model(learn_nlp.layer_groups[1])),
                nn.Sequential(*flatten_model(learn_nlp.layer_groups[2])),
                nn.Sequential(*flatten_model(learn_nlp.layer_groups[3])),
                nn.Sequential(*(flatten_model(learn_nlp.layer_groups[4]) +
                                flatten_model(model.mod_tab) +
                                flatten_model(model.layers)))]
learn = Learner(data, model, loss_func=loss_func, metrics=accuracy, layer_groups=layer_groups)


# ### Train!

# In[23]:

learn.freeze()
learn.lr_find()


# In[24]:

learn.recorder.plot()


# In[25]:

learn.fit_one_cycle(1, 1e-2, moms=(0.8, 0.7))


# In[26]:

learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(5e-3/(2.6**4), 5e-3), moms=(0.8, 0.7))


# In[27]:

learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(2e-3/(2.6**4), 2e-3), moms=(0.8, 0.7))


# In[28]:

learn.unfreeze()
learn.fit_one_cycle(5, slice(5e-4/(2.6**4), 5e-4), moms=(0.8, 0.7))


# In[29]:

learn.fit_one_cycle(5, slice(5e-4/(2.6**4), 5e-4), moms=(0.8, 0.7), wd=1e-1)
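
# With training done, persist the concat model and recompute validation
# accuracy from raw predictions. A minimal sketch -- 'concat-model' is a
# placeholder checkpoint name, not one used above:

# In[ ]:

preds, targs = learn.get_preds(ds_type=DatasetType.Valid)
print(f'Validation accuracy: {accuracy(preds, targs).item():.4f}')
learn.save('concat-model')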