%reload_ext autoreload
%autoreload 2
from fastai import *
from fastai.tabular import *
from fastai.text import *
from fastai.metrics import accuracy
valid_sz = 10000
PATH = Path('~/data/').expanduser()
df = pd.read_feather(PATH/'listings-df')
df_tab = df.drop('title', axis=1)
cont_cols = ['col1', 'col2', 'col3', 'col4', 'col5', 'col6',
             'col7', 'col8', 'col9', 'col10', 'col11', 'col12',
             'title_isnew_prob']  # real column names were replaced
cat_cols = sorted(list(set(df_tab.columns) - set(cont_cols) - {'condition'}))
valid_idx = range(len(df)-valid_sz, len(df))
procs = [FillMissing, Categorify, Normalize]
data_tab = (TabularList.from_df(df_tab, cat_cols, cont_cols, procs=procs, path=PATH)
            .split_by_idx(valid_idx)
            .label_from_df(cols='condition')
            .databunch())
learn_tab = tabular_learner(data_tab, layers=[64], ps=[0.5], emb_drop=0.05, metrics=accuracy)
learn_tab.load('tabular-model');
learn_tab.model.layers = learn_tab.model.layers[:-3]  # chop the final BatchNorm1d, Dropout and Linear so the model outputs its 64-d hidden activations
learn_tab.model
TabularModel(
  (embeds): ModuleList(
    (0): Embedding(4, 3)
    (1): Embedding(10492, 50)
    (2): Embedding(3, 2)
    (3): Embedding(8, 5)
    (4): Embedding(1461, 50)
    (5): Embedding(286, 50)
    (6): Embedding(3481, 50)
    (7): Embedding(304, 50)
    (8): Embedding(570, 50)
    (9): Embedding(30, 16)
    (10): Embedding(26, 14)
    (11): Embedding(300, 50)
    (12): Embedding(33283, 50)
    (13): Embedding(5, 3)
    (14): Embedding(5, 3)
    (15): Embedding(3, 2)
    (16): Embedding(3, 2)
  )
  (emb_drop): Dropout(p=0.05)
  (bn_cont): BatchNorm1d(12, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=462, out_features=64, bias=True)
    (1): ReLU(inplace)
  )
)
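With the classification head chopped off, the tabular model now ends at the 64-unit linear layer. A quick sanity check I'd add here (not in the original run, it just reuses the data_tab DataBunch defined above) confirms that a batch comes out as 64-dimensional feature vectors rather than class scores:
# Illustrative check, not part of the original pipeline:
# the truncated TabularModel should now return [batch_size, 64] activations.
xb, yb = next(iter(data_tab.train_dl))   # xb = [categorical codes, continuous values]
feats = learn_tab.model(*xb)
print(feats.shape)                       # expected: torch.Size([batch_size, 64])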
df_nlp = df[['title', 'condition']]
vocab = pickle.load(open(PATH/'itos', 'rb'))
data_nlp = TextClasDataBunch.from_df(PATH, df_nlp[:-valid_sz], df_nlp[-valid_sz:],
                                     tokenizer=Tokenizer(lang='es'),
                                     vocab=vocab, text_cols='title', label_cols='condition')
learn_nlp = text_classifier_learner(data_nlp, drop_mult=0.5)
learn_nlp.load('nlp-final');
learn_nlp.model[-1].layers = learn_nlp.model[-1].layers[:-3]  # chop the classifier head down to the 50-d feature layer
learn_nlp.model
SequentialRNN(
  (0): MultiBatchRNNCore(
    (encoder): Embedding(22847, 400, padding_idx=1)
    (encoder_dp): EmbeddingDropout(
      (emb): Embedding(22847, 400, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDropout(
        (module): LSTM(400, 1150)
      )
      (1): WeightDropout(
        (module): LSTM(1150, 1150)
      )
      (2): WeightDropout(
        (module): LSTM(1150, 400)
      )
    )
    (input_dp): RNNDropout()
    (hidden_dps): ModuleList(
      (0): RNNDropout()
      (1): RNNDropout()
      (2): RNNDropout()
    )
  )
  (1): PoolingLinearClassifier(
    (layers): Sequential(
      (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Dropout(p=0.2)
      (2): Linear(in_features=1200, out_features=50, bias=True)
      (3): ReLU(inplace)
    )
  )
)
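The same trick on the text side leaves the PoolingLinearClassifier ending at the 50-unit linear layer. One detail matters for the concat model below: the classifier returns a tuple, and the pooled features sit in its first element, which is why the forward pass later uses `self.mod_nlp(x[1])[0]`. A rough check (my own sketch, reusing data_nlp from above):
# Illustrative check, not part of the original pipeline:
# element 0 of the classifier's output tuple should now hold [batch_size, 50] features.
xb, yb = next(iter(data_nlp.train_dl))
out = learn_nlp.model(xb)
print(out[0].shape)   # expected: torch.Size([batch_size, 50])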
class ConcatDataset(Dataset):
    def __init__(self, x1, x2, y): self.x1,self.x2,self.y = x1,x2,y
    def __len__(self): return len(self.y)
    def __getitem__(self, i): return (self.x1[i], self.x2[i]), self.y[i]
train_ds = ConcatDataset(data_tab.train_ds.x, data_nlp.train_ds.x, data_tab.train_ds.y)
valid_ds = ConcatDataset(data_tab.valid_ds.x, data_nlp.valid_ds.x, data_tab.valid_ds.y)
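Each item of the concatenated dataset pairs one tabular row with the corresponding title, sharing the single condition label. A quick way to eyeball one combined example (an illustration of mine, not in the original notebook):
# Illustrative: inspect one combined training example.
(row_tab, row_txt), label = train_ds[0]
print(row_tab)        # the tabular item (categorical/continuous values)
print(row_txt.text)   # the tokenized title
print(label)          # the condition label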
def my_collate(batch):
    # each sample is ((tabular_item, text_item), label)
    x,y = list(zip(*batch))
    x1,x2 = list(zip(*x))
    # tabular rows are fixed-size: pull out their (cats, conts) tensors and stack them
    x1 = to_data(x1)
    x1 = list(zip(*x1))
    x1 = torch.stack(x1[0]), torch.stack(x1[1])
    # titles vary in length, so pad them to the longest sequence in the batch
    x2, y = pad_collate(list(zip(x2, y)), pad_idx=1, pad_first=True)
    return (x1, x2), y
bs = 64
train_sampler = SortishSampler(data_nlp.train_ds.x, key=lambda t: len(data_nlp.train_ds[t][0].data), bs=bs//2)
valid_sampler = SortSampler(data_nlp.valid_ds.x, key=lambda t: len(data_nlp.valid_ds[t][0].data))
train_dl = DataLoader(train_ds, bs//2, sampler=train_sampler)
valid_dl = DataLoader(valid_ds, bs, sampler=valid_sampler)
data = DataBunch(train_dl, valid_dl, device=defaults.device, collate_fn=my_collate, path=PATH)
(x1,x2),y = next(iter(data.train_dl))
print(f'Shape tabular batch (cats/cont): {x1[0].shape} / {x1[1].shape}')
print(f'Shape nlp batch: {x2.shape}')
print(f'Shape dependent var: {y.shape}')
Shape tabular batch (cats/cont): torch.Size([32, 17]) / torch.Size([32, 12])
Shape nlp batch: torch.Size([42, 32])
Shape dependent var: torch.Size([32])
class ConcatModel(nn.Module):
    def __init__(self, mod_tab, mod_nlp, layers, drops):
        super().__init__()
        self.mod_tab = mod_tab
        self.mod_nlp = mod_nlp
        lst_layers = []
        activs = [nn.ReLU(inplace=True),] * (len(layers)-2) + [None]
        for n_in,n_out,p,actn in zip(layers[:-1], layers[1:], drops, activs):
            lst_layers += bn_drop_lin(n_in, n_out, p=p, actn=actn)
        self.layers = nn.Sequential(*lst_layers)

    def forward(self, *x):
        x_tab = self.mod_tab(*x[0])    # 64-d features from the truncated tabular model
        x_nlp = self.mod_nlp(x[1])[0]  # 50-d features: first element of the text classifier's output tuple
        x = torch.cat([x_tab, x_nlp], dim=1)
        return self.layers(x)
lin_layers = [64+50, 2]
ps = [0.8]
model = ConcatModel(learn_tab.model, learn_nlp.model, lin_layers, ps)
model
ConcatModel(
  (mod_tab): TabularModel(
    (embeds): ModuleList(
      (0): Embedding(4, 3)
      (1): Embedding(10492, 50)
      (2): Embedding(3, 2)
      (3): Embedding(8, 5)
      (4): Embedding(1461, 50)
      (5): Embedding(286, 50)
      (6): Embedding(3481, 50)
      (7): Embedding(304, 50)
      (8): Embedding(570, 50)
      (9): Embedding(30, 16)
      (10): Embedding(26, 14)
      (11): Embedding(300, 50)
      (12): Embedding(33283, 50)
      (13): Embedding(5, 3)
      (14): Embedding(5, 3)
      (15): Embedding(3, 2)
      (16): Embedding(3, 2)
    )
    (emb_drop): Dropout(p=0.05)
    (bn_cont): BatchNorm1d(12, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (layers): Sequential(
      (0): Linear(in_features=462, out_features=64, bias=True)
      (1): ReLU(inplace)
    )
  )
  (mod_nlp): SequentialRNN(
    (0): MultiBatchRNNCore(
      (encoder): Embedding(22847, 400, padding_idx=1)
      (encoder_dp): EmbeddingDropout(
        (emb): Embedding(22847, 400, padding_idx=1)
      )
      (rnns): ModuleList(
        (0): WeightDropout(
          (module): LSTM(400, 1150)
        )
        (1): WeightDropout(
          (module): LSTM(1150, 1150)
        )
        (2): WeightDropout(
          (module): LSTM(1150, 400)
        )
      )
      (input_dp): RNNDropout()
      (hidden_dps): ModuleList(
        (0): RNNDropout()
        (1): RNNDropout()
        (2): RNNDropout()
      )
    )
    (1): PoolingLinearClassifier(
      (layers): Sequential(
        (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (1): Dropout(p=0.2)
        (2): Linear(in_features=1200, out_features=50, bias=True)
        (3): ReLU(inplace)
      )
    )
  )
  (layers): Sequential(
    (0): BatchNorm1d(114, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Dropout(p=0.8)
    (2): Linear(in_features=114, out_features=2, bias=True)
  )
)
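Before training, it is worth pushing one batch through the combined model to make sure the plumbing works end to end: the 64-d and 50-d feature blocks are concatenated into the 114 inputs of the new head, which maps them to 2 classes. Something along these lines (my own sanity check, assuming the model and the DataBunch sit on the same default device):
# Illustrative forward pass, not part of the original training run.
model = model.to(defaults.device)
(x1, x2), y = next(iter(data.train_dl))
out = model(x1, x2)      # ConcatModel.forward(*x) receives x = (x1, x2)
print(out.shape)         # expected: torch.Size([32, 2]) with the train batch size of bs//2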
loss_func = nn.CrossEntropyLoss()
layer_groups = [nn.Sequential(*flatten_model(learn_nlp.layer_groups[0])),
                nn.Sequential(*flatten_model(learn_nlp.layer_groups[1])),
                nn.Sequential(*flatten_model(learn_nlp.layer_groups[2])),
                nn.Sequential(*flatten_model(learn_nlp.layer_groups[3])),
                nn.Sequential(*(flatten_model(learn_nlp.layer_groups[4]) +
                                flatten_model(model.mod_tab) +
                                flatten_model(model.layers)))]
learn = Learner(data, model, loss_func=loss_func, metrics=accuracy, layer_groups=layer_groups)
learn.freeze()
learn.lr_find()
LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.
learn.recorder.plot()
learn.fit_one_cycle(1, 1e-2, moms=(0.8, 0.7))
Total time: 00:37
epoch  train_loss  valid_loss  accuracy
1      0.106572    0.248390    0.920200  (00:37)
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(5e-3/(2.6**4), 5e-3), moms=(0.8, 0.7))
Total time: 00:40
epoch  train_loss  valid_loss  accuracy
1      0.086336    0.256554    0.919800  (00:40)
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(2e-3/(2.6**4), 2e-3), moms=(0.8, 0.7))
Total time: 01:03
epoch  train_loss  valid_loss  accuracy
1      0.097170    0.257217    0.919500  (01:03)
learn.unfreeze()
learn.fit_one_cycle(5, slice(5e-4/(2.6**4), 5e-4), moms=(0.8, 0.7))
Total time: 07:04
epoch  train_loss  valid_loss  accuracy
1      0.080045    0.260310    0.920200  (01:24)
2      0.075644    0.249944    0.922800  (01:26)
3      0.071381    0.271557    0.920900  (01:26)
4      0.078788    0.290130    0.919600  (01:24)
5      0.088786    0.268973    0.921800  (01:23)
learn.fit_one_cycle(5, slice(5e-4/(2.6**4), 5e-4), moms=(0.8, 0.7), wd=1e-1)
Total time: 06:51
epoch  train_loss  valid_loss  accuracy
1      0.077082    0.248748    0.924100  (01:21)
2      0.081846    0.249953    0.923700  (01:20)
3      0.088959    0.254498    0.920200  (01:23)
4      0.056842    0.249644    0.922800  (01:21)
5      0.067153    0.244735    0.922900  (01:24)