from zipfile import ZipFile

with ZipFile('bengaliai.zip', 'r') as zip_ref:
    zip_ref.extractall('bengaliai')

height = 137
width = 236
size = 128

tn = 'train_image_data_'
suf = '.parquet'
train = [tn + str(i) + suf for i in range(4)]
train

out_train = 'train.zip'

import cv2
import numpy as np
import pandas as pd

# Borrowed from iafoss' notebook
def bbox(img):
    rows = np.any(img, axis=1)
    cols = np.any(img, axis=0)
    rmin, rmax = np.where(rows)[0][[0, -1]]
    cmin, cmax = np.where(cols)[0][[0, -1]]
    return rmin, rmax, cmin, cmax

def crop_resize(img0, size=size, pad=16):
    # crop a box around pixels larger than the threshold;
    # some images contain lines at the sides
    ymin, ymax, xmin, xmax = bbox(img0[5:-5, 5:-5] > 80)
    # cropping may cut too much, so add some margin back
    xmin = xmin - 13 if (xmin > 13) else 0
    ymin = ymin - 10 if (ymin > 10) else 0
    xmax = xmax + 13 if (xmax < width - 13) else width
    ymax = ymax + 10 if (ymax < height - 10) else height
    img = img0[ymin:ymax, xmin:xmax]
    # remove low-intensity pixels as noise
    img[img < 28] = 0
    lx, ly = xmax - xmin, ymax - ymin
    l = max(lx, ly) + pad
    # pad to a square so the aspect ratio is kept when rescaling
    img = np.pad(img, [((l - ly) // 2,), ((l - lx) // 2,)], mode='constant')
    return cv2.resize(img, (size, size))

df = pd.read_parquet('bengaliai/' + train[0])
df.head()

import matplotlib.pyplot as plt

n_imgs = 8
fig, axs = plt.subplots(n_imgs, 2, figsize=(10, 5 * n_imgs))
for idx in range(n_imgs):
    # the original input is inverted, so flip it
    img0 = 255 - df.iloc[idx, 1:].values.reshape(height, width).astype(np.uint8)
    # normalize each image by its max value
    img = (img0 * (255.0 / img0.max())).astype(np.uint8)
    img = crop_resize(img)
    axs[idx, 0].imshow(img0)
    axs[idx, 0].set_title('Original image')
    axs[idx, 0].axis('off')
    axs[idx, 1].imshow(img)
    axs[idx, 1].set_title('Crop & resize')
    axs[idx, 1].axis('off')
plt.show()

from tqdm import tqdm

x_tot, x2_tot = [], []
with ZipFile(out_train, 'w') as img_out:
    for fname in train:
        df = pd.read_parquet('bengaliai/' + fname)
        # the input is inverted
        data = 255 - df.iloc[:, 1:].values.reshape(-1, height, width).astype(np.uint8)
        for idx in tqdm(range(len(df))):
            name = df.iloc[idx, 0]
            # normalize each image by its max value
            img = (data[idx] * (255.0 / data[idx].max())).astype(np.uint8)
            img = crop_resize(img)
            x_tot.append((img / 255.0).mean())
            x2_tot.append(((img / 255.0) ** 2).mean())
            img = cv2.imencode('.png', img)[1]
            img_out.writestr(name + '.png', img)

mean = np.array(x_tot).mean()
std = np.sqrt(np.array(x2_tot).mean() - mean ** 2)
print(f'mean: {mean}, std: {std}')

with ZipFile('train.zip', 'r') as zip_ref:
    zip_ref.extractall('images')

!pip install scikit-learn fastai fastdot

from fastai.vision.all import *

train = pd.read_csv('bengaliai/train.csv')
test = pd.read_csv('bengaliai/test.csv')
class_map = pd.read_csv('bengaliai/class_map.csv')
train.head()

graph_vocab = train['grapheme_root'].unique()
vowel_vocab = train['vowel_diacritic'].unique()
const_vocab = train['consonant_diacritic'].unique()

blocks = (ImageBlock(cls=PILImageBW),
          CategoryBlock(vocab=graph_vocab),
          CategoryBlock(vocab=vowel_vocab),
          CategoryBlock(vocab=const_vocab))

getters = [
    ColReader('image_id', pref='images/', suff='.png'),
    ColReader('grapheme_root'),
    ColReader('vowel_diacritic'),
    ColReader('consonant_diacritic')
]

batch_tfms = [*aug_transforms(do_flip=False, size=128),
              Normalize.from_stats(mean=0.0692, std=0.2051)]

bengel = DataBlock(blocks=blocks,
                   getters=getters,
                   splitter=RandomSplitter(),
                   batch_tfms=batch_tfms,
                   n_inp=1)

bs = 128
dls = bengel.dataloaders(train.sample(1000), bs=bs)
dls.show_batch(max_n=1, figsize=(3, 3))
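# Not part of the original notebook: a minimal sanity check of the DataBlock above.
# With n_inp=1 and three CategoryBlocks, one batch should unpack into an image
# tensor plus three label tensors (shapes assume bs=128 and size=128).
xb, yb_root, yb_vowel, yb_const = dls.one_batch()
print(xb.shape)                                       # e.g. torch.Size([128, 1, 128, 128])
print(yb_root.shape, yb_vowel.shape, yb_const.shape)  # three label tensors of shape [128]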
n = train[['grapheme_root', 'vowel_diacritic', 'consonant_diacritic']].nunique()
print(n)
dls.c

body = create_body(resnet34, pretrained=True)
# Swap the first conv for a 1-channel (grayscale) version, reusing the pretrained
# 3-channel weights by summing them over the input-channel dimension.
l = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
l.weight = nn.Parameter(body[0].weight.sum(dim=1, keepdim=True))
body[0] = l

from fastdot import *

def _fillcolor(o: str):
    if 'Lin' in o: return 'lightblue'
    elif 'ReLU' in o: return 'gray'
    elif 'Flatten' in o: return 'white'
    elif 'Pooling' in o: return 'pink'
    elif 'Conv' in o: return 'white'
    else: return 'gold'

node_defaults['fillcolor'] = _fillcolor

graph = ['2DPooling', 'Flatten', 'Lin (1024, 512)', 'ReLU', 'Lin (512, 168)']
vow = ['2DPooling', 'Flatten', 'Lin (1024, 512)', 'ReLU', 'Lin (512, 11)']
const = ['2DPooling', 'Flatten', 'Lin (1024, 512)', 'ReLU', 'Lin (512, 7)']
body1 = ['Conv2d (3, 512, 512)']
out = ['G (bs, 168)\n V (bs, 11)\n C (bs, 7)']

block1, block2, block3, block4, block5 = ['ResNet Body', 'Grapheme Head', 'Vowel Head', 'Consonant Head', 'Model Output']

conns = ((block1, block2), (block1, block3), (block1, block4),
         (block4, block5), (block3, block5), (block2, block5),
         (graph[-3], graph[4]), (vow[-3], vow[4]), (const[-3], const[4]))

visual = graph_items(seq_cluster(body1, block1),
                     seq_cluster(graph, block2),
                     seq_cluster(vow, block3),
                     seq_cluster(const, block4),
                     seq_cluster(out, block5))
visual.add_items(*object_connections(conns))
visual

from fastai.vision.all import *

class MultiModel(Module):
    "A three-headed model given a `body` and `n` output features"
    def __init__(self, body: nn.Sequential, n: L):
        nf = num_features_model(nn.Sequential(*body.children())) * (2)
        self.body = body
        self.grapheme = create_head(nf, n[0])
        self.vowel = create_head(nf, n[1])
        self.consonant = create_head(nf, n[2])

    def forward(self, x):
        y = self.body(x)
        graph = self.grapheme(y)
        vowel = self.vowel(y)
        const = self.consonant(y)
        return [graph, vowel, const]

net = MultiModel(body, dls.c)

from sklearn.metrics import recall_score

class CombinationLoss(Module):
    "Cross Entropy Loss on multiple targets"
    def __init__(self, func=F.cross_entropy, weights=[2, 1, 1]):
        self.func, self.w = func, weights

    def forward(self, xs, *ys, reduction='mean'):
        for i, w, x, y in zip(range(len(xs)), self.w, xs, ys):
            if i == 0:
                loss = w * self.func(x, y, reduction=reduction)
            else:
                loss += w * self.func(x, y, reduction=reduction)
        return loss

class RecallPartial(Metric):
    "Stores predictions and targets on CPU in accumulate to perform final calculations with `func`."
    def __init__(self, a=0, **kwargs):
        self.func = partial(recall_score, average='macro', zero_division=0)
        self.a = a

    def reset(self):
        self.targs, self.preds = [], []

    def accumulate(self, learn):
        pred = learn.pred[self.a].argmax(dim=-1)
        targ = learn.y[self.a]
        pred, targ = to_detach(pred), to_detach(targ)
        pred, targ = flatten_check(pred, targ)
        self.preds.append(pred)
        self.targs.append(targ)

    @property
    def value(self):
        if len(self.preds) == 0: return
        preds, targs = torch.cat(self.preds), torch.cat(self.targs)
        return self.func(targs, preds)

    @property
    def name(self):
        return train.columns[self.a + 1]

class RecallCombine(Metric):
    def accumulate(self, learn):
        scores = [learn.metrics[i].value for i in range(3)]
        self.combine = np.average(scores, weights=[2, 1, 1])

    @property
    def value(self):
        return self.combine

learn = Learner(dls, net,
                loss_func=CombinationLoss(),
                metrics=[RecallPartial(a=i) for i in range(len(dls.c))] + [RecallCombine()])

learn.fit_one_cycle(10, 1e-3)
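# Not part of the original notebook: a hedged follow-up sketch. After training,
# `learn.get_preds()` should return the validation predictions as one tensor per
# head (grapheme root, vowel diacritic, consonant diacritic) plus the targets.
preds, targs = learn.get_preds()
for p, col in zip(preds, ['grapheme_root', 'vowel_diacritic', 'consonant_diacritic']):
    print(col, p.shape)  # roughly (n_valid, 168), (n_valid, 11), (n_valid, 7)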