from google.colab import drive
drive.mount("/content/drive")
Mounted at /content/drive
!pip install transformers
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
...
Successfully installed huggingface-hub-0.8.1 pyyaml-6.0 tokenizers-0.12.1 transformers-4.20.1
import transformers
from transformers import AutoModel, AutoTokenizer, BertModel, AutoConfig, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import re
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device
device(type='cpu')
torch.cuda.is_available()
False
Stanford Sentiment Treebank (SST-2): a Stanford dataset for predicting the sentiment of movie-review snippets. The training set has 67,349 rows; the test set has 872 rows.
We use only a subset of the dataset due to limited computational resources.
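As an aside, the same splits can also be pulled directly from the Hugging Face Hub with the `datasets` library; a minimal sketch, assuming `datasets` is installed (`train_hf` and `test_hf` are names introduced here; the rest of this notebook keeps reading the TSVs from Drive):
from datasets import load_dataset  # assumes `pip install datasets`

sst2 = load_dataset("glue", "sst2")  # DatasetDict with train / validation / test splits
train_hf = sst2["train"].to_pandas()[["sentence", "label"]]
test_hf = sst2["validation"].to_pandas()[["sentence", "label"]]
print(train_hf.shape, test_hf.shape)  # (67349, 2) (872, 2)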
# Stanford Sentiment Treebank
train = pd.read_csv('./drive/MyDrive/Colab Notebooks/huggingface/datasets/sentiment-data/train.tsv', delimiter = '\t')
train = train.sample(12000)
test = pd.read_csv('./drive/MyDrive/Colab Notebooks/huggingface/datasets/sentiment-data/dev.tsv', delimiter = '\t')
# test = test.sample(48)
print("train.shape,test.shape",train.shape,test.shape)
train.shape,test.shape (12000, 2) (872, 2)
train.head(3)
| | sentence | label |
|---|---|---|
| 66730 | with outtakes in which most of the characters ... | 0 |
| 29890 | enigma is well-made | 1 |
| 45801 | is ) so stoked to make an important film about... | 0 |
train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 12000 entries, 66730 to 32957
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   sentence  12000 non-null  object
 1   label     12000 non-null  int64
dtypes: int64(1), object(1)
memory usage: 281.2+ KB
# check for missing values (none found)
train[train['sentence'].isna()]
Empty DataFrame
Columns: [sentence, label]
Index: []
Have a look at the counts of each label; they are roughly balanced.
# roughly balanced labels
train['label'].value_counts()
1    6756
0    5244
Name: label, dtype: int64
sns.countplot(x=train['label'])  # keyword arg avoids a seaborn FutureWarning
plt.xlabel('label count')
Text(0.5, 0, 'label count')
Have a look at the number of characters in a sentence.
# number of characters in a sentence
train['sentence'].map(len)
66730    145
29890     20
45801    163
29352     52
19858     18
        ...
60327     26
52950     34
63558     18
11941     41
32957     57
Name: sentence, Length: 12000, dtype: int64
train['sentence'].map(len).describe()
count    12000.000000
mean        53.761500
std         43.217947
min          2.000000
25%         21.000000
50%         39.000000
75%         75.000000
max        261.000000
Name: sentence, dtype: float64
test['sentence'].map(len).describe()
count    872.000000
mean     105.841743
std       48.133928
min        6.000000
25%       68.000000
50%      103.000000
75%      139.000000
max      244.000000
Name: sentence, dtype: float64
text_len_tr=train['sentence'].map(len)
text_len_tr.plot(kind='kde')
<matplotlib.axes._subplots.AxesSubplot at 0x7ff62ebb5710>
text_len_ts=test['sentence'].map(len)
text_len_ts.plot(kind='kde')
<matplotlib.axes._subplots.AxesSubplot at 0x7ff62ed3f850>
# number of sentences longer than 300 characters
print(sum(text_len_tr > 300), sum(text_len_ts > 300))
0 0
Use the pretrained BERT tokenizer to tokenize the sentences, remove sentences with an extremely large number of tokens, and check whether the labels are still balanced.
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
def token_len(sen):
    tokens = tokenizer.encode(sen, max_length=300, truncation=True)
    return len(tokens)
train['token_len'] = train['sentence'].map(token_len)
train_select = train.iloc[np.where(train['token_len'] <= 40)].reset_index()
print(train_select.shape)
sns.countplot(x=train_select['label'])
plt.xlabel('label count')
(11785, 4)
Text(0.5, 0, 'label count')
Use the pretrained BERT tokenizer to count the number of tokens in each sentence and plot the distribution. We use 40 as the maximum number of tokens per sentence; any sentence with more tokens was filtered out above.
# sample_txt = 'i love you'
# len(sample_txt)
# tokens = tokenizer.tokenize(sample_txt)
# token_ids = tokenizer.convert_tokens_to_ids(tokens)
# print(f'Text: {sample_txt}')
# print(f'Tokens: {tokens}')
# print(f'Token ids: {token_ids}')
# # special tokens
# print(tokenizer.sep_token, tokenizer.sep_token_id)
# print(tokenizer.unk_token, tokenizer.unk_token_id)
# print(tokenizer.pad_token, tokenizer.pad_token_id)
# print(tokenizer.cls_token, tokenizer.cls_token_id)
# print(tokenizer.mask_token, tokenizer.mask_token_id)
# encoding = tokenizer.encode_plus(
#     sample_txt,
#     # sample_txt_another,
#     max_length=32,
#     add_special_tokens=True,  # adds [CLS] and [SEP]
#     return_token_type_ids=True,
#     padding='max_length',
#     return_attention_mask=True,
#     return_tensors='pt',  # return PyTorch tensors
# )
# print(len(encoding['input_ids'][0]))
# print(encoding)
# print(len(encoding['attention_mask'][0]))
# print(encoding['attention_mask'])
# encoding.keys()
# tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
token_lens = []
for txt in train_select.sentence:
    tokens = tokenizer.encode(txt, max_length=300, truncation=True)
    token_lens.append(len(tokens))
sns.histplot(token_lens, kde=True)  # distplot is deprecated in recent seaborn
plt.xlim([0, 60]);
plt.xlabel('Token count');
# max number of tokens in a sentence
max(token_lens)
40
sns.histplot(token_lens, kde=True)
plt.xlim([0, 50]);
plt.xlabel('Token count');
MAX_LEN = 40
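Before wrapping this in a Dataset class, a quick sanity check (illustrative, not part of the original run; `sample_enc` is a name introduced here) of what `encode_plus` produces at MAX_LEN:
# encode one sentence with the same settings the dataset class below uses
sample_enc = tokenizer.encode_plus(
    train_select['sentence'].iloc[0],
    add_special_tokens=True,
    max_length=MAX_LEN,
    truncation=True,
    padding='max_length',
    return_attention_mask=True,
    return_tensors='pt',
)
print(sample_enc['input_ids'].shape)       # torch.Size([1, 40]): padded/truncated to MAX_LEN
print(sample_enc['attention_mask'].sum())  # number of real (non-padding) tokens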
Construct a dataset class: given the texts, labels, tokenizer, and max length, it outputs input ids and attention masks, which are the inputs to the BERT model. Split the dataset into training, validation, and test sets, then construct data loaders with a pre-specified batch size of 256.
class SSTDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        # print(encoding['input_ids'])
        return {
            'texts': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # token_type_ids are all zeros for single-sentence inputs
            'labels': torch.tensor(label, dtype=torch.long)
        }
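A quick, hedged usage check of the class (illustrative; `demo_ds` and `item` are names introduced here, not part of the original run):
demo_ds = SSTDataset(
    texts=train_select['sentence'].values,
    labels=train_select['label'].values,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)
item = demo_ds[0]
print(item.keys())              # dict_keys(['texts', 'input_ids', 'attention_mask', 'labels'])
print(item['input_ids'].shape)  # torch.Size([40])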
# split the dataset into train, val, and test sets
df_train, df_test = train_test_split(train_select, test_size=0.2, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)
df_train.shape, df_val.shape, df_test.shape
((9428, 4), (1178, 4), (1179, 4))
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = SSTDataset(
        texts=df['sentence'].values,
        labels=df['label'].values,
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
        # num_workers=4  # multi-process loading; can be problematic on Windows
    )
BATCH_SIZE = 256
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)
len(train_data_loader)  # number of batches = ceil(9428 / 256)
37
# have a look at the data loader
next(iter(train_data_loader))
{'attention_mask': tensor([[1, 1, 1, ..., 0, 0, 0],
                           [1, 1, 1, ..., 0, 0, 0],
                           ...,
                           [1, 1, 1, ..., 0, 0, 0]]),
 'input_ids': tensor([[  101,  2108,  1037, ...,     0,     0,     0],
                      [  101, 22387,  2125, ...,     0,     0,     0],
                      ...,
                      [  101,  1006,  2009, ...,     0,     0,     0]]),
 'labels': tensor([1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, ..., 0, 0, 0, 0, 0, 1, 1]),
 'texts': ['being a true adaptation of her book ', 'jerking off ', 'are missing -- ', 'little substance ', ..., "( it 's ) what punk rock music used to be , and "]}
data = next(iter(train_data_loader))
data.keys()
dict_keys(['texts', 'input_ids', 'attention_mask', 'labels'])
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['labels'].shape)
torch.Size([256, 40])
torch.Size([256, 40])
torch.Size([256])
Load the pretrained BERT model and build a classifier on top of it with 2 neurons in the output layer. The classifier wraps the BERT encoder: it takes the input ids and attention mask produced by the dataset and outputs a logit for each class.
bert_model = AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
bert_model
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation()
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1)-(11): eleven more BertLayer blocks identical to (0) (elided)
    )
  )
  (pooler): BertPooler(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
  )
)
# last_hidden_state, pooled_output = bert_model(
# input_ids=encoding['input_ids'],
# attention_mask=encoding['attention_mask'],
# return_dict = False
# )
# last_hidden_state # vectors of tokens
# pooled_output
# print(last_hidden_state.shape)
# print(pooled_output.shape)
# print(bert_model.config.hidden_size)
# print(pooled_output.shape)
class SSTClassifier(nn.Module):
    def __init__(self, n_classes):
        super(SSTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)  # one logit per class

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False
        )
        output = self.drop(pooled_output)  # dropout on the pooled [CLS] representation
        return self.out(output)
# the output layer has one neuron per class
class_names=[0,1]
model = SSTClassifier(len(class_names))
model = model.to(device)
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)
print(input_ids.shape) # batch size x seq length
print(attention_mask.shape) # batch size x seq length
# model(input_ids, attention_mask)
# F.softmax(model(input_ids, attention_mask), dim=1)
torch.Size([256, 40]) torch.Size([256, 40])
The AdamW optimizer is the Adam algorithm with decoupled weight decay. Under plain Adam, an L2 penalty gets scaled by the adaptive per-parameter learning rates, so weights with large gradients end up regularized less than intended; decoupling the decay from the gradient update avoids this, which the AdamW paper showed yields better training loss and generalization error. A small learning rate (1e-5) is used because we are fine-tuning a pretrained model rather than training from scratch.
get_linear_schedule_with_warmup creates a schedule in which the learning rate first increases linearly from 0 to the initial lr set in the optimizer over a warmup period, then decreases linearly from that initial lr back to 0 over the remaining training steps.
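Concretely, the multiplier applied to the optimizer's base learning rate at each step looks roughly like this (a minimal sketch of the schedule's shape, not the library's own code):
def linear_warmup_multiplier(step, num_warmup_steps, num_training_steps):
    # warmup phase: multiplier rises linearly from 0 to 1
    if step < num_warmup_steps:
        return step / max(1, num_warmup_steps)
    # decay phase: multiplier falls linearly from 1 back to 0
    return max(0.0, (num_training_steps - step) / max(1, num_training_steps - num_warmup_steps))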
In each epoch, we train the model on the training data and evaluate it on the validation data, saving a checkpoint whenever validation accuracy improves. Training runs for 3 epochs in total and takes a few hours on Colab. Afterwards we evaluate the model on held-out test data; it reaches over 90% accuracy.
EPOCHS = 3
optimizer = AdamW(model.parameters(), lr=1e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)
/usr/local/lib/python3.7/dist-packages/transformers/optimization.py:310: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning FutureWarning,
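The warning above recommends PyTorch's built-in implementation. Here is a sketch of the equivalent setup with torch.optim.AdamW; note two assumptions worth flagging: torch's weight_decay defaults to 0.01 rather than the 0.0 used by the transformers version, and torch always applies bias correction, so there is no correct_bias=False counterpart.
# sketch: same optimizer/scheduler setup with the non-deprecated PyTorch AdamW
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.0)  # weight_decay=0.0 mirrors the transformers default
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)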
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()  # enable dropout
    losses = []
    correct_predictions = 0
    print('start')
    for ind_, d in enumerate(data_loader):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["labels"].to(device)
        print(f'run model: {ind_}')
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        _, preds = torch.max(outputs, dim=1)  # predicted class per example
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        loss.backward()
        # clip gradients to avoid exploding updates
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()  # disable dropout
    losses = []
    correct_predictions = 0
    print('start')
    with torch.no_grad():  # no gradients needed during evaluation
        for ind_, d in enumerate(data_loader):
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["labels"].to(device)
            print(f'run model: {ind_}')
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)
# torch.save(model.state_dict(), './drive/MyDrive/Colab Notebooks/huggingface/best_model_state.bin')
history = defaultdict(list)
best_accuracy = 0
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)
    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train)
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')
    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val)
    )
    print(f'Val loss {val_loss} accuracy {val_acc}')
    print()
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    # checkpoint the model whenever validation accuracy improves
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), './drive/MyDrive/Colab Notebooks/huggingface/best_model_state.bin')
        best_accuracy = val_acc
Epoch 1/3
----------
Train loss 0.41595339654265223 accuracy 0.7976240984302079
Val loss 0.24612324237823485 accuracy 0.9032258064516129

Epoch 2/3
----------
Train loss 0.18753797158196167 accuracy 0.9304200254560883
Val loss 0.23452322781085969 accuracy 0.9134125636672326

Epoch 3/3
----------
Train loss 0.12577906734234579 accuracy 0.9570428510818838
Val loss 0.2435760974884033 accuracy 0.9134125636672326
(per-batch 'start' / 'run model' progress prints omitted)
plt.plot(history['train_acc'], label='train accuracy')
plt.plot(history['val_acc'], label='validation accuracy')
plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);
test_acc, _ = eval_model(
    model,
    test_data_loader,
    loss_fn,
    device,
    len(df_test)
)
test_acc.item()
0.9168787107718406
def get_predictions(model, data_loader):
    model = model.eval()
    raw_texts = []
    predictions = []
    prediction_probs = []
    real_values = []
    with torch.no_grad():
        for d in data_loader:
            texts = d["texts"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["labels"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            probs = F.softmax(outputs, dim=1)  # class probabilities
            raw_texts.extend(texts)
            predictions.extend(preds)
            prediction_probs.extend(probs)
            real_values.extend(targets)
    # gather the per-batch results into single CPU tensors
    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    real_values = torch.stack(real_values).cpu()
    return raw_texts, predictions, prediction_probs, real_values
y_texts, y_pred, y_pred_probs, y_test = get_predictions(
    model,
    test_data_loader
)
print(classification_report(y_test, y_pred, target_names=[str(label) for label in class_names]))
              precision    recall  f1-score   support

           0       0.89      0.93      0.91       529
           1       0.94      0.90      0.92       650

    accuracy                           0.92      1179
   macro avg       0.92      0.92      0.92      1179
weighted avg       0.92      0.92      0.92      1179
def show_confusion_matrix(confusion_matrix):
    hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
    hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
    plt.ylabel('True label')
    plt.xlabel('Predicted label');
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)
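As a final sanity check, a single raw review can be scored directly. This is a minimal sketch reusing the tokenizer, model, device and class_names defined earlier in the notebook; max_length=40 is assumed to match the batch shape printed above, and the example phrase is taken from the batch dump.
# sketch: classify one review phrase with the fine-tuned model
review = "a powerful drama with enough sardonic wit"
encoding = tokenizer.encode_plus(
    review,
    max_length=40,  # assumed to match the seq length used for training
    truncation=True,
    padding='max_length',
    return_token_type_ids=False,
    return_attention_mask=True,
    return_tensors='pt'
)
model.eval()
with torch.no_grad():
    logits = model(
        input_ids=encoding['input_ids'].to(device),
        attention_mask=encoding['attention_mask'].to(device)
    )
    probs = F.softmax(logits, dim=1)
pred = torch.argmax(probs, dim=1).item()
print(f'review: {review}')
print(f'predicted: {class_names[pred]} (p={probs[0][pred]:.3f})')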