import torch
from transformers import BertTokenizerFast, BertModel, BertConfig, BertTokenizer
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm as tqdm
import glob
import os
from sklearn.decomposition import PCA
import time
import plotly.express as px
from sklearn.manifold import TSNE
from scipy.spatial.distance import cdist, cosine
from gpytorch.kernels.rq_kernel import RQKernel
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
class BertClassifier(torch.nn.Module):
    def __init__(self, config, model, dim=256, num_classes=2):
        super(BertClassifier, self).__init__()
        # store the model config and the pretrained BERT base (configured to return layer-wise hidden states)
        self.config = config
        self.base = model
        # classifier head (only used during fine-tuning, not for extracting embeddings)
        self.head = torch.nn.Sequential(*[
            torch.nn.Dropout(p=self.config.hidden_dropout_prob),
            torch.nn.Linear(in_features=self.config.hidden_size, out_features=dim),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=self.config.hidden_dropout_prob),
            torch.nn.Linear(in_features=dim, out_features=num_classes)
        ])

    def forward(self, input_ids, attention_mask=None):
        # BERT returns the last-layer hidden states, the pooled [CLS] representation,
        # and (with output_hidden_states=True) the tuple of per-layer hidden states
        top_layer, pooled, layers = self.base(input_ids, attention_mask=attention_mask)
        outputs = self.head(pooled)
        return top_layer, outputs, layers
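A quick shape check clarifies what the forward pass returns; a minimal sketch, assuming a transformers version (3.x) in which BertModel returns a plain tuple of (last hidden state, pooled output, per-layer hidden states). The underscore-prefixed names are illustrative only.
_cfg = BertConfig.from_pretrained("bert-base-uncased", output_hidden_states=True)
_bert = BertModel.from_pretrained("bert-base-uncased", config=_cfg)
_clf = BertClassifier(config=_cfg, model=_bert, num_classes=2)
_ids = torch.randint(0, _cfg.vocab_size, (2, 16))   # dummy batch: 2 sequences of length 16
_mask = torch.ones_like(_ids)
_top, _logits, _layers = _clf(_ids, attention_mask=_mask)
print(_top.shape)      # torch.Size([2, 16, 768])
print(_logits.shape)   # torch.Size([2, 2])
print(len(_layers))    # 13: embedding output + 12 encoder layers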
class SentimentDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=512):
        self.tokenizer = tokenizer
        self.text = df.review_text.values
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        text = self.text[idx]
        # encode the text into padded/truncated tensors and return the attention mask as well
        encoding = self.tokenizer.encode_plus(
            text=text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }
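A minimal usage sketch of the dataset; the toy review text is made up, while the review_text column name and tokenizer match the cells below.
_sample_df = pd.DataFrame({"review_text": ["a short example review"]})
_sample_ds = SentimentDataset(df=_sample_df,
                              tokenizer=BertTokenizerFast.from_pretrained("bert-base-uncased"),
                              max_len=32)
_item = _sample_ds[0]
print(_item["input_ids"].shape, _item["attention_mask"].shape)  # torch.Size([32]) torch.Size([32])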
def model_predict(trained):
    trained.eval()
    dictionary_list = []
    df = pd.read_csv("./amazon-review/dvd-UL.csv")
    df = df.sample(n=1000, random_state=42)  # number of samples
    df = df.reset_index(drop=True)
    dataset = SentimentDataset(df=df, tokenizer=tokenizer)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=4,
        shuffle=False,
        num_workers=8
    )
    for bi, d in enumerate(tqdm(data_loader)):
        input_ids = d["input_ids"]
        attention_mask = d["attention_mask"]
        with torch.no_grad():
            _, _, output = trained(input_ids, attention_mask)
        output = output[1:]  # drop the embedding-layer output, keep the 12 encoder layers
        for zeta in range(len(output[0])):  # iterate over the samples in the batch
            for i in range(0, 12):
                # [CLS] token embedding of sample zeta at layer i+1
                new_row = {'embeddings': output[i][zeta][0].cpu().detach().numpy(), 'layers': i + 1}
                dictionary_list.append(new_row)
        np.save(f"./data/batch_{bi}", dictionary_list, allow_pickle=True)
        dictionary_list = []
if os.path.exists("./data"):
    files = glob.glob('./data/*')
    for f in files:
        os.remove(f)
else:
    os.makedirs("./data")
PATH = "books"+".pt" #change the model name here
model_name = "bert-base-uncased"
config = BertConfig.from_pretrained(model_name, output_hidden_states=True)
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)
bert = BertModel.from_pretrained(model_name, config=config)
classifier_trained = BertClassifier(config=config, model=bert, num_classes=2)
classifier_trained.load_state_dict(torch.load(PATH, map_location=torch.device('cpu')))  # the notebook keeps all tensors on CPU
model_predict(classifier_trained)
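Each batch is written to its own .npy file, so with 1,000 sampled reviews and batch_size=4 the call above should leave 250 files in ./data; a quick check (the expected count is inferred from the sampling and batch size above):
print(len(glob.glob("./data/*.npy")))  # expected: 1000 samples / batch_size 4 = 250 files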
dictionary_list = []
files = glob.glob("./data/*.npy")
for j in range(len(files)):
    alpha = np.load(f"./data/batch_{j}.npy", allow_pickle=True)
    for i in range(len(alpha)):
        new_row = {'embeddings': alpha[i]["embeddings"], 'layers': alpha[i]["layers"]}
        dictionary_list.append(new_row)
df = pd.DataFrame.from_dict(dictionary_list)
df
| | embeddings | layers |
|---|---|---|
| 0 | [0.029436039, 0.06670721, -0.22471415, -0.2367... | 1 |
| 1 | [-0.1554519, -0.21112284, -0.3408423, -0.20209... | 2 |
| 2 | [-0.12095504, -0.36359823, -0.17967358, -0.109... | 3 |
| 3 | [-0.21423775, -0.7461651, -0.6160757, -0.30794... | 4 |
| 4 | [-0.4974339, -0.85912985, -0.42627215, -0.5099... | 5 |
| ... | ... | ... |
| 11995 | [0.51743513, -0.6500383, -0.68353117, -0.22525... | 8 |
| 11996 | [0.42627597, -0.63389504, -0.19636014, -0.2719... | 9 |
| 11997 | [0.025212316, -0.5110682, 0.48476753, -0.35641... | 10 |
| 11998 | [0.04342799, -0.75802934, 0.5390331, -0.213192... | 11 |
| 11999 | [-0.29624316, -0.9558969, 0.48933977, -0.35488... | 12 |

12000 rows × 2 columns
df.to_pickle("./dvd-ft-b.pkl") #naming convention dvd-ft-b (dvd dataset on books finetuned)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
config = BertConfig.from_pretrained("bert-base-uncased", output_hidden_states=True)
model = BertModel.from_pretrained('bert-base-uncased', config=config)
if os.path.exists("./data"):
    files = glob.glob('./data/*')
    for f in files:
        os.remove(f)
else:
    os.makedirs("./data")
files = "./amazon-review/dvd-UL.csv"
dataset = pd.read_csv(files)
dataset = dataset.sample(n=1000, random_state=42)
dataset = dataset.reset_index(drop=True)
dictionary_list = []
count = 0
for i in tqdm(range(len(dataset))):
    encoding = tokenizer(dataset["review_text"][i], return_tensors='pt', padding='max_length', truncation=True, max_length=180)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
    hidden_states = outputs[2][1:]  # drop the embedding-layer output, keep the 12 encoder layers
    for j in range(0, 12):
        # [CLS] token embedding at layer j+1
        new_row = {'embeddings': hidden_states[j][0][0].cpu().detach().numpy(), 'layers': j + 1}
        dictionary_list.append(new_row)
    np.save(f"./data/batch_{count}", dictionary_list, allow_pickle=True)
    dictionary_list = []
    count += 1
dictionary_list = []
files = glob.glob("./data/*.npy")
for j in range(len(files)):
    alpha = np.load(f"./data/batch_{j}.npy", allow_pickle=True)
    for i in range(len(alpha)):
        new_row = {'embeddings': alpha[i]["embeddings"], 'layers': alpha[i]["layers"]}
        dictionary_list.append(new_row)
df = pd.DataFrame.from_dict(dictionary_list)
df.to_pickle("./dvd-pt.pkl") ##naming convention dvd-pt (dvd dataset on pretrain model)
df.layers.unique()
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
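With 1,000 sampled reviews and 12 layers each, every layer should contribute exactly 1,000 rows (12,000 in total, matching the fine-tuned DataFrame above); a quick check:
print(df.layers.value_counts())  # 1000 rows per layer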
df = pd.read_pickle("./books-ft-b.pkl")
df = pd.read_pickle("./books-pt.pkl")
mat = np.vstack(df.embeddings.values)  # (12000, 768) array of layer-wise [CLS] embeddings
start = time.time()
pca = PCA(n_components=3, random_state=42)
components = pca.fit_transform(mat)
print(time.time()-start)
0.5051431655883789
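TSNE is imported at the top but never used; a minimal sketch of the analogous t-SNE projection (parameters are illustrative, and t-SNE on 12,000 vectors of dimension 768 is much slower than PCA):
start = time.time()
tsne = TSNE(n_components=3, random_state=42)
tsne_components = tsne.fit_transform(mat)
print(time.time() - start)
fig_tsne = px.scatter_3d(
    tsne_components, x=0, y=1, z=2, color=df['layers'],
    title='Domain representation plotted with t-SNE',
    labels={'color': 'layers'}
)
fig_tsne.show()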
fig = px.scatter_3d(
    components, x=0, y=1, z=2, color=df['layers'],
    title='Domain representation plotted with PCA',
    labels={'color': 'layers'}
)
fig.update_layout(
    autosize=False,
    width=1080,
    height=720,
)
fig.show()
fig.write_html("3D_PCA_FT.html")
df = pd.read_pickle("./dvd-pt.pkl")
df1 = pd.read_pickle("./dvd-ft-b.pkl")
def mmd(first_tensor: torch.Tensor, second_tensor: torch.Tensor, kernel) -> float:
    """Compute the (biased) MMD estimate between two sets of samples.

    Parameters
    ----------
    first_tensor : torch.Tensor
        Tensor of shape (batch_size, m), where m is the dimension of the first sample
    second_tensor : torch.Tensor
        Tensor of shape (batch_size, n), where n is the dimension of the second sample
    kernel
        A gpytorch kernel used to compute the pairwise similarities

    Returns
    -------
    float
        MMD between the two samples
    """
    first_tensor_correlation = kernel(first_tensor, first_tensor).evaluate()
    second_tensor_correlation = kernel(second_tensor, second_tensor).evaluate()
    first_second_tensor_correlation = kernel(first_tensor, second_tensor).evaluate()
    m = first_tensor.size(0)
    n = second_tensor.size(0)
    sum_first_corr = first_tensor_correlation.sum().item()
    sum_second_corr = second_tensor_correlation.sum().item()
    sum_first_second_corr = first_second_tensor_correlation.sum().item()
    first_term = (1 / (m ** 2)) * sum_first_corr
    second_term = (1 / (n ** 2)) * sum_second_corr
    third_term = (2 / (m * n)) * sum_first_second_corr
    divergence = first_term + second_term - third_term
    return divergence
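A small sanity check of mmd on toy data (the RQKernel matches the loop below; the underscore names are illustrative): samples from the same distribution should give an MMD near zero, while shifted samples should give a clearly larger value.
torch.manual_seed(0)
_x = torch.randn(200, 8)
_y = torch.randn(200, 8)          # same distribution as _x
_z = torch.randn(200, 8) + 2.0    # mean-shifted distribution
_k = RQKernel()
print(mmd(_x, _y, _k))  # close to 0
print(mmd(_x, _z, _k))  # noticeably larger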
def closest_rows(a):
    # Pairwise cosine distances as a 2D array (the representational dissimilarity matrix)
    dists = cdist(a, a, 'cosine')
    # Filling the diagonal would only matter if we later took argmin indices to find
    # the closest rows; for RSA/MMD the raw distance matrix is used as-is.
    # dists.ravel()[::dists.shape[1]+1] = dists.max()+1
    return dists
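The RSA score in the loop below is the Pearson correlation between the two flattened cosine-distance (dissimilarity) matrices; a minimal sketch on toy data (all names are illustrative):
_rng = np.random.default_rng(0)
_repr_a = _rng.normal(size=(50, 16))                        # 50 "samples" from representation A
_repr_b = _repr_a + _rng.normal(scale=0.1, size=(50, 16))   # a slightly perturbed copy
_rdm_a = closest_rows(_repr_a)                              # 50 x 50 cosine-distance matrix
_rdm_b = closest_rows(_repr_b)
print(np.corrcoef(_rdm_a.flatten(), _rdm_b.flatten())[0, 1])  # close to 1 for similar representations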
pretrained_books_dvd_RSA = {}
pretrained_books_dvd_MMD = {}  # for storing MMD between the 2 models for each layer
for i in tqdm(range(1, 13)):
    temp1 = df[df["layers"] == i]
    temp2 = df1[df1["layers"] == i]  # selecting a specific layer
    lis1 = temp1["embeddings"].tolist()
    lis2 = temp2["embeddings"].tolist()
    a = np.array(lis1)
    b = np.array(lis2)
    result1 = closest_rows(a)
    result2 = closest_rows(b)
    pretrained_books_dvd_RSA[i] = np.corrcoef(result1.flatten(), result2.flatten())[0, 1]
    result1 = torch.from_numpy(result1)
    result2 = torch.from_numpy(result2)
    kernel = RQKernel()
    val = mmd(result1, result2, kernel)
    pretrained_books_dvd_MMD[i] = val
RSA between the pretrained BERT representations and BERT fine-tuned on books - DVD dataset
{1: 0.955398855024296, 2: 0.9499259151146981, 3: 0.9346616069127789, 4: 0.9586915720754712, 5: 0.9568238275750086, 6: 0.9182314339773345, 7: 0.8438659922811926, 8: 0.8498103209022732, 9: 0.8472093680343032, 10: 0.5935006317709525, 11: 0.37255983521918024, 12: 0.15755181885614458}
RSA between the pretrained BERT representations and BERT fine-tuned on books - books dataset
{1: 0.9527077794274325, 2: 0.9477798799964473, 3: 0.9226848173618823, 4: 0.9463699218593898, 5: 0.9197475682544565, 6: 0.8904403027804, 7: 0.8978173700287129, 8: 0.8799731485650455, 9: 0.8353729201603369, 10: 0.5456118638478861, 11: 0.33934223629028293, 12: 0.1056740137191123}
MMD between the pretrained BERT representations and BERT fine-tuned on books - DVD dataset
{1: 0.025523517691246278,
2: 0.017031062632806515,
3: 0.034405121243115344,
4: 0.0743630493421803,
5: 0.0658974319876604,
6: 0.21873141347150904,
7: 0.23004184150353701,
8: 0.16440297895429068,
9: 0.11399606384908612,
10: 0.2680937669944564,
11: 0.31493303470159223,
12: 0.397320439848271}
MMD between the pretrained BERT representations and BERT fine-tuned on books - books dataset
{1: 0.008592661067214902,
2: 0.008466682016583071,
3: 0.01607070255527243,
4: 0.03613209326039435,
5: 0.059290986788250954,
6: 0.0553308317517649,
7: 0.0686747228871778,
8: 0.060072450953288725,
9: 0.09658271662444762,
10: 0.2985128726786279,
11: 0.3239971706890749,
12: 0.4579824415026016}
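The per-layer trend (agreement is high in the lower layers and drops sharply in the top layers) is easier to see as a line plot; a minimal sketch using the dictionaries produced by the loop above (only the DVD-dataset variables are defined in this notebook state):
rsa_mmd_df = pd.DataFrame({
    "layer": list(pretrained_books_dvd_RSA.keys()),
    "RSA": list(pretrained_books_dvd_RSA.values()),
    "MMD": list(pretrained_books_dvd_MMD.values()),
})
px.line(rsa_mmd_df, x="layer", y="RSA",
        title="RSA per layer: pretrained vs. books fine-tuned BERT (DVD dataset)").show()
px.line(rsa_mmd_df, x="layer", y="MMD",
        title="MMD per layer: pretrained vs. books fine-tuned BERT (DVD dataset)").show()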