import torch
from transformers import BertTokenizerFast, BertModel, BertConfig, BertTokenizer
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm as tqdm
import glob
import os
from sklearn.decomposition import PCA
import time
import plotly.express as px
from sklearn.manifold import TSNE
from scipy.spatial.distance import cdist, cosine
from gpytorch.kernels.rq_kernel import RQKernel
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
class BertClassifier(torch.nn.Module):
    def __init__(self, config, model, dim=256, num_classes=2):
        super(BertClassifier, self).__init__()
        # store the model config and the pretrained BERT base (configured to return layer-wise hidden states)
        self.config = config
        self.base = model
        # classifier head (only used during fine-tuning, not for extracting embeddings)
        self.head = torch.nn.Sequential(*[
            torch.nn.Dropout(p=self.config.hidden_dropout_prob),
            torch.nn.Linear(in_features=self.config.hidden_size, out_features=dim),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=self.config.hidden_dropout_prob),
            torch.nn.Linear(in_features=dim, out_features=num_classes)
        ])

    def forward(self, input_ids, attention_mask=None):
        # BERT returns the last-layer hidden states, the pooled [CLS] representation,
        # and (with output_hidden_states=True) the tuple of per-layer hidden states
        top_layer, pooled, layers = self.base(input_ids, attention_mask=attention_mask)
        outputs = self.head(pooled)
        return top_layer, outputs, layers
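A quick shape check clarifies what the forward pass returns; a minimal sketch, assuming a transformers version (3.x) in which BertModel returns a plain tuple of (last hidden state, pooled output, per-layer hidden states). The underscore-prefixed names are illustrative only.
_cfg = BertConfig.from_pretrained("bert-base-uncased", output_hidden_states=True)
_bert = BertModel.from_pretrained("bert-base-uncased", config=_cfg)
_clf = BertClassifier(config=_cfg, model=_bert, num_classes=2)
_ids = torch.randint(0, _cfg.vocab_size, (2, 16))   # dummy batch: 2 sequences of length 16
_mask = torch.ones_like(_ids)
_top, _logits, _layers = _clf(_ids, attention_mask=_mask)
print(_top.shape)      # torch.Size([2, 16, 768])
print(_logits.shape)   # torch.Size([2, 2])
print(len(_layers))    # 13: embedding output + 12 encoder layers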
class SentimentDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=512):
        self.tokenizer = tokenizer
        self.text = df.review_text.values
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        text = self.text[idx]
        # encode the text into padded/truncated tensors and return the attention mask as well
        encoding = self.tokenizer.encode_plus(
            text=text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }
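A minimal usage sketch of the dataset; the toy review text is made up, while the review_text column name and tokenizer match the cells below.
_sample_df = pd.DataFrame({"review_text": ["a short example review"]})
_sample_ds = SentimentDataset(df=_sample_df,
                              tokenizer=BertTokenizerFast.from_pretrained("bert-base-uncased"),
                              max_len=32)
_item = _sample_ds[0]
print(_item["input_ids"].shape, _item["attention_mask"].shape)  # torch.Size([32]) torch.Size([32])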
def model_predict(trained):
    trained.eval()
    dictionary_list = []
    df = pd.read_csv("./amazon-review/dvd-UL.csv")
    df = df.sample(n=1000, random_state=42)  # number of samples
    df = df.reset_index(drop=True)
    dataset = SentimentDataset(df=df, tokenizer=tokenizer)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=4,
        shuffle=False,
        num_workers=8
    )
    for bi, d in enumerate(tqdm(data_loader)):
        input_ids = d["input_ids"]
        attention_mask = d["attention_mask"]
        with torch.no_grad():
            _, _, output = trained(input_ids, attention_mask)
        output = output[1:]  # drop the embedding-layer output, keep the 12 encoder layers
        for zeta in range(len(output[0])):  # iterate over the samples in the batch
            for i in range(0, 12):
                # [CLS] token embedding of sample zeta at layer i+1
                new_row = {'embeddings': output[i][zeta][0].cpu().detach().numpy(), 'layers': i + 1}
                dictionary_list.append(new_row)
        np.save(f"./data/batch_{bi}", dictionary_list, allow_pickle=True)
        dictionary_list = []
if os.path.exists("./data"):
    files = glob.glob('./data/*')
    for f in files:
        os.remove(f)
else:
    os.makedirs("./data")
PATH = "books"+".pt" #change the model name here
model_name = "bert-base-uncased"
config = BertConfig.from_pretrained(model_name, output_hidden_states=True)
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)
bert = BertModel.from_pretrained(model_name, config=config)
classifier_trained = BertClassifier(config=config, model=bert, num_classes=2)
classifier_trained.load_state_dict(torch.load(PATH, map_location=torch.device('cpu')))  # the notebook keeps all tensors on CPU
model_predict(classifier_trained)
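Each batch is written to its own .npy file, so with 1,000 sampled reviews and batch_size=4 the call above should leave 250 files in ./data; a quick check (the expected count is inferred from the sampling and batch size above):
print(len(glob.glob("./data/*.npy")))  # expected: 1000 samples / batch_size 4 = 250 files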
dictionary_list = []
files = glob.glob("./data/*.npy")
for j in range(len(files)):
    alpha = np.load(f"./data/batch_{j}.npy", allow_pickle=True)
    for i in range(len(alpha)):
        new_row = {'embeddings': alpha[i]["embeddings"], 'layers': alpha[i]["layers"]}
        dictionary_list.append(new_row)
df = pd.DataFrame.from_dict(dictionary_list)
df
| | embeddings | layers |
|---|---|---|
| 0 | [0.029436039, 0.06670721, -0.22471415, -0.2367... | 1 |
| 1 | [-0.1554519, -0.21112284, -0.3408423, -0.20209... | 2 |
| 2 | [-0.12095504, -0.36359823, -0.17967358, -0.109... | 3 |
| 3 | [-0.21423775, -0.7461651, -0.6160757, -0.30794... | 4 |
| 4 | [-0.4974339, -0.85912985, -0.42627215, -0.5099... | 5 |
| ... | ... | ... |
| 11995 | [0.51743513, -0.6500383, -0.68353117, -0.22525... | 8 |
| 11996 | [0.42627597, -0.63389504, -0.19636014, -0.2719... | 9 |
| 11997 | [0.025212316, -0.5110682, 0.48476753, -0.35641... | 10 |
| 11998 | [0.04342799, -0.75802934, 0.5390331, -0.213192... | 11 |
| 11999 | [-0.29624316, -0.9558969, 0.48933977, -0.35488... | 12 |

12000 rows × 2 columns
df.to_pickle("./dvd-ft-b.pkl") #naming convention dvd-ft-b (dvd dataset on books finetuned)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
config = BertConfig.from_pretrained("bert-base-uncased", output_hidden_states=True)
model = BertModel.from_pretrained('bert-base-uncased', config=config)
if os.path.exists("./data"):
    files = glob.glob('./data/*')
    for f in files:
        os.remove(f)
else:
    os.makedirs("./data")
files = "./amazon-review/dvd-UL.csv"
dataset = pd.read_csv(files)
dataset = dataset.sample(n=1000, random_state=42)
dataset = dataset.reset_index(drop=True)
dictionary_list = []
count = 0
for i in tqdm(range(len(dataset))):
    encoding = tokenizer(dataset["review_text"][i], return_tensors='pt', padding='max_length', truncation=True, max_length=180)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
    hidden_states = outputs[2][1:]  # drop the embedding-layer output, keep the 12 encoder layers
    for j in range(0, 12):
        # [CLS] token embedding at layer j+1
        new_row = {'embeddings': hidden_states[j][0][0].cpu().detach().numpy(), 'layers': j + 1}
        dictionary_list.append(new_row)
    np.save(f"./data/batch_{count}", dictionary_list, allow_pickle=True)
    dictionary_list = []
    count += 1
dictionary_list = []
files = glob.glob("./data/*.npy")
for j in range(len(files)):
    alpha = np.load(f"./data/batch_{j}.npy", allow_pickle=True)
    for i in range(len(alpha)):
        new_row = {'embeddings': alpha[i]["embeddings"], 'layers': alpha[i]["layers"]}
        dictionary_list.append(new_row)
df = pd.DataFrame.from_dict(dictionary_list)
df.to_pickle("./dvd-pt.pkl") ##naming convention dvd-pt (dvd dataset on pretrain model)
df.layers.unique()
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
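With 1,000 sampled reviews and 12 layers each, every layer should contribute exactly 1,000 rows (12,000 in total, matching the fine-tuned DataFrame above); a quick check:
print(df.layers.value_counts())  # 1000 rows per layer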
df = pd.read_pickle("./books-ft-b.pkl")
df = pd.read_pickle("./books-pt.pkl")
mat = np.vstack(df.embeddings.values)  # (12000, 768) array of layer-wise [CLS] embeddings
start = time.time()
pca = PCA(n_components=3, random_state=42)
components = pca.fit_transform(mat)
print(time.time()-start)
0.5051431655883789
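TSNE is imported at the top but never used; a minimal sketch of the analogous t-SNE projection (parameters are illustrative, and t-SNE on 12,000 vectors of dimension 768 is much slower than PCA):
start = time.time()
tsne = TSNE(n_components=3, random_state=42)
tsne_components = tsne.fit_transform(mat)
print(time.time() - start)
fig_tsne = px.scatter_3d(
    tsne_components, x=0, y=1, z=2, color=df['layers'],
    title='Domain representation plotted with t-SNE',
    labels={'color': 'layers'}
)
fig_tsne.show()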
fig = px.scatter_3d(
    components, x=0, y=1, z=2, color=df['layers'],
    title='Domain representation plotted with PCA',
    labels={'color': 'layers'}
)
fig.update_layout(
    autosize=False,
    width=1080,
    height=720,
)
fig.show()
fig.write_html("3D_PCA_FT.html")
df = pd.read_pickle("./dvd-pt.pkl")
df1 = pd.read_pickle("./dvd-ft-b.pkl")
def mmd(first_tensor: torch.Tensor, second_tensor: torch.Tensor, kernel) -> float:
    """Compute the (biased) MMD estimate between two sets of samples.

    Parameters
    ----------
    first_tensor : torch.Tensor
        Tensor of shape (batch_size, m), where m is the dimension of the first sample
    second_tensor : torch.Tensor
        Tensor of shape (batch_size, n), where n is the dimension of the second sample
    kernel
        A gpytorch kernel used to compute the pairwise similarities

    Returns
    -------
    float
        MMD between the two samples
    """
    first_tensor_correlation = kernel(first_tensor, first_tensor).evaluate()
    second_tensor_correlation = kernel(second_tensor, second_tensor).evaluate()
    first_second_tensor_correlation = kernel(first_tensor, second_tensor).evaluate()
    m = first_tensor.size(0)
    n = second_tensor.size(0)
    sum_first_corr = first_tensor_correlation.sum().item()
    sum_second_corr = second_tensor_correlation.sum().item()
    sum_first_second_corr = first_second_tensor_correlation.sum().item()
    first_term = (1 / (m ** 2)) * sum_first_corr
    second_term = (1 / (n ** 2)) * sum_second_corr
    third_term = (2 / (m * n)) * sum_first_second_corr
    divergence = first_term + second_term - third_term
    return divergence
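A small sanity check of mmd on toy data (the RQKernel matches the loop below; the underscore names are illustrative): samples from the same distribution should give an MMD near zero, while shifted samples should give a clearly larger value.
torch.manual_seed(0)
_x = torch.randn(200, 8)
_y = torch.randn(200, 8)          # same distribution as _x
_z = torch.randn(200, 8) + 2.0    # mean-shifted distribution
_k = RQKernel()
print(mmd(_x, _y, _k))  # close to 0
print(mmd(_x, _z, _k))  # noticeably larger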
def closest_rows(a):
    # Pairwise cosine distances as a 2D array (the representational dissimilarity matrix)
    dists = cdist(a, a, 'cosine')
    # Filling the diagonal would only matter if we later took argmin indices to find
    # the closest rows; for RSA/MMD the raw distance matrix is used as-is.
    # dists.ravel()[::dists.shape[1]+1] = dists.max()+1
    return dists
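The RSA score in the loop below is the Pearson correlation between the two flattened cosine-distance (dissimilarity) matrices; a minimal sketch on toy data (all names are illustrative):
_rng = np.random.default_rng(0)
_repr_a = _rng.normal(size=(50, 16))                        # 50 "samples" from representation A
_repr_b = _repr_a + _rng.normal(scale=0.1, size=(50, 16))   # a slightly perturbed copy
_rdm_a = closest_rows(_repr_a)                              # 50 x 50 cosine-distance matrix
_rdm_b = closest_rows(_repr_b)
print(np.corrcoef(_rdm_a.flatten(), _rdm_b.flatten())[0, 1])  # close to 1 for similar representations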
pretrained_books_dvd_RSA = {}
pretrained_books_dvd_MMD = {}  # for storing MMD between the 2 models for each layer
for i in tqdm(range(1, 13)):
    temp1 = df[df["layers"] == i]
    temp2 = df1[df1["layers"] == i]  # selecting a specific layer
    lis1 = temp1["embeddings"].tolist()
    lis2 = temp2["embeddings"].tolist()
    a = np.array(lis1)
    b = np.array(lis2)
    result1 = closest_rows(a)
    result2 = closest_rows(b)
    pretrained_books_dvd_RSA[i] = np.corrcoef(result1.flatten(), result2.flatten())[0, 1]
    result1 = torch.from_numpy(result1)
    result2 = torch.from_numpy(result2)
    kernel = RQKernel()
    val = mmd(result1, result2, kernel)
    pretrained_books_dvd_MMD[i] = val
RSA between the pretrained BERT representations and BERT fine-tuned on books - DVD dataset
{1: 0.955398855024296, 2: 0.9499259151146981, 3: 0.9346616069127789, 4: 0.9586915720754712, 5: 0.9568238275750086, 6: 0.9182314339773345, 7: 0.8438659922811926, 8: 0.8498103209022732, 9: 0.8472093680343032, 10: 0.5935006317709525, 11: 0.37255983521918024, 12: 0.15755181885614458}
RSA between the pretrained BERT representations and BERT fine-tuned on books - books dataset
{1: 0.9527077794274325, 2: 0.9477798799964473, 3: 0.9226848173618823, 4: 0.9463699218593898, 5: 0.9197475682544565, 6: 0.8904403027804, 7: 0.8978173700287129, 8: 0.8799731485650455, 9: 0.8353729201603369, 10: 0.5456118638478861, 11: 0.33934223629028293, 12: 0.1056740137191123}
MMD between the pretrained BERT representations and BERT fine-tuned on books - DVD dataset
{1: 0.025523517691246278,
2: 0.017031062632806515,
3: 0.034405121243115344,
4: 0.0743630493421803,
5: 0.0658974319876604,
6: 0.21873141347150904,
7: 0.23004184150353701,
8: 0.16440297895429068,
9: 0.11399606384908612,
10: 0.2680937669944564,
11: 0.31493303470159223,
12: 0.397320439848271}
MMD between the pretrained BERT representations and BERT fine-tuned on books - books dataset
{1: 0.008592661067214902,
2: 0.008466682016583071,
3: 0.01607070255527243,
4: 0.03613209326039435,
5: 0.059290986788250954,
6: 0.0553308317517649,
7: 0.0686747228871778,
8: 0.060072450953288725,
9: 0.09658271662444762,
10: 0.2985128726786279,
11: 0.3239971706890749,
12: 0.4579824415026016}
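The per-layer trend (agreement is high in the lower layers and drops sharply in the top layers) is easier to see as a line plot; a minimal sketch using the dictionaries produced by the loop above (only the DVD-dataset variables are defined in this notebook state):
rsa_mmd_df = pd.DataFrame({
    "layer": list(pretrained_books_dvd_RSA.keys()),
    "RSA": list(pretrained_books_dvd_RSA.values()),
    "MMD": list(pretrained_books_dvd_MMD.values()),
})
px.line(rsa_mmd_df, x="layer", y="RSA",
        title="RSA per layer: pretrained vs. books fine-tuned BERT (DVD dataset)").show()
px.line(rsa_mmd_df, x="layer", y="MMD",
        title="MMD per layer: pretrained vs. books fine-tuned BERT (DVD dataset)").show()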