This notebook contains the code samples from the video below, which is part of the Hugging Face course.
#@title
from IPython.display import HTML
HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/OATCgQtNX2o?rel=0&controls=0&showinfo=0" frameborder="0" allowfullscreen></iframe>')
Install the Transformers, Datasets, and FAISS libraries to run this notebook.
!pip install datasets transformers[sentencepiece] faiss-cpu
import torch
from transformers import AutoTokenizer, AutoModel
sentences = [
    "I took my dog for a walk",
    "Today is going to rain",
    "I took my cat for a walk",
]
model_ckpt = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)
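Note that AutoModel loads the bare Transformer encoder, so its output is one hidden state per token rather than a single vector per sentence; we will pool those token embeddings into sentence embeddings below.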
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    model_output = model(**encoded_input)
token_embeddings = model_output.last_hidden_state
print(f"Token embeddings shape: {token_embeddings.size()}")
import torch.nn.functional as F
def mean_pooling(model_output, attention_mask):
    # Average the token embeddings, using the attention mask to ignore padding tokens
    token_embeddings = model_output.last_hidden_state
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )
sentence_embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
# L2-normalize the embeddings so each sentence vector has unit length
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
print(f"Sentence embeddings shape: {sentence_embeddings.size()}")
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
sentence_embeddings = sentence_embeddings.detach().numpy()
scores = np.zeros((sentence_embeddings.shape[0], sentence_embeddings.shape[0]))
for idx in range(sentence_embeddings.shape[0]):
    scores[idx, :] = cosine_similarity([sentence_embeddings[idx]], sentence_embeddings)[0]
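As a quick sanity check (an illustrative addition, not part of the video), we can mask out the diagonal of self-similarities and look up the most similar pair; the two sentences about taking a pet for a walk should score highest.
np.fill_diagonal(scores, 0)  # ignore each sentence's similarity with itself
i, j = np.unravel_index(np.argmax(scores), scores.shape)
print(f"Most similar pair: {sentences[i]!r} / {sentences[j]!r}")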
from datasets import load_dataset
squad = load_dataset("squad", split="validation").shuffle(seed=42).select(range(100))
def get_embeddings(text_list):
    # Tokenize a batch of texts, run the model, and mean-pool into one vector per text
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    with torch.no_grad():
        model_output = model(**encoded_input)
    return mean_pooling(model_output, encoded_input["attention_mask"])
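A quick check that the helper behaves as expected (the example string here is made up):
embedding = get_embeddings(["How do sentence embeddings work?"])
print(embedding.shape)  # expected: torch.Size([1, 384])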
squad_with_embeddings = squad.map(
    lambda x: {"embeddings": get_embeddings(x["context"]).cpu().numpy()[0]}
)
squad_with_embeddings.add_faiss_index(column="embeddings")
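add_faiss_index builds a FAISS index over the embeddings column, which lets us run fast nearest-neighbor searches over the corpus.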
question = "Who headlined the halftime show for Super Bowl 50?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
scores, samples = squad_with_embeddings.get_nearest_examples(
    "embeddings", question_embedding, k=3
)
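To inspect the matches, we can print each retrieved context alongside its score (a minimal sketch; get_nearest_examples returns a NumPy array of scores and a dict mapping column names to lists):
for score, title, context in zip(scores, samples["title"], samples["context"]):
    print(f"Score: {score:.3f} | Title: {title}")
    print(context[:200])
    print("=" * 50)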