#!/usr/bin/env python
# coding: utf-8

# In[70]:

import transformers
import pandas as pd
import tensorflow as tf
from huggingface_hub import notebook_login
import os

transformers.logging.set_verbosity_error()

# In[71]:

notebook_login()

#

A Whirlwind Tour of the 🤗 Hugging Face Ecosystem

#

Christopher Akiki

#

Figures in these slides are reproduced under the Apache License from *Natural Language Processing with Transformers*, published by O'Reilly Media, Inc.

#

🤗 Pipelines

#

# In[1]:

from transformers import pipeline
from transformers.pipelines import get_supported_tasks

# In[5]:

print(get_supported_tasks())

# #
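# As a quick aside, a pipeline can also be built from a task name alone,
# without naming a model. A minimal sketch (the default checkpoint is chosen
# by the library and may change between versions):

# In[ ]:

# Task-only construction: transformers falls back to a default checkpoint
# for the task and logs a warning recommending that you pin a model explicitly.
default_classifier = pipeline("sentiment-analysis", device=-1)
print(default_classifier("What a wonderful concert!"))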
# In[7]: text = """One of the best orchestra in the world. I came to Leipzig\ mainly to have one experience with Gewanhaus Leipzig Orchestra. Under the baton of Maestro Andris Nelsons, Bruckner symphony #8 was so affection. The acustic and layout of the concert hall is nice.""" # # Sentiment Analysis # In[11]: p = pipeline("text-classification", model='distilbert-base-uncased-finetuned-sst-2-english', device=-1) # In[9]: outputs = p(text) outputs[0] # # Named-Entity Recognition # In[12]: p = pipeline("ner", aggregation_strategy="simple", model="dbmdz/bert-large-cased-finetuned-conll03-english", device=-1) # In[13]: outputs = p(text) pd.DataFrame(outputs) # # Question Answering # In[23]: p = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", device=-1) # In[34]: questions = ['What city did I visit?', 'Why did I visit Leipzig?', 'What music did the orchestra play?', 'Who lead the orchestra?'] # In[42]: outputs = p(question=questions, context=text) with pd.option_context('display.max_colwidth', -1): display(pd.DataFrame(zip(questions, [o['answer'] for o in outputs]), columns=['Question', 'Answer'])) # # Translation # In[20]: p = pipeline("translation_en_to_de", model="Helsinki-NLP/opus-mt-en-de", device=-1) # In[21]: outputs = p(text, clean_up_tokenization_spaces=True) print(outputs[0]['translation_text']) #

🤗 Tokenizers

#
# In[43]:

import nltk
nltk.download('gutenberg')

# In[44]:

print(nltk.corpus.gutenberg.fileids())

# In[45]:

moby_dick_raw = nltk.corpus.gutenberg.raw('melville-moby_dick.txt')

# In[46]:

size = len(moby_dick_raw.encode())
print(f"{size/1024**2:.2f} MiB")

# In[47]:

from tokenizers import Tokenizer, normalizers, pre_tokenizers, processors
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer

# In[48]:

unk_token = "[UNK]"
pad_token = "[PAD]"
cls_token = "[CLS]"
sep_token = "[SEP]"
mask_token = "[MASK]"
special_tokens = [unk_token, pad_token, cls_token, sep_token, mask_token]
vocab_size = 6_000

# # WordPiece Tokenizer

# In[49]:

custom_tokenizer = Tokenizer(WordPiece(unk_token=unk_token))

# # Sequence of Normalizers

# In[50]:

custom_normalizer = normalizers.Sequence(
    [normalizers.NFKD(), normalizers.Lowercase(), normalizers.StripAccents()]
)

# # Sequence of Pre-tokenizers

# In[51]:

custom_pre_tokenizer = pre_tokenizers.Sequence(
    [pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
)

# # WordPiece Trainer

# In[52]:

custom_trainer = WordPieceTrainer(vocab_size=vocab_size,
                                  special_tokens=special_tokens,
                                  show_progress=False)

# In[53]:

custom_tokenizer.normalizer = custom_normalizer
custom_tokenizer.pre_tokenizer = custom_pre_tokenizer

# In[54]:

get_ipython().run_cell_magic('time', '', 'custom_tokenizer.train_from_iterator([moby_dick_raw], trainer=custom_trainer)\n')

# In[55]:

custom_tokenizer.get_vocab_size()

# In[56]:

encoding = custom_tokenizer.encode("Let us test this tokenizer")
print(encoding.tokens)

# In[58]:

cls_token_id = custom_tokenizer.token_to_id(cls_token)
sep_token_id = custom_tokenizer.token_to_id(sep_token)
custom_post_processor = processors.TemplateProcessing(
    single=f"{cls_token}:0 $A:0 {sep_token}:0",
    pair=f"{cls_token}:0 $A:0 {sep_token}:0 $B:1 {sep_token}:1",
    special_tokens=[(cls_token, cls_token_id), (sep_token, sep_token_id)],
)
custom_tokenizer.post_processor = custom_post_processor

# In[59]:

encoding = custom_tokenizer.encode("Let us test this tokenizer")
print(encoding.tokens)

# In[60]:

encoding = custom_tokenizer.encode("This is the first sentence", "This is sentence number 2")
print(encoding.tokens)
print(encoding.ids)
print(encoding.type_ids)

# # Using our custom tokenizer with a model

# In[61]:

from transformers import PreTrainedTokenizerFast

model_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=custom_tokenizer,
    unk_token=unk_token,
    pad_token=pad_token,
    cls_token=cls_token,
    sep_token=sep_token,
    mask_token=mask_token,
)

# In[62]:

text_batch = ["To be or not to be.", "It was the best of times.", "Call me Ishmael."]

# In[63]:

model_tokenizer(text_batch, padding=True, return_tensors="tf")

#
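# A trained tokenizer is a single JSON file, so it is easy to persist and
# reload. A minimal sketch (the file name is arbitrary):

# In[ ]:

# Serialize the trained tokenizer and reload it from disk;
# "moby_wordpiece.json" is an illustrative path, not a fixed convention.
custom_tokenizer.save("moby_wordpiece.json")
reloaded_tok = Tokenizer.from_file("moby_wordpiece.json")
print(reloaded_tok.encode("Call me Ishmael.").tokens)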

🤗 Datasets

# # Apache Arrow Backend ➡️ Low RAM Use

#
#
# ```python
# import os; import psutil; import timeit
# from datasets import load_dataset
#
# mem_before = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)
# wiki = load_dataset("wikipedia", "20220301.en", split="train")
# mem_after = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)
#
# print(f"RAM memory used: {(mem_after - mem_before)} MB")
# ```
#
# *****RAM memory used: 50 MB*****
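# If even the Arrow cache on disk is too much, the same dataset can be
# consumed lazily. A minimal sketch, assuming your datasets version supports
# streaming for this Wikipedia dump:

# In[ ]:

# streaming=True returns an IterableDataset: records are fetched on the fly
# and nothing is materialized on disk.
from datasets import load_dataset

wiki_stream = load_dataset("wikipedia", "20220301.en", split="train", streaming=True)
print(next(iter(wiki_stream))["title"])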
# # Apache Arrow Backend ➡️ Fast Iteration

#
# ```python
# s = """batch_size = 1000
# for i in range(0, len(wiki), batch_size):
#     batch = wiki[i:i + batch_size]
# """
# time = timeit.timeit(stmt=s, number=1, globals=globals())
# print(f"Time to iterate over the {wiki.dataset_size >> 30}GB dataset: {time:.1f} sec, "
#       f"i.e. {float(wiki.dataset_size >> 27)/time:.1f} Gb/s")
# ```
#
# *****Time to iterate over the 18 GB dataset: 70.5 sec, i.e. 2.1 Gb/s*****

# In[12]:

from datasets import list_datasets, load_dataset

# In[13]:

all_datasets = list_datasets()

# In[14]:

len(all_datasets)

# In[15]:

[d for d in all_datasets if "emotion" in d]

# In[21]:

emotions = load_dataset("emotion")
emotions

# In[32]:

print(emotions['train'].info.description)
print(125 * "*")
print(emotions['train'].citation)

# In[33]:

train_ds = emotions["train"]
train_ds

# In[38]:

train_ds.features['label']

# In[40]:

train_ds.features['label'].int2str(5)

# In[41]:

len(train_ds)

# In[44]:

train_ds[11]

# In[45]:

train_ds[:10]

# In[46]:

train_ds[:10]['text']

# In[47]:

def compute_tweet_length(row):
    return {"tweet_length": len(row['text'].split())}

# In[48]:

train_ds = train_ds.map(compute_tweet_length, load_from_cache_file=False)

# In[ ]:

train_ds.push_to_hub('emotion-with-length')

# In[49]:

train_ds.filter(lambda row: row['tweet_length'] < 25)

# In[52]:

train_ds.sort("tweet_length")[:10]

# In[54]:

def batched_compute_tweet_length(batch_of_rows):
    return {"tweet_length": [len(text.split()) for text in batch_of_rows['text']]}

# In[55]:

train_ds.map(batched_compute_tweet_length, batched=True, batch_size=2000,
             load_from_cache_file=False)

# In[56]:

get_ipython().run_line_magic('time', 'train_ds.map(compute_tweet_length, load_from_cache_file=False)')

# In[57]:

get_ipython().run_line_magic('time', 'train_ds.map(batched_compute_tweet_length, batched=True, batch_size=2000, load_from_cache_file=False)')

# In[58]:

train_ds.column_names

# In[59]:

train_ds = train_ds.remove_columns('tweet_length')
train_ds

# # Loading your own files

#
| Data format        | Loading script | Example                                                  |
|--------------------|----------------|----------------------------------------------------------|
| CSV & TSV          | `csv`          | `load_dataset("csv", data_files="my_file.csv")`          |
| Text files         | `text`         | `load_dataset("text", data_files="my_file.txt")`         |
| JSON & JSON Lines  | `json`         | `load_dataset("json", data_files="my_file.jsonl")`       |
| Pickled DataFrames | `pandas`       | `load_dataset("pandas", data_files="my_dataframe.pkl")`  |
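# A minimal sketch of the first row of the table above (the file name is
# purely illustrative):

# In[ ]:

# Loading a hypothetical local CSV file; without an explicit split argument,
# load_dataset returns a DatasetDict with a single "train" split.
local_ds = load_dataset("csv", data_files="my_file.csv")
print(local_ds)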
# In[60]:

import pandas as pd

# In[61]:

emotions.set_format(type="pandas")
emotions_df = emotions['train'][:]

# In[62]:

emotions_df['label_name'] = emotions_df['label'].apply(
    lambda x: train_ds.features['label'].int2str(x))

# In[68]:

emotions_df.head(10)

# In[65]:

emotions_df['label_name'].value_counts()

# In[66]:

emotions_df['text'].str.split().apply(len).describe()

# In[ ]:

emotions.reset_format()

# In[ ]:

train_ds

#
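# The conversion works in the other direction too. A minimal sketch, assuming
# emotions_df from above is still in scope:

# In[ ]:

# Build an Arrow-backed Dataset straight from the pandas DataFrame.
from datasets import Dataset

roundtrip_ds = Dataset.from_pandas(emotions_df)
print(roundtrip_ds)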

🤗 Transformers

#
# In[ ]:

from transformers import (DistilBertTokenizer,
                          TFDistilBertForSequenceClassification,
                          DataCollatorWithPadding)

model_checkpoint = "distilbert-base-uncased"

#
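# The explicit DistilBERT classes above also have checkpoint-agnostic Auto*
# counterparts that resolve the architecture from the checkpoint's config. A
# minimal sketch (num_labels=6 matches the six emotion classes used below):

# In[ ]:

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

auto_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
auto_model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint,
                                                                  num_labels=6)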
# # Transfer Learning via Feature Extraction (Homework)

#

# # Transfer Learning via Fine-tuning

#
# In[ ]:

tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)

# In[ ]:

tokenizer(["This is a test", "This is another test", "cat"],
          return_tensors="tf", padding=True, truncation=True)

# In[ ]:

def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)

# In[ ]:

tokenized_train_ds = train_ds.map(tokenize, batched=True, batch_size=None)
tokenized_val_ds = emotions['validation'].map(tokenize, batched=True, batch_size=None)
tokenized_test_ds = emotions['test'].map(tokenize, batched=True, batch_size=None)

# In[ ]:

BATCH_SIZE = 64
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
tf_train_dataset = tokenized_train_ds.to_tf_dataset(columns=tokenizer.model_input_names,
                                                    label_cols=['label'],
                                                    shuffle=True,
                                                    batch_size=BATCH_SIZE,
                                                    collate_fn=data_collator)
tf_val_dataset = tokenized_val_ds.to_tf_dataset(columns=tokenizer.model_input_names,
                                                label_cols=['label'],
                                                shuffle=False,
                                                batch_size=BATCH_SIZE,
                                                collate_fn=data_collator)
tf_test_dataset = tokenized_test_ds.to_tf_dataset(columns=tokenizer.model_input_names,
                                                  label_cols=['label'],
                                                  shuffle=False,
                                                  batch_size=BATCH_SIZE,
                                                  collate_fn=data_collator)

# In[ ]:

for i in tf_train_dataset.take(1):
    print(i)

# In[ ]:

model = TFDistilBertForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=train_ds.features['label'].num_classes)

# In[64]:

get_ipython().run_line_magic('load_ext', 'tensorboard')
get_ipython().run_line_magic('tensorboard', '--logdir /tf/model/logs --host 0.0.0.0')

# In[ ]:

from transformers.keras_callbacks import PushToHubCallback
from tensorflow.keras.callbacks import TensorBoard

tensorboard_callback = TensorBoard(log_dir="./model/logs")
push_to_hub_callback = PushToHubCallback(
    output_dir="./model",
    tokenizer=tokenizer,
    hub_model_id=f"{model_checkpoint}-finetuned-tweet-sentiment",
)
callbacks = [tensorboard_callback, push_to_hub_callback]

# In[ ]:

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=tf.metrics.SparseCategoricalAccuracy())

# In[ ]:

history = model.fit(tf_train_dataset, validation_data=tf_val_dataset,
                    epochs=5, callbacks=callbacks)

# In[ ]:

_, accuracy = model.evaluate(tf_test_dataset)
print(f"Test accuracy: {round(accuracy * 100, 2)}%")

# In[ ]:

p = pipeline("text-classification",
             model='cakiki/distilbert-base-uncased-finetuned-tweet-sentiment',
             device=-1)

# In[ ]:

p("I am terrified")

# In[ ]:

emotions['train'].features

# In[ ]:

import gradio as gr

gr.Interface.load("huggingface/cakiki/distilbert-base-uncased-finetuned-tweet-sentiment").launch(share=True);

#
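# Besides pushing to the Hub via the callback, the fine-tuned model and
# tokenizer can also be saved and reloaded locally. A minimal sketch (the
# directory name is arbitrary):

# In[ ]:

# "./my-finetuned-model" is an illustrative local path.
model.save_pretrained("./my-finetuned-model")
tokenizer.save_pretrained("./my-finetuned-model")

reloaded_model = TFDistilBertForSequenceClassification.from_pretrained("./my-finetuned-model")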

(Re)sources

# - https://github.com/nlp-with-transformers/notebooks
# - https://huggingface.co/docs
# - https://github.com/huggingface/course / https://github.com/huggingface/notebooks
# - https://github.com/NielsRogge/Transformers-Tutorials