A Whirlwind Tour of the 🤗 Hugging Face Ecosystem



Christopher Akiki



Figures in these slides are reproduced under the Apache License from "Natural Language Processing with Transformers", published by O'Reilly Media, Inc.


🤗 Pipelines


# 🤗 Pipelines demo: run one sample review through several ready-made
# `pipeline(...)` tasks (sentiment, NER, question answering, translation).

# In[1]:
import pandas as pd  # used by the DataFrame displays further down
from transformers import pipeline
from transformers.pipelines import get_supported_tasks

# In[5]:
print(get_supported_tasks())

# In[7]:
# Sample review fed to every pipeline below. It is assembled from adjacent
# literals so that no backslash line-continuation can leak a stray "\" into
# the text the models see. Spelling is left exactly as in the review.
text = (
    "One of the best orchestra in the world. I came to Leipzig "
    "mainly to have one experience with Gewanhaus Leipzig Orchestra. "
    "Under the baton of Maestro Andris Nelsons, Bruckner symphony #8 "
    "was so affection. The acustic and layout of the concert hall is nice."
)

# # Sentiment Analysis

# In[11]:
p = pipeline(
    "text-classification",
    model='distilbert-base-uncased-finetuned-sst-2-english',
    device=-1,
)

# In[9]:
outputs = p(text)
outputs[0]

# # Named-Entity Recognition

# In[12]:
p = pipeline(
    "ner",
    aggregation_strategy="simple",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    device=-1,
)

# In[13]:
outputs = p(text)
pd.DataFrame(outputs)

# # Question Answering

# In[23]:
p = pipeline(
    "question-answering",
    model="distilbert-base-cased-distilled-squad",
    device=-1,
)

# In[34]:
questions = [
    'What city did I visit?',
    'Why did I visit Leipzig?',
    'What music did the orchestra play?',
    'Who lead the orchestra?',
]

# In[42]:
# Every question is asked against the same context.
outputs = p(question=questions, context=text)
answers = [o['answer'] for o in outputs]
# `None` disables column truncation; the old `-1` sentinel is deprecated since
# pandas 1.0 and rejected by newer releases.
with pd.option_context('display.max_colwidth', None):
    # `display` is the notebook built-in (this script runs under IPython).
    display(pd.DataFrame(zip(questions, answers), columns=['Question', 'Answer']))

# # Translation

# In[20]:
p = pipeline("translation_en_to_de", model="Helsinki-NLP/opus-mt-en-de", device=-1)

# In[21]:
outputs = p(text, clean_up_tokenization_spaces=True)
print(outputs[0]['translation_text'])

🤗 Tokenizers

# 🤗 Tokenizers demo: train a small WordPiece tokenizer on "Moby Dick" and wrap
# it so it can be used like any other model tokenizer.

# In[43]:
import nltk

nltk.download('gutenberg')

# In[44]:
print(nltk.corpus.gutenberg.fileids())

# In[45]:
moby_dick_raw = nltk.corpus.gutenberg.raw('melville-moby_dick.txt')

# In[46]:
# Size of the encoded corpus, reported in MiB.
size = len(moby_dick_raw.encode())
print(f"{size/1024**2:.2f} MiB")

# In[47]:
from tokenizers import Tokenizer, normalizers, pre_tokenizers, processors
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer

# In[48]:
unk_token = "[UNK]"
pad_token = "[PAD]"
cls_token = "[CLS]"
sep_token = "[SEP]"
mask_token = "[MASK]"
special_tokens = [unk_token, pad_token, cls_token, sep_token, mask_token]
vocab_size = 6_000

# # WordPiece Tokenizer

# In[49]:
custom_tokenizer = Tokenizer(WordPiece(unk_token=unk_token))

# # Sequence of Normalizers

# In[50]:
custom_normalizer = normalizers.Sequence(
    [
        normalizers.NFKD(),
        normalizers.Lowercase(),
        normalizers.StripAccents(),
    ]
)

# # Sequence of Pretokenizers

# In[51]:
custom_pre_tokenizer = pre_tokenizers.Sequence(
    [
        pre_tokenizers.WhitespaceSplit(),
        pre_tokenizers.Punctuation(),
    ]
)

# # WordPiece Trainer

# In[52]:
custom_trainer = WordPieceTrainer(
    vocab_size=vocab_size,
    special_tokens=special_tokens,
    show_progress=False,
)

# In[53]:
# Attach both preprocessing stages before training.
custom_tokenizer.pre_tokenizer = custom_pre_tokenizer
custom_tokenizer.normalizer = custom_normalizer

# In[54]:
# `%%time` cell: trains the tokenizer on the single raw-text document.
get_ipython().run_cell_magic('time', '', 'custom_tokenizer.train_from_iterator([moby_dick_raw], trainer=custom_trainer)\n')

# In[55]:
custom_tokenizer.get_vocab_size()

# In[56]:
encoding = custom_tokenizer.encode("Let us test this tokenizer")
print(encoding.tokens)

# In[58]:
# Template post-processor: wraps a single sequence as "[CLS] A [SEP]" and a
# pair as "[CLS] A [SEP] B [SEP]", with type id 1 on the second segment.
cls_token_id = custom_tokenizer.token_to_id(cls_token)
sep_token_id = custom_tokenizer.token_to_id(sep_token)
single_template = f"{cls_token}:0 $A:0 {sep_token}:0"
pair_template = f"{cls_token}:0 $A:0 {sep_token}:0 $B:1 {sep_token}:1"
custom_post_processor = processors.TemplateProcessing(
    single=single_template,
    pair=pair_template,
    special_tokens=[(cls_token, cls_token_id), (sep_token, sep_token_id)],
)
custom_tokenizer.post_processor = custom_post_processor

# In[59]:
encoding = custom_tokenizer.encode("Let us test this tokenizer")
print(encoding.tokens)

# In[60]:
encoding = custom_tokenizer.encode("This is the first sentence", "This is sentence number 2")
for encoded_view in (encoding.tokens, encoding.ids, encoding.type_ids):
    print(encoded_view)

# # Using our custom tokenizer with a model

# In[61]:
from transformers import PreTrainedTokenizerFast

model_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=custom_tokenizer,
    unk_token=unk_token,
    pad_token=pad_token,
    cls_token=cls_token,
    sep_token=sep_token,
    mask_token=mask_token,
)

# In[62]:
text_batch = [
    "To be or not to be.",
    "It was the best of times.",
    "Call me Ishmael.",
]

# In[63]:
model_tokenizer(text_batch, padding=True, return_tensors="tf")

🤗 Datasets

| Data format | Loading script | Example |
| --- | --- | --- |
| CSV & TSV | csv | load_dataset("csv", data_files="my_file.csv") |
| Text files | text | load_dataset("text", data_files="my_file.txt") |
| JSON & JSON Lines | json | load_dataset("json", data_files="my_file.jsonl") |
| Pickled DataFrames | pandas | load_dataset("pandas", data_files="my_dataframe.pkl") |
# 🤗 Datasets demo: view the training split as a pandas DataFrame and inspect
# the label distribution and text lengths.
#
# NOTE(review): `emotions` and `train_ds` are not defined in any cell shown
# here; presumably an earlier cell loads the dataset and sets
# `train_ds = emotions['train']` — confirm before running this section alone.

# In[60]:
import pandas as pd

# In[61]:
# Switch the output format to pandas so that slicing yields a DataFrame.
emotions.set_format(type="pandas")
emotions_df = emotions['train'][:]

# In[62]:
# Add a readable label name next to each integer class id.
label_feature = train_ds.features['label']
emotions_df['label_name'] = emotions_df['label'].apply(
    lambda label_id: label_feature.int2str(label_id)
)

# In[68]:
emotions_df.head(10)

# In[65]:
emotions_df['label_name'].value_counts()

# In[66]:
# Summary statistics of the number of whitespace-separated words per text.
words_per_text = emotions_df['text'].str.split().apply(len)
words_per_text.describe()

# In[ ]:
# Undo the pandas output format set above.
emotions.reset_format()

# In[ ]:
train_ds

🤗 Transformers

# 🤗 Transformers demo: fine-tune DistilBERT for sequence classification with
# Keras, push the result to the Hub, and serve it through a pipeline / Gradio.
#
# NOTE(review): `emotions` and `train_ds` are not defined in any cell shown
# here; presumably an earlier cell loads the dataset and sets
# `train_ds = emotions['train']` — confirm before running this section alone.

# In[ ]:
import tensorflow as tf  # `tf.keras` / `tf.metrics` are needed by `model.compile` below
from transformers import (
    DataCollatorWithPadding,
    DistilBertTokenizer,
    TFDistilBertForSequenceClassification,
    pipeline,
)

model_checkpoint = "distilbert-base-uncased"

# # Transfer Learning via Finetuning

# In[ ]:
tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)

# In[ ]:
tokenizer(
    ["This is a test", "This is another test", "cat"],
    return_tensors="tf",
    padding=True,
    truncation=True,
)


# In[ ]:
def tokenize(batch):
    """Tokenize the 'text' column of *batch* with padding and truncation.

    Args:
        batch: Mapping with a 'text' entry holding the texts to encode.

    Returns:
        Whatever `tokenizer(...)` returns for those texts.
    """
    return tokenizer(batch['text'], padding=True, truncation=True)


# In[ ]:
tokenized_train_ds = train_ds.map(tokenize, batched=True, batch_size=None)
tokenized_val_ds = emotions['validation'].map(tokenize, batched=True, batch_size=None)
tokenized_test_ds = emotions['test'].map(tokenize, batched=True, batch_size=None)

# In[ ]:
BATCH_SIZE = 64
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")


def _as_tf_dataset(tokenized_ds, *, shuffle):
    """Convert a tokenized split into a batched TensorFlow dataset.

    All three splits share the same columns, label column, batch size and
    collator; only shuffling differs between training and evaluation.

    Args:
        tokenized_ds: Split that has already been mapped through `tokenize`.
        shuffle: Passed through to `to_tf_dataset`.

    Returns:
        The result of `tokenized_ds.to_tf_dataset(...)`.
    """
    return tokenized_ds.to_tf_dataset(
        columns=tokenizer.model_input_names,
        label_cols=['label'],
        shuffle=shuffle,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator,
    )


tf_train_dataset = _as_tf_dataset(tokenized_train_ds, shuffle=True)
tf_val_dataset = _as_tf_dataset(tokenized_val_ds, shuffle=False)
tf_test_dataset = _as_tf_dataset(tokenized_test_ds, shuffle=False)

# In[ ]:
# Peek at one training batch.
for batch in tf_train_dataset.take(1):
    print(batch)

# In[ ]:
model = TFDistilBertForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=train_ds.features['label'].num_classes,
)

# In[64]:
get_ipython().run_line_magic('load_ext', 'tensorboard')
# NOTE(review): `--host` is passed without a value, and the logdir
# (/tf/model/logs) differs from the callback's "./model/logs" unless the
# working directory is /tf — confirm the intended host and path.
get_ipython().run_line_magic('tensorboard', '--logdir /tf/model/logs --host')

# In[ ]:
from transformers.keras_callbacks import PushToHubCallback
from tensorflow.keras.callbacks import TensorBoard

tensorboard_callback = TensorBoard(log_dir="./model/logs")
push_to_hub_callback = PushToHubCallback(
    output_dir="./model",
    tokenizer=tokenizer,
    hub_model_id=f"{model_checkpoint}-finetuned-tweet-sentiment",
)
callbacks = [tensorboard_callback, push_to_hub_callback]

# In[ ]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    # The loss is configured with from_logits=True.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)

# In[ ]:
history = model.fit(
    tf_train_dataset,
    validation_data=tf_val_dataset,
    epochs=5,
    callbacks=callbacks,
)

# In[ ]:
_, accuracy = model.evaluate(tf_test_dataset)
print(f"Test accuracy: {round(accuracy * 100, 2)}%")

# In[ ]:
p = pipeline(
    "text-classification",
    model='cakiki/distilbert-base-uncased-finetuned-tweet-sentiment',
    device=-1,
)

# In[ ]:
p("I am terrified")

# In[ ]:
emotions['train'].features

# In[ ]:
import gradio as gr

# Trailing ";" keeps the notebook from echoing the value returned by launch().
gr.Interface.load("huggingface/cakiki/distilbert-base-uncased-finetuned-tweet-sentiment").launch(share=True);


