#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Reveal.js
from notebook.services.config import ConfigManager
cm = ConfigManager()
cm.update('livereveal', {
    'theme': 'white',
    'transition': 'none',
    'controls': 'false',
    'progress': 'true',
})


# In[2]:


get_ipython().run_cell_magic('capture', '', '%load_ext autoreload\n%autoreload 2\n# %cd ..\nimport sys\nsys.path.append("..")\nimport statnlpbook.util as util\nutil.execute_notebook(\'language_models.ipynb\')\n')


# In[3]:


get_ipython().run_cell_magic('html', '', '\n\n')
# In[4]:


from IPython.display import Image
import random


# # Contextualised Word Representations
# 
# 
# ## What makes a good word representation? ##
# 
# 1. Representations are **distinct**
# 2. **Similar** words have **similar** representations

# ## What does this mean? ##
# 
# 
# * "Yesterday I saw a bass ..."

# In[5]:


Image(url='../img/bass_1.jpg'+'?'+str(random.random()), width=300)


# In[6]:


Image(url='../img/bass_2.svg'+'?'+str(random.random()), width=100)


# # Contextualised Representations
# 
# 
# * Static embeddings (e.g., [word2vec](dl-representations_simple.ipynb)) have one representation per word *type*, regardless of context
# 
# * Contextualised representations use the context surrounding the word *token*

# ## Contextualised Representations Example ##
# 
# 
# * a) "Yesterday I saw a bass swimming in the lake"

# In[7]:


Image(url='../img/bass_1.jpg'+'?'+str(random.random()), width=300)


# * b) "Yesterday I saw a bass in the music shop"

# In[8]:


Image(url='../img/bass_2.svg'+'?'+str(random.random()), width=100)


# ## Contextualised Representations Example ##
# 
# 
# * a) "Yesterday I saw a bass swimming in the lake".
# * b) "Yesterday I saw a bass in the music shop".

# In[9]:


Image(url='../img/bass_visualisation.jpg'+'?'+str(random.random()), width=500)


# ## What makes a good representation? ##
# 
# 1. Representations are **distinct**
# 2. **Similar** words have **similar** representations
# 
# Additional criterion:
# 
# 3. Representations take **context** into account
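# To make criterion 3 concrete, the next cell sketches why a static lookup table cannot satisfy it.
# The vocabulary and vectors below are made up for illustration (random vectors, not trained word2vec):
# a static table maps each word *type* to exactly one vector, so both occurrences of "bass" receive
# the same representation, whatever the surrounding words are.

# In[ ]:


import numpy as np

rng = np.random.default_rng(0)
vocab = ["yesterday", "i", "saw", "a", "bass", "swimming", "in", "the", "lake", "music", "shop"]
static_embeddings = {word: rng.normal(size=4) for word in vocab}  # one vector per word type

sentence_a = "yesterday i saw a bass swimming in the lake".split()
sentence_b = "yesterday i saw a bass in the music shop".split()

bass_a = static_embeddings["bass"]  # vector for "bass" in sentence a
bass_b = static_embeddings["bass"]  # vector for "bass" in sentence b
print(np.allclose(bass_a, bass_b))  # True: the context makes no difference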
# ## How to train contextualised representations ##
# 
# Basically like word2vec: predict a word from its context (or vice versa).
# 
# We cannot just use a lookup table (i.e., an embedding matrix) any more.
# 
# Train a network with the sequence as input! Does this remind you of anything?

# The hidden state of an RNN LM is a contextualised word representation!

# In[9]:


Image(url='../img/elmo_1.png'+'?'+str(random.random()), width=800)


# "Let's stick to improvisation in this skit"
# 
# Image credit: http://jalammar.github.io/illustrated-bert/

# ## Bidirectional RNN LM ##
# 
# An RNN (or LSTM) LM only considers the preceding context.
# 
# ELMo (Embeddings from Language Models) is based on a biLM: a *bidirectional language model* ([Peters et al., 2018](https://www.aclweb.org/anthology/N18-1202/)).

# In[6]:


Image(url='../img/elmo_2.png'+'?'+str(random.random()), width=1200)


# In[10]:


Image(url='../img/elmo_3.png'+'?'+str(random.random()), width=1200)
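# The next cell is a minimal sketch of this idea, not ELMo itself: it runs a bidirectional LSTM
# over two sentences and reads off one hidden state per token. All names, sizes and weights are
# toy placeholders (untrained); real ELMo additionally uses character convolutions, two LSTM
# layers, and a learned weighting of the layers.

# In[ ]:


import torch
import torch.nn as nn

torch.manual_seed(0)
vocab = {w: i for i, w in enumerate(
    "yesterday i saw a bass swimming in the lake music shop".split())}

embed = nn.Embedding(len(vocab), 16)             # static input embeddings (one per type)
bilstm = nn.LSTM(input_size=16, hidden_size=32,
                 batch_first=True, bidirectional=True)

def contextualise(sentence):
    """Return one contextualised vector per token: the biLSTM hidden states."""
    ids = torch.tensor([[vocab[w] for w in sentence.split()]])
    hidden_states, _ = bilstm(embed(ids))        # shape: (1, seq_len, 2 * 32)
    return hidden_states[0]

repr_a = contextualise("yesterday i saw a bass swimming in the lake")
repr_b = contextualise("yesterday i saw a bass in the music shop")

# "bass" is token index 4 in both sentences, but its representation now differs,
# because the surrounding words differ.
print(torch.allclose(repr_a[4], repr_b[4]))      # False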
# [ucph.page.link/bilm](https://ucph.page.link/bilm)
# ([Responses](https://docs.google.com/forms/d/1BimPo-S12XWt1qOJLXBTIGjRpt-bVW8H7hmT3j0iRRQ/edit#responses))

# ### Solution
# 
# To prevent a word from being used to predict itself, while still allowing the model to consider both preceding and following words.

# ## Problem: Long-Term Dependencies ##
# 
# LSTMs have *longer-term* memory than plain RNNs, but they still forget.
# 
# Solution: *transformers*! ([Vaswani et al., 2017](https://arxiv.org/abs/1706.03762))
# 
# * As of 2022, virtually all state-of-the-art LMs are transformers.
# * Yes, also GPT-3
# 
# (See the minimal self-attention sketch at the end of this notebook.)

# In[15]:


Image(url='../img/transformers.png'+'?'+str(random.random()), width=400)


# # Summary
# 
# 
# * Static word embeddings do not differ depending on context
# * Contextualised representations are dynamic

# # Additional Reading
# 
# 
# + [Jurafsky & Martin Chapter 11](https://web.stanford.edu/~jurafsky/slp3/11.pdf)

# In[ ]:
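# As referenced above, here is a minimal sketch of scaled dot-product self-attention, the core
# operation of the transformer. Unlike an RNN, every position attends directly to every other
# position, so information does not have to survive a long chain of recurrent updates. The
# dimensions, projections and random inputs below are untrained placeholders for illustration only;
# a real transformer adds multiple heads, positional encodings, feed-forward layers, and residuals.

# In[ ]:


import math
import torch
import torch.nn as nn

torch.manual_seed(0)
d_model = 16
seq_len = 9                                      # e.g. one input vector per token

x = torch.randn(seq_len, d_model)                # stand-in for input token vectors
W_q, W_k, W_v = (nn.Linear(d_model, d_model, bias=False) for _ in range(3))

Q, K, V = W_q(x), W_k(x), W_v(x)
scores = Q @ K.T / math.sqrt(d_model)            # every token scores every other token
weights = scores.softmax(dim=-1)                 # attention weights; rows sum to 1
contextualised = weights @ V                     # context-dependent token vectors

print(weights.shape, contextualised.shape)       # (9, 9) and (9, 16)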