war_and_peace = """
— Eh bien, mon prince. Gênes et Lucques ne sont plus que des apanages, des поместья, de la famille Buonaparte. Non, je vous préviens, que si vous ne me dites pas, que nous avons la guerre, si vous vous permettez encore de pallier toutes les infamies, toutes les atrocités de cet Antichrist (ma parole, j’y crois) — je ne vous connais plus, vous n’êtes plus mon ami, vous n’êtes plus мой верный раб, comme vous dites. Ну, здравствуйте, здравствуйте. Je vois que je vous fais peur, садитесь и рассказывайте.
Так говорила в июле 1805 года известная Анна Павловна Шерер, фрейлина и приближенная императрицы Марии Феодоровны, встречая важного и чиновного князя Василия, первого приехавшего на ее вечер. Анна Павловна кашляла несколько дней, у нее был грипп, как она говорила (грипп был тогда новое слово, употреблявшееся только редкими). В записочках, разосланных утром с красным лакеем, было написано без различия во всех:
«Si vous n’avez rien de mieux à faire, M. le comte (или mon prince), et si la perspective de passer la soirée chez une pauvre malade ne vous effraye pas trop, je serai charmée de vous voir chez moi entre 7 et 10 heures. Annette Scherer».
— Dieu, quelle virulente sortie! — отвечал, нисколько не смутясь такою встречей, вошедший князь, в придворном, шитом мундире, в чулках, башмаках, и звездах, с светлым выражением плоского лица.
Он говорил на том изысканном французском языке, на котором не только говорили, но и думали наши деды, и с теми тихими, покровительственными интонациями, которые свойственны состаревшемуcя в свете и при дворе значительному человеку. Он подошел к Анне Павловне, поцеловал ее руку, подставив ей свою надушенную и сияющую лысину, и покойно уселся на диване.
— Avant tout dites moi, comment vous allez, chère amie? Успокойте меня, — сказал он, не изменяя голоса и тоном, в котором из-за приличия и участия просвечивало равнодушие и даже насмешка.
"""
with open("war_and_peace_excerpt.txt", encoding="utf-8") as file:
    war_and_peace = file.read()
print(war_and_peace)
cleaned_war_and_peace = war_and_peace.replace("\n", " ")
print(cleaned_war_and_peace)
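# replace("\n", " ") only swaps newlines for spaces; if the excerpt ever contains runs
# of whitespace (tabs, double spaces), a sketch that collapses them all at once
# (whitespace_collapsed is just an illustrative name and is not used later on)
whitespace_collapsed = " ".join(war_and_peace.split())
print(whitespace_collapsed)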
!pip install nltk
!pip install spacy
!pip install stanza
import nltk
import spacy
import stanza
# first, let's install the 'punkt' resources required to use the tokenizer
import nltk
nltk.download('punkt')
# then we import the sent_tokenize method and apply it to our cleaned_war_and_peace variable
from nltk.tokenize import sent_tokenize
nltk_sent_tokenized = sent_tokenize(cleaned_war_and_peace)
# if you were going to specify a language, the following syntax would be used: nltk_sent_tokenized = sent_tokenize(cleaned_war_and_peace, language="russian")
# printing the Russian sentence at index 5 in our list of sentences
rus_sent = nltk_sent_tokenized[5]
print('Russian: ' + rus_sent)
# printing the French sentence at index 13
fre_sent = nltk_sent_tokenized[13]
print('French: ' + fre_sent)
# printing the sentence in both French and Russian at index 4
multi_sent = nltk_sent_tokenized[4]
print('Multilang: ' + multi_sent)
# printing each sentence in our list
for sent in nltk_sent_tokenized:
    print(sent)
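# the index numbers used above (5, 13, 4) depend on how the tokenizer split the text;
# a quick sketch that prints every sentence next to its index so you can verify them
for i, sent in enumerate(nltk_sent_tokenized):
    print(i, sent)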
# downloading our multilingual sentence tokenizer
!python -m spacy download xx_sent_ud_sm
# loading the multilingual sentence tokenizer we just downloaded
nlp = spacy.load("xx_sent_ud_sm")
# applying the spaCy model to our text variable
doc = nlp(cleaned_war_and_peace)
# assigning the tokenized sentences to a list so it's easier for us to manipulate them later
spacy_sentences = list(doc.sents)
# printing the sentences to our console
print(spacy_sentences)
# concatenating the Russian sentence and its language label
spacy_rus_sent = str(spacy_sentences[5])
print('Russian: ' + spacy_rus_sent)
# concatenating the French sentence and its language label
spacy_fre_sent = str(spacy_sentences[13])
print('French: ' + spacy_fre_sent)
# concatenating the French and Russian sentence and its label
spacy_multi_sent = str(spacy_sentences[4])
print('Multilang: ' + spacy_multi_sent)
from stanza.pipeline.multilingual import MultilingualPipeline
# setting up our tokenizer pipeline
nlp = MultilingualPipeline(processors='tokenize')
# applying the pipeline to our text
doc = nlp(cleaned_war_and_peace)
# printing all sentences to see how they tokenized
print([sentence.text for sentence in doc.sentences])
# creating an empty list to append our sentences to
stanza_sentences = []
# iterating through the sentence tokens created by the tokenizer pipeline and appending to the list
for sentence in doc.sentences:
    stanza_sentences.append(sentence.text)
# printing our sentence that is only in Russian
stanza_rus_sent = str(stanza_sentences[5])
print('Russian: ' + stanza_rus_sent)
# printing our sentence that is only in French
stanza_fre_sent = str(stanza_sentences[12])
print('French: ' + stanza_fre_sent)
# printing our sentence in both French and Russian
stanza_multi_sent = str(stanza_sentences[4])
print('Multilang: ' + stanza_multi_sent)
# downloading an NLTK corpus reader required by the TextCat module
nltk.download('crubadan')
# loading the TextCat package and applying it to each of our sentences
tcat = nltk.classify.textcat.TextCat()
rus_estimate = tcat.guess_language(rus_sent)
fre_estimate = tcat.guess_language(fre_sent)
multi_estimate = tcat.guess_language(multi_sent)
# printing the results
print('Russian estimate: ' + rus_estimate)
print('French estimate: ' + fre_estimate)
print('Multilingual estimate: ' + multi_estimate)
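# TextCat returns three-letter ISO 639-3 codes (e.g. 'rus', 'fra') rather than full
# language names; a minimal, hand-written lookup for the codes we expect in this
# excerpt (not an exhaustive mapping), falling back to the raw code otherwise
iso_639_3_names = {'rus': 'Russian', 'fra': 'French'}
print(iso_639_3_names.get(rus_estimate, rus_estimate))
print(iso_639_3_names.get(fre_estimate, fre_estimate))
print(iso_639_3_names.get(multi_estimate, multi_estimate))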
# first, we install the spacy_langdetect package from the Python Package Index
!pip install spacy_langdetect
# then we import it and use it to detect our languages
from spacy.language import Language
from spacy_langdetect import LanguageDetector
# setting up our language detector to work with spaCy
def get_lang_detector(nlp, name):
    return LanguageDetector()

# setting up our pipeline: reloading the multilingual spaCy model (nlp currently
# points at the Stanza pipeline), registering the detector as a factory, and
# adding it to the end of the pipeline
nlp = spacy.load("xx_sent_ud_sm")
Language.factory("language_detector", func=get_lang_detector)
nlp.add_pipe('language_detector', last=True)
# running the language detection on each sentence and printing the results
rus_doc = nlp(spacy_rus_sent)
print(rus_doc._.language)
fre_doc = nlp(spacy_fre_sent)
print(fre_doc._.language)
multi_doc = nlp(spacy_multi_sent)
print(multi_doc._.language)
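# spacy_langdetect stores its result as a dictionary with 'language' and 'score' keys;
# a sketch that pulls those apart, assuming that structure
for label, detected in [('Russian', rus_doc), ('French', fre_doc), ('Multilang', multi_doc)]:
    print(label, detected._.language.get('language'), detected._.language.get('score'))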
# importing our models required for language detection
from stanza.models.common.doc import Document
from stanza.pipeline.core import Pipeline
# setting up our pipeline
nlp = Pipeline(lang="multilingual", processors="langid")
# specifying which sentences to run the detection on, then running the detection code
docs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]
docs = [Document([], text=text) for text in docs]
nlp(docs)
# printing the text of each sentence alongside the language estimates
print("\n".join(f"{doc.text}\t{doc.lang}" for doc in docs))
# first, we split the sentence into its component words using the wordpunct_tokenize function
from nltk.tokenize import wordpunct_tokenize
tokenized_sent = wordpunct_tokenize(multi_sent)
# importing the regex package so we can use a regular expression
import regex
# importing the string package to detect punctuation
from string import punctuation
# setting empty lists we will later populate with our words
cyrillic_words = []
latin_words = []
for word in tokenized_sent:
    if word in punctuation:
        continue
    else:
        if regex.search(r'\p{IsCyrillic}', word):
            cyrillic_words.append(word)
        else:
            latin_words.append(word)
print(cyrillic_words)
print(latin_words)
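# regex.search(r'\p{IsCyrillic}', word) only asks whether a word contains any Cyrillic
# character; a small sketch of a helper that also checks for Latin script explicitly
# (the regex module supports \p{IsLatin} as well), rather than treating everything
# non-Cyrillic as Latin
def word_script(word):
    if regex.search(r'\p{IsCyrillic}', word):
        return 'cyrillic'
    elif regex.search(r'\p{IsLatin}', word):
        return 'latin'
    return 'other'

print(word_script('здравствуйте'), word_script('bonjour'), word_script('1805'))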
# joining the lists into a string, with each word separated by a space (' ')
cyrillic_only_list = ' '.join(cyrillic_words)
latin_only_list = ' '.join(latin_words)
# now we use TextCat again to detect their languages
tcat = nltk.classify.textcat.TextCat()
multi_estimate_1 = tcat.guess_language(cyrillic_only_list)
multi_estimate_2 = tcat.guess_language(latin_only_list)
# printing our estimates
print('Cyrillic estimate: ' + multi_estimate_1)
print('Latin estimate: ' + multi_estimate_2)
# downloading our Russian model from spaCy
!python -m spacy download ru_core_news_sm
# loading the model
nlp = spacy.load("ru_core_news_sm")
# applying the model
doc = nlp(spacy_rus_sent)
# printing the text of each word and its POS tag
for token in doc:
    print(token.text, token.pos_)
# downloading our French model from spaCy
!python -m spacy download fr_core_news_sm
# loading the model
nlp = spacy.load("fr_core_news_sm")
# applying the model
doc = nlp(spacy_fre_sent)
# printing the text of each word and its POS tag
for token in doc:
    print(token.text, token.pos_)
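# spaCy tokens also expose morphological features (gender, number, tense, and so on)
# through token.morph in spaCy 3+; a short sketch building on the French doc above
for token in doc:
    print(token.text, token.pos_, token.morph)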
# creating our blank lists to append to later
cyrillic_words_punct = []
latin_words_punct = []
# initializing a blank string to keep track of the last list we appended to
last_appended_list = ''
# iterating through our words and appending based on whether a Cyrillic character was detected
for word in tokenized_sent:
    if regex.search(r'\p{IsCyrillic}', word):
        cyrillic_words_punct.append(word)
        # updating our string to track the list we appended a word to
        last_appended_list = 'cyr'
    else:
        # handling punctuation by appending it to our most recently used list
        if word in punctuation:
            if last_appended_list == 'cyr':
                cyrillic_words_punct.append(word)
            elif last_appended_list == 'lat':
                latin_words_punct.append(word)
        else:
            latin_words_punct.append(word)
            last_appended_list = 'lat'
print(cyrillic_words_punct)
print(latin_words_punct)
# joining the lists to strings
cyrillic_only_list = ' '.join(cyrillic_words_punct)
latin_only_list = ' '.join(latin_words_punct)
# using our regular expression to remove extra whitespace before the punctuation marks
cyr_no_extra_space = regex.sub(r'\s([?.!,"](?:\s|$))', r'\1', cyrillic_only_list)
lat_no_extra_space = regex.sub(r'\s([?.!,"](?:\s|$))', r'\1', latin_only_list)
# checking the results of the regular expression above
print(cyr_no_extra_space)
print(lat_no_extra_space)
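# a tiny check of what that regular expression does, on a throwaway string: the space
# before ',' and '!' is removed, while the spaces between words are kept
print(regex.sub(r'\s([?.!,"](?:\s|$))', r'\1', 'Bonjour , monde !'))
# prints: Bonjour, monde!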
# loading and applying the model
nlp = spacy.load("ru_core_news_sm")
doc = nlp(cyr_no_extra_space)
# printing the text of each word and its POS tag
for token in doc:
    print(token.text, token.pos_)
# and doing the same with our French sentence
nlp = spacy.load("fr_core_news_sm")
doc = nlp(lat_no_extra_space)
for token in doc:
    print(token.text, token.pos_)
# loading our pipeline and applying it to our sentence, specifying our language as Russian ('ru')
nlp = stanza.Pipeline(lang='ru', processors='tokenize,pos')
doc = nlp(stanza_rus_sent)
# printing our words and POS tags
print(*[f'word: {word.text}\tupos: {word.upos}' for sent in doc.sentences for word in sent.words], sep='\n')
# loading our pipeline and applying it to our sentence, specifying our language as French ('fr')
nlp = stanza.Pipeline(lang='fr', processors='tokenize,mwt,pos')
doc = nlp(stanza_fre_sent)
# printing our words and POS tags
print(*[f'word: {word.text}\tupos: {word.upos}' for sent in doc.sentences for word in sent.words], sep='\n')
# imports so we can use Stanza's MultilingualPipeline
from stanza.models.common.doc import Document
from stanza.pipeline.core import Pipeline
from stanza.pipeline.multilingual import MultilingualPipeline
# running the multilingual pipeline on our French, Russian, and multilingual sentences simultaneously
nlp = MultilingualPipeline(processors='tokenize,pos')
docs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]
nlped_docs = nlp(docs)
# printing the results for each sentence
for doc in nlped_docs:
    print(*[f'word: {word.text}\tupos: {word.upos}' for sent in doc.sentences for word in sent.words], sep='\n')
# loading and applying our Russian model
nlp = spacy.load("ru_core_news_sm")
doc = nlp(spacy_rus_sent)
# printing each word alongside its lemma
for token in doc:
    print(token, token.lemma_)
# loading and applying our French model
nlp = spacy.load("fr_core_news_sm")
doc = nlp(spacy_fre_sent)
# again printing each word alongside its lemma
for token in doc:
    print(token, token.lemma_)
# loading and applying the model
nlp = spacy.load("ru_core_news_sm")
doc = nlp(cyr_no_extra_space)
# printing the words and their lemmas
for token in doc:
    print(token, token.lemma_)
# loading and applying the model
nlp = spacy.load("fr_core_news_sm")
doc = nlp(lat_no_extra_space)
# printing the words and their lemmas
for token in doc:
    print(token, token.lemma_)
# imports so we can run the multilingual pipeline
from stanza.models.common.doc import Document
from stanza.pipeline.core import Pipeline
from stanza.pipeline.multilingual import MultilingualPipeline
# adding the 'lemma' processor to the pipeline and running it on our sentences
nlp = MultilingualPipeline(processors='tokenize,lemma')
docs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]
nlped_docs = nlp(docs)
# iterating through each sentence's words and printing the lemmas
for doc in nlped_docs:
    lemmas = [word.lemma for t in doc.iter_tokens() for word in t.words]
    print(lemmas)
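# the same MultilingualPipeline can run part-of-speech tagging and lemmatization in one
# pass; a sketch that combines the two processors used above and prints each word with
# its tag and its lemma
nlp = MultilingualPipeline(processors='tokenize,pos,lemma')
docs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]
for doc in nlp(docs):
    for sent in doc.sentences:
        for word in sent.words:
            print(f'{word.text}\t{word.upos}\t{word.lemma}')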