#!/usr/bin/env python
# coding: utf-8

# ## [mlcourse.ai](https://mlcourse.ai) – Open Machine Learning Course
# ### Author: I_Kalininskii, Kiavip
#
# ## Tutorial
"Constructing simple Chatbot using **spaCy** - Industrial-Strength Natural Language Processing" # # This tutorial based on works of Shirish Kadam (Adam https://shirishkadam.com/tag/spacy/ ) and Parul Pandey (Robo https://medium.com/analytics-vidhya/building-a-simple-chatbot-in-python-using-nltk-7c8c8215ac6e). With respect to the authors, code used only to show spaCy in action. # In the words of *Matthew Honnibal* (author of spaCy); # # ” There’s a real philosophical difference between spaCy and NLTK. spaCy is written to help you get things done. It’s minimal and opinionated. We want to provide you with exactly one way to do it — the right way. In contrast, NLTK was created to support education. Most of what’s there is for demo purposes, to help students explore ideas. spaCy provides very fast and accurate syntactic analysis (the fastest of any library released), and also offers named entity recognition and ready access to word vectors. You can use the default word vectors, or replace them with any you have. # # What really sets it apart, though, is the API. spaCy is the only library that has all of these features together, and allows you to easily hop between these levels of representation. Here’s an example of how that helps. Tutorial: Search Reddit for comments about Google doing something . spaCy also ensures that the annotations are always aligned to the original string, so you can easily print mark-up: Tutorial: Mark all adverbs, particularly for verbs of speech . “ # # To install spaCy you may use simple commands: # # **pip install --trusted-host pypi.org spacy** # # or, maybe: # # **pip install --trusted-host pypi.org -U spacy** # # And then you need to download core and some models, which you will be able to use in your projects: # # **python -m spacy download en** # # **python -m spacy download en_core_web_sm** # # **python -m spacy download en_core_web_md** # # **python -m spacy download en_core_web_lg** # # **python -m spacy download en_vectors_web_lg** # # First *("en")* is a basic module to work with English language. # # Second *("en_core_web_sm")* is an english multi-task Convolucional Neural Network (CNN) trained on OntoNotes, with GloVe vectors trained on Common Crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities. This one I shall use. # # Third *("en_core_web_md")* is an english multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities. This one brings powerful extensions. # # Fourth *("en_core_web_lg")* is an English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities. This model is really huge and complicated. # # Fifth *("en_vectors_web_lg")* is a 300-dimensional word vectors trained on Common Crawl with GloVe. This model is especially useful when it came to deep meaning of words due to large number of vectors: more than a million! # # Of course, spaCy supports other languages: German, Spanish, Portuguese, French, Italian and Dutch. But I will use only English. # To show the way spaCy deals with Natural Language Processing, I'll implement simple chatbot. 
# First, import spaCy to your notebook and load the preferred model:

# In[ ]:


import random

import spacy
from spacy.matcher import PhraseMatcher


# In[ ]:


nlp = spacy.load("en_core_web_sm")


# Now I'll introduce some text to spaCy.

# In[ ]:


text = (
    u"Tutorials must be written in English. "
    u"As for programming language, only Python is allowed."
)
doc = nlp(text)


# Just to show you some basics, I'll perform two simple loops. First, split the text into sentences, which are spaCy's **Span** objects:

# In[ ]:


for sent in doc.sents:
    print(sent.text, sent.lemma_)


# Then, split it into words, which are spaCy's **Token** objects:

# In[ ]:


for token in doc:
    print(
        token.i,
        token.text,
        token.lemma_,
        token.tag_,
        token.pos_,
        token.dep_,
        token.head.text,
        token.head.pos_,
        [child for child in token.children],
    )


# There are many important attributes, but these are the essential ones. You can see that the sentences are split correctly. Words are tokenized and marked with a fine-grained part-of-speech tag *(token.tag_)*, a coarse-grained part-of-speech tag *(token.pos_)* and a syntactic dependency relation *(token.dep_)*. Each token can have a head token and child tokens, which are **Token** objects too!
#
# spaCy has a useful visualisation module, displaCy. It can visualise the dependency tree of a sentence, highlight entities and much more (a rendering sketch follows the helper functions below).

# Now I'll implement some base functions.
#
# I need to find complements; the following dependencies will be used:
#
# *xcomp*: an open clausal complement of a verb or an adjective is a predicative or clausal complement without its own subject.
#
# *amod*: an adjectival modifier of a noun is any adjectival phrase that serves to modify the meaning of the noun.
#
# *ccomp*: a clausal complement of a verb or adjective is a dependent clause which is a core argument. That is, it functions like an object of the verb or adjective.
#
# *acomp*: an adjectival complement of a verb is an adjectival phrase which functions as the complement.

# In[ ]:


def get_root_phrase(token, keywords):
    """ Get all complements of a verb """
    for child in token.children:
        if child.dep_ in ("acomp", "xcomp", "ccomp"):
            keywords.append(child.lemma_)
    return keywords


# In[ ]:


def get_adj_phrase(token, token_text):
    """ Fetch all the adjectives describing the noun """
    for child in token.children:
        if (
            child.dep_ == "amod"
            or child.dep_ == "acomp"
            or child.dep_ == "ccomp"
        ):
            # not for "how much" / "how many"
            if child.text != "much" and child.text != "many":
                token_text = child.lemma_ + " " + token_text
    return token_text


# In[ ]:


def get_compound_nouns(en_doc, token, token_text):
    """ Recursively find the left and right compound nouns """
    parent_token = token

    # If the previous token is a compound noun
    while token.i > 0 and en_doc[token.i - 1].dep_ == "compound":
        token_text = en_doc[token.i - 1].text + " " + token_text
        token = en_doc[token.i - 1]
        # if the compound noun has any adjective modifier
        token_text = get_adj_phrase(token, token_text)

    token = parent_token

    # If the next token is a compound noun
    while token.i < len(en_doc) - 1 and en_doc[token.i + 1].dep_ == "compound":
        token_text = token_text + " " + en_doc[token.i + 1].text
        token = en_doc[token.i + 1]
        # if the compound noun has any adjective modifier
        token_text = get_adj_phrase(token, token_text)

    # NOTE: Can token.shape_ == Xxxx... or XXXX... or token.ent_iob_ help us here?
    return token_text
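# As mentioned above, displaCy can render the dependency parse that these helper functions walk over. Below is a minimal, optional sketch (not part of the original bot); it assumes you are running inside a Jupyter notebook, where `jupyter=True` renders the markup inline.

# In[ ]:


from spacy import displacy

# Render the dependency tree of the sample doc loaded earlier;
# style="ent" would highlight named entities instead.
displacy.render(doc, style="dep", jupyter=True)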
# In[ ]:


def get_noun_chunk(sentence, en_doc, keywords):
    """ Parse the given sentence and extend the keywords incrementally """
    root_word = ""
    for token in sentence:
        # If it is a Noun/Proper Noun, be it Singular or Plural
        if (
            token.tag_ == "NN"
            or token.tag_ == "NNP"
            or token.tag_ == "NNPS"
            or token.tag_ == "NNS"
        ):
            # If the Noun itself is not a compound Noun, then we can find its compound Nouns
            if token.dep_ != "compound":
                token_text = get_compound_nouns(en_doc, token, token.text)
                keywords.append(token_text)

        if token.tag_ == "JJ" and token.dep_ == "attr":
            token_text = get_compound_nouns(en_doc, token, token.text)
            token_text = get_adj_phrase(token, token_text)
            keywords.append(token_text)

        # If it is a Cardinal Number and the dependency is a numeric modifier
        # nummod: a numeric modifier of a noun is any number phrase that
        # serves to modify the meaning of the noun with a quantity.
        if token.dep_ == "nummod" or token.tag_ == "CD":
            token_text = token.text
            if token.i > 0:
                # If the previous token is an Adjective, the adjective is linked with the cardinal number
                if en_doc[token.i - 1].tag_ == "JJ":
                    token_text = en_doc[token.i - 1].text + " " + token.text
            if token.i < len(en_doc) - 1:
                # If the next token is an Adjective
                if en_doc[token.i + 1].tag_ == "JJ":
                    token_text = token.text + " " + en_doc[token.i + 1].text
            keywords.append(token_text)

        # Extract the root word of the sentence
        if token.dep_ == "ROOT":
            root_word = token.lemma_
            keywords = get_root_phrase(token, keywords)

    return root_word, keywords


# **extract_features**(*sentence_type, en_doc*) is the main sentence-parsing procedure. sentence_type will be used later to determine whether the input is a question, an argument, a command or another construction.

# In[ ]:


def extract_features(sentence_type, en_doc):
    """ Extract keywords; the sentence_type argument isn't used yet """
    keywords = []
    for sentence in en_doc.sents:
        root, keywords = get_noun_chunk(sentence, en_doc, keywords)
        keywords.append(root)
    return keywords


# Parse the given sentence.

# In[ ]:


def get_sentence_doc(sentence):
    sentence_doc = nlp(u"" + sentence)
    return sentence_doc


# It's not implemented yet, but the bot should understand whether the input is a question, a statement or, maybe, useless text.

# In[ ]:


def classify_sentence(sentence_doc):
    return None
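# A quick, hypothetical sanity check (not part of the original notebook): run the keyword extraction on the sample doc loaded earlier. The exact output depends on the parser, but you should see noun phrases from the text plus the root verbs.

# In[ ]:


print(extract_features(None, doc))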
# Sentence processing chain.

# In[ ]:


def process_sentence(sentence):
    """ Get a structured query object for the given sentence """
    sentence_doc = get_sentence_doc(sentence)
    sentence_class = classify_sentence(sentence_doc)
    sentence_keywords = extract_features(sentence_class, sentence_doc)
    matcher_map = construct_matcher(sentence_keywords, sentence_doc)
    return matcher_map


# In[ ]:


class MatcherMap:
    """
    This class is created to build a spaCy Matcher or PhraseMatcher.

    [0] - Features
    [1] - Conjunctions (nested list with the conjunct and coordinating conjunction)
    [2] - Negations
    [3] - Markers
    """

    def __init__(self, ip_matcher=None):
        self.__constructed_matcher__ = [None] * 4
        self.coordinating_conjuncts = []
        if ip_matcher is not None and len(ip_matcher) == 4:
            self.__constructed_matcher__ = ip_matcher

    def add_features(self, feature_list):
        self.__constructed_matcher__[0] = feature_list

    def add_conjunctions(self, conjunction_list):
        self.__constructed_matcher__[1] = conjunction_list

    def add_coordinating_conjunct(self, c_conjunct):
        self.coordinating_conjuncts.append(c_conjunct)

    def add_negations(self, negation_list):
        self.__constructed_matcher__[2] = negation_list

    def add_markers(self, marker_list):
        self.__constructed_matcher__[3] = marker_list

    def get_constructed_query(self):
        return self.__constructed_matcher__

    def get_features(self):
        return self.__constructed_matcher__[0]

    def get_conjunctions(self):
        return self.__constructed_matcher__[1]

    def get_negations(self):
        return self.__constructed_matcher__[2]

    def get_markers(self):
        return self.__constructed_matcher__[3]

    def __repr__(self):
        return (
            "{Features: " + str(self.__constructed_matcher__[0]) + ", "
            "Conjunctions: " + str(self.__constructed_matcher__[1]) + ", "
            "Negations: " + str(self.__constructed_matcher__[2]) + ", "
            "Markers: " + str(self.__constructed_matcher__[3]) + "}"
        )


# In[ ]:


def get_conjuncts(token):
    """
    A conjunct is the relation between two elements connected by a
    coordinating conjunction, such as "and", "or", etc.
    We treat conjunctions asymmetrically: the head of the relation is the
    first conjunct and all the other conjuncts depend on it via the conj relation.

    Coordinating conjunctions: and, or, but, yet, so, nor, for.
    Correlative conjunctions: either...or, whether...or, not only...but also.
    """
    parent = token.head
    conj = [parent.text]
    for child in parent.children:
        if child.dep_ == "conj":
            conj.append(child.text)
    return conj


# This function constructs lists of features and relations for the future matchers.

# In[ ]:


def get_matcher(sentence, feature_list):
    """ Sequentially add the query components to the structured query """
    matcher_map = MatcherMap()
    matcher_map.add_features(feature_list)

    conjunct_list = []
    neg_list = []
    mark_list = []

    for token in sentence:
        # cc: the relation between a conjunct and a preceding coordinating conjunction.
        if token.dep_ == "cc":
            conjunct_list.append(get_conjuncts(token))
            conjunct_list.append(token.text)
            matcher_map.add_coordinating_conjunct(token.text)

        # neg: the negation modifier is the relation between a negation word and the word it modifies.
        if token.dep_ == "neg":
            if token.i > token.head.i:
                neg_list.append([token.text, token.head.text])
            else:
                neg_list.append([token.head.text, token.text])

        # mark: a marker is the word introducing a finite clause subordinate to another clause.
        if token.dep_ == "mark":
            if token.i > token.head.i:
                mark_list.append([token.text, token.head.text])
            else:
                mark_list.append([token.head.text, token.text])

    matcher_map.add_conjunctions(conjunct_list)
    matcher_map.add_negations(neg_list)
    matcher_map.add_markers(mark_list)

    return matcher_map
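# A quick, hypothetical check (not in the original notebook): run get_matcher on a sentence containing a coordinating conjunction and a negation to see what a MatcherMap captures. The exact contents depend on the parser; `demo_doc` and `demo_sentence` are throwaway names.

# In[ ]:


demo_doc = nlp(u"Tutorials must be written in English or Russian, but not in German.")
demo_sentence = list(demo_doc.sents)[0]
print(get_matcher(demo_sentence, ["tutorial", "english", "russian"]))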
# Each sentence will be processed into its essential keywords and relations.

# In[ ]:


def construct_matcher(features_list, en_doc):
    matcher_constructed_obj = []
    for sentence in en_doc.sents:
        matcher_constructed_obj.append(get_matcher(sentence, features_list))
    return matcher_constructed_obj


# **spaCy** features a rule-matching engine, **the Matcher**, that operates over tokens, similar to regular expressions. The rules can refer to token annotations (e.g. the **token** text or tag_) and flags (e.g. IS_PUNCT). The rule matcher also lets you pass in a custom callback to act on matches – for example, to merge entities and apply custom labels. You can also associate patterns with entity IDs to allow some basic entity linking or disambiguation. To match large terminology lists, you can use the **PhraseMatcher**, which accepts **Doc** objects as match patterns.
#
# Here the **spaCy rule-based matcher** comes into action! Features are added as rules. Of course, **PhraseMatcher** and **Matcher** are versatile tools, so you can set them up to do some complex searching. I implement only the simplest matcher here, because the deadline is so close. (A token-pattern **Matcher** sketch follows the main loop below.)

# In[ ]:


def get_answer(doc, features):
    """ Process the given text and find the features """
    nlp_features = []
    answer_list = []
    matcher = PhraseMatcher(nlp.vocab)

    for f in features:
        for word in f.get_features():
            nlp_features.append(nlp(word))
    matcher.add("LEMMA", None, *nlp_features)

    matches = matcher(doc)
    for sent in doc.sents:
        for match_id, m_start, m_end in matches:
            if (sent.start <= m_end) and (sent.end >= m_start):
                answer_list.append(sent.text)

    return " ".join(set(answer_list))


# A function returning a simple response.

# In[ ]:


def response(user_response):
    bot_response = "{0}: ".format(bot_name)
    features = process_sentence(user_response)
    answer = get_answer(doc, features)
    if answer is None or answer == "":
        bot_response += "I am sorry! I don't understand you"
        return bot_response
    else:
        bot_response = bot_response + answer
        return bot_response


# Add some greetings for the bot to be polite.

# In[ ]:


GREETING_INPUTS = (
    "hello",
    "hi",
    "greetings",
    "sup",
    "what's up",
    "hey",
)
GREETING_RESPONSES = [
    "hi",
    "hey",
    "*nods*",
    "hi there",
    "hello",
    "I am glad! You are talking to me",
]
bot_name = "spaCy lil bot"


def greeting(sentence):
    for word in sentence.split():
        if word.lower() in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)


# Here is the main loop. As you can see, this little bot can only answer a simple question, but it can grow into a smart assistant.

# In[ ]:


flag = True
print(
    "{0}: I will answer your questions about the loaded text. If you want to exit, type Bye!".format(
        bot_name
    )
)
while flag:
    user_response = input()
    user_response_lwr = user_response.lower()
    if user_response_lwr != "bye":
        if user_response_lwr == "thanks" or user_response_lwr == "thank you":
            flag = False
            print("{0}: You are welcome..".format(bot_name))
        else:
            if greeting(user_response_lwr) is not None:
                print("{0}: ".format(bot_name) + greeting(user_response_lwr))
            else:
                print(response(user_response))
    else:
        flag = False
        print("{0}: Bye! don't forget to upvote me..".format(bot_name))
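# As promised above, here is a minimal sketch of the token-based **Matcher** (hypothetical, not used by the bot): a pattern is a list of dictionaries of token attributes, one dictionary per token.

# In[ ]:


from spacy.matcher import Matcher

token_matcher = Matcher(nlp.vocab)
# One pattern, three tokens: a proper noun, a form of "be", and the lemma "allow".
# On the sample text this should match a span like "Python is allowed"
# (the exact result depends on the tagger).
pattern = [{"POS": "PROPN"}, {"LEMMA": "be"}, {"LEMMA": "allow"}]
token_matcher.add("ALLOWED_LANGUAGE", None, pattern)

for match_id, start, end in token_matcher(doc):
    print(nlp.vocab.strings[match_id], "->", doc[start:end].text)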
# Just load some big model and you can predict similarities! It may or may not help you, but anyhow:
#
# **spaCy** is ready to accept almost any word2vec set to enhance its possibilities.

# In[ ]:


nlp_lg = spacy.load("en_core_web_lg")


# In[ ]:


def get_related(word):
    filtered_words = [
        w for w in word.vocab if w.is_lower == word.is_lower and w.prob >= -15
    ]
    similarity = sorted(filtered_words, key=lambda w: word.similarity(w), reverse=True)
    return similarity[1:11]


# In[ ]:


print([w.lower_ for w in get_related(nlp_lg.vocab[u"Russian"])])


# If you or I provide this little bot with additional knowledge and algorithms, it could work with maps, construct a schedule or read books for you. It could be the base of a speech-recognition engine. Vectors and similarity checks may enhance search and provide nearly endless conversational possibilities!
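# To close, here is what such a similarity check looks like at the document and token level. This is a minimal, hypothetical sketch relying on the large model's word vectors; the sentences are just illustrative.

# In[ ]:


doc_a = nlp_lg(u"I like machine learning tutorials.")
doc_b = nlp_lg(u"We enjoy courses about data science.")

# Cosine similarity of the averaged word vectors of the two documents.
print(doc_a.similarity(doc_b))

# Similarity of two individual tokens: "learning" vs "science".
print(doc_a[3].similarity(doc_b[5]))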