#!/usr/bin/env python
# coding: utf-8

# Tutorial script (exported from a Jupyter notebook): keyphrase extraction
# with ktrain's KeywordExtractor, including non-English text and a
# joblib-parallelized scalability demo. Requires an IPython environment
# (get_ipython) plus the `ktrain`, `textblob`, `tika`, and `joblib` packages,
# and network access for the wget download cells.

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


# ## Keyphrase Extraction in `ktrain`
#
# Keyphrase extraction in **ktrain** leverages the [textblob](https://textblob.readthedocs.io/en/dev/) package, which can be installed with:
# ```
# pip install textblob tika
# python -m textblob.download_corpora
# ```

# In[2]:


from ktrain.text.kw import KeywordExtractor
from ktrain.text.textextractor import TextExtractor


# ### Download a Paper from ArXiv and Extract Text
# For our test document, let's download the ktrain ArXiv paper and use the `TextExtractor` module to extract text.

# In[3]:


get_ipython().system('wget --user-agent="Mozilla" https://arxiv.org/pdf/2004.10703.pdf -O /tmp/downloaded_paper.pdf -q')
text = TextExtractor().extract('/tmp/downloaded_paper.pdf')


# In[4]:


print(f"# of words in downloaded paper: {len(text.split())}")


# ### Using N-Grams as the candidate generator
#
# Let's first use `ngrams` as the candidate generator, which is comparatively fast:

# In[5]:


kwe = KeywordExtractor()


# In[6]:


get_ipython().run_cell_magic('time', '', "kwe.extract_keywords(text, candidate_generator='ngrams')\n")


# ### Using Noun Phrases as the candidate generator
#
# If we use `noun_phrases` as the candidate generator instead, quality improves slightly at the expense of a longer running time.

# In[8]:


get_ipython().run_cell_magic('time', '', "kwe.extract_keywords(text, candidate_generator='noun_phrases')\n")


# ### Other Parameters
# The `extract_keywords` method has many other parameters to control the output. For instance, you can control
# the number of words in keyphrases with the `ngram_range` parameter. Here, we extract 3-word keyphrases:

# In[9]:


kwe.extract_keywords(text, candidate_generator='noun_phrases', ngram_range=(3, 3))


# ### Combining All the Steps: Low-Code Keyphrase Extraction

# In[10]:


from ktrain.text.kw import KeywordExtractor
from ktrain.text.textextractor import TextExtractor

get_ipython().system('wget --user-agent="Mozilla" https://arxiv.org/pdf/2004.10703.pdf -O /tmp/downloaded_paper.pdf -q')
text = TextExtractor().extract('/tmp/downloaded_paper.pdf')
kwe = KeywordExtractor()
kwe.extract_keywords(text, candidate_generator='noun_phrases')


# ### Non-English Keyphrase Extraction
#
# Keyphrases can be extracted for non-English languages by supplying a 2-character language code as the `lang` argument.
# For simplified or traditional Chinese, use `zh`.
#
# #### Chinese

# In[11]:


text = """
监督学习是学习一个函数的机器学习任务
根据样本输入-输出对将输入映射到输出。他推导出一个
函数来自由一组训练示例组成的标记训练数据。
在监督学习中,每个示例都是由一个输入对象组成的对
(通常是一个向量)和一个期望的输出值(也称为监控信号)。
监督学习算法分析训练数据并产生推断函数,
可用于映射新示例。最佳方案将允许
算法来正确确定不可见实例的类标签。这需要
学习算法从训练数据泛化到新情况
“合理”的方式(见归纳偏差)。
"""
kwe = KeywordExtractor(lang='zh')
kwe.extract_keywords(text)


# #### French

# In[12]:


text = """L'apprentissage supervisé est la tâche d'apprentissage automatique consistant à apprendre une fonction
qui mappe une entrée à une sortie sur la base d'exemples de paires entrée-sortie. Il en déduit une
fonction à partir de données d'entraînement étiquetées constituées d'un ensemble d'exemples d'entraînement.
En apprentissage supervisé, chaque exemple est une paire composée d'un objet d'entrée
(généralement un vecteur) et une valeur de sortie souhaitée (également appelée signal de supervision).
Un algorithme d'apprentissage supervisé analyse les données d'apprentissage et produit une fonction inférée,
qui peut être utilisé pour cartographier de nouveaux exemples. Un scénario optimal permettra
algorithme pour déterminer correctement les étiquettes de classe pour les instances invisibles.
Cela nécessite l'algorithme d'apprentissage pour généraliser à partir des données d'entraînement
à des situations inédites dans un manière « raisonnable » (voir biais inductif)."""
kwe = KeywordExtractor(lang='fr')
kwe.extract_keywords(text)


# The following languages are supported:

# In[13]:


from ktrain.text.kw.core import SUPPORTED_LANGS

for k, v in SUPPORTED_LANGS.items():
    print(k, v)


# ### Scalability
# The `KeywordExtractor` is already fast. With parallelization, keyphrase extraction can easily scale
# to a large number of documents.

# In[14]:


text = """
Supervised learning is the machine learning task of learning a function that
maps an input to an output based on example input-output pairs. It infers a
function from labeled training data consisting of a set of training examples.
In supervised learning, each example is a pair consisting of an input object
(typically a vector) and a desired output value (also called the supervisory signal).
A supervised learning algorithm analyzes the training data and produces an inferred function,
which can be used for mapping new examples. An optimal scenario will allow for the
algorithm to correctly determine the class labels for unseen instances. This requires
the learning algorithm to generalize from the training data to unseen situations in a
'reasonable' way (see inductive bias).
"""
docs = [text] * 10000
kwe = KeywordExtractor()


# We can process these 10,000 documents using 8 processors in only a few seconds:

# In[16]:


get_ipython().run_cell_magic('time', '', 'from joblib import Parallel, delayed\nresults = Parallel(n_jobs=8)(delayed(kwe.extract_keywords)(doc) for doc in docs)\n')


# In[17]:


print(f'# of results is {len(results)}')
results[0]


# In[ ]: