#!/usr/bin/env python
# coding: utf-8

# # Text Extensions for Pandas
#
# ## Interactive Dataframe Widget

# The interactive dataframe widget is an application within the IBM CODAIT team's
# open-source Python library, Text Extensions for Pandas. The widget aims to give data
# scientists a meaningful, visual way to interpret NLP (Natural Language Processing)
# data.
#
# This demo will walk you through an example session of using the widget and the
# related visualizers provided in the ```jupyter``` sub-module of Text Extensions for
# Pandas.

# In[1]:


import os
import regex
import sys
import numpy as np
import pandas as pd

# And of course we need the text_extensions_for_pandas library itself.
try:
    import text_extensions_for_pandas as tp
except ModuleNotFoundError as e:
    # If we're running from within the project source tree and the parent Python
    # environment doesn't have the text_extensions_for_pandas package, use the
    # version in the local source tree.
    if not os.getcwd().endswith("notebooks"):
        raise e
    if ".." not in sys.path:
        sys.path.insert(0, "..")
    import text_extensions_for_pandas as tp


# This demo uses the CoNLL-2003 dataset, a benchmark dataset for named entity
# recognition (NER). We will look at a token classification problem: analyzing the
# building blocks of natural language in this dataset so that we can process them and
# feed them into a machine learning algorithm. The dataset labels entities with four
# categories: ```locations (LOC)```, ```persons (PER)```, ```organizations (ORG)```,
# and ```miscellaneous (MISC)```.
#
# Our goal is to load some data from this dataset, do some basic processing and
# analysis, and make corrections if necessary.
#
# We will use Text Extensions for Pandas to download the CoNLL-2003 data and parse it
# into dataframes to work with.

# In[2]:


# Download and cache the data set.
# NOTE: This data set is licensed for research use only. Be sure to adhere
# to the terms of the license when using this data set!
data_set_info = tp.io.conll.maybe_download_conll_data("outputs")
data_set_info


# In[3]:


gold_standard = tp.io.conll.conll_2003_to_dataframes(
    data_set_info["test"], ["pos", "phrase", "ent"], [False, True, True])
gold_standard = [
    df.drop(columns=["pos", "phrase_iob", "phrase_type"])
    for df in gold_standard
]


# Once we have our dataset downloaded and parsed, we can prepare our dataframe for
# visualization.

# In[4]:


tokens = gold_standard[0]
tokens


# In[5]:


entity_mentions = tp.io.conll.iob_to_spans(tokens)
entity_mentions.head()


# In[6]:


sentences = tokens["sentence"].unique()
entity_sentence_pairs = tp.spanner.contain_join(pd.Series(sentences),
                                                entity_mentions["span"],
                                                "sentence", "span")
entity_mentions = entity_mentions.merge(entity_sentence_pairs)
entity_mentions["sentence_id"] = entity_mentions["sentence"].array.begin
entity_mentions.head()


# We can take a closer look at what the ```span``` column might look like in context by
# viewing the column alone as the SpanArray data type.

# In[7]:


entity_mentions["sentence"].unique()


# We don't want to visualize every column in our dataframe, since we're only interested
# in viewing the entity classifications. The next step is to drop any columns we don't
# care about.
#
# Now that our data is prepared for analysis, we can load it up in our widget.

# In[8]:


widget = tp.jupyter.DataFrameWidget(entity_mentions.drop(columns=["sentence"]))
widget.display()


# If we want to make columns of this widget editable, we can pass in the additional
# parameter ```interactive_columns``` with a list of the column names that should
# become interactive widgets, as the sketch below shows.
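#
# As a minimal sketch of that call (the ```interactive_widget``` variable name is just
# for illustration; ```DataFrameWidget```, ```interactive_columns```, and
# ```display()``` are used exactly as elsewhere in this notebook):

# In[ ]:


interactive_widget = tp.jupyter.DataFrameWidget(
    entity_mentions.drop(columns=["sentence"]),
    interactive_columns=["ent_type"])
interactive_widget.display()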


# One thing you may notice in the widgets above is that the ```ent_type``` column is
# editable via a free-form text box. This is fine, but there is a more appropriate way
# to interact with categorical data: convert the column to a pandas ```Categorical```
# with the four entity types as its categories.

# In[9]:


categorical = pd.Categorical(entity_mentions["ent_type"],
                             categories=["PER", "LOC", "ORG", "MISC"])
entity_mentions["ent_type"] = categorical
tp.jupyter.DataFrameWidget(entity_mentions.drop(columns=["sentence", "sentence_id"]),
                           interactive_columns=["ent_type"]).display()


# Using the widget's ```selected``` property, which reflects the rows currently
# selected in the widget, we can relabel the selected mentions as ```ORG``` in a new
# ```new_type``` column of a corrected copy of the dataframe.

# In[10]:


corrected_entities = entity_mentions.copy(True)
new_types = corrected_entities["ent_type"].copy()
new_types[widget.selected] = "ORG"
corrected_entities["new_type"] = new_types
corrected_entities


# In[ ]:
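

# As an optional follow-up (a plain-pandas sketch, not a Text Extensions-specific API),
# we can check how many entity mentions were relabeled by the selection above and
# inspect just the changed rows:

# In[ ]:


# Compare the corrected labels against the originals using ordinary pandas operations.
changed = corrected_entities[
    corrected_entities["new_type"] != corrected_entities["ent_type"]]
print(f"{len(changed)} of {len(corrected_entities)} mentions were relabeled")
changed.head()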