#!/usr/bin/env python
# coding: utf-8
#
# Text Extensions for Pandas
# Interactive Dataframe Widget
# The interactive dataframe widget is an application within the IBM CODAIT team's open source Python library, Text Extensions for Pandas. The widget aims to provide data scientists with a meaningful, visual way to interpret NLP (Natural Language Processing) data.
# This demo will walk you through an example session of using the widget and the related visualizers provided in the ```jupyter``` sub-module of Text Extensions for Pandas.
# In[1]:
import os
import regex
import sys
import numpy as np
import pandas as pd
# And of course we need the text_extensions_for_pandas library itself.
try:
    import text_extensions_for_pandas as tp
except ModuleNotFoundError as e:
    # If we're running from within the project source tree and the parent Python
    # environment doesn't have the text_extensions_for_pandas package, use the
    # version in the local source tree.
    if not os.getcwd().endswith("notebooks"):
        raise e
    if ".." not in sys.path:
        sys.path.insert(0, "..")
    import text_extensions_for_pandas as tp
# This demo will make use of the CoNLL-2003 dataset, a benchmark dataset for named entity recognition (NER). We will be looking at a token classification problem: analyzing the building blocks of natural language in this dataset so that we can process them and feed them into a machine learning algorithm. The dataset labels entities with four categories: ```locations (LOC)```, ```persons (PER)```, ```organizations (ORG)``` and ```miscellaneous (MISC)```.
#
# Our goal is to load some data from this dataset, do some basic processing and analysis, and make corrections where necessary.
#
# We will use Text Extensions for Pandas to download and parse the CoNLL dataset into dataframes to work with.
# In[2]:
# Download and cache the data set.
# NOTE: This data set is licensed for research use only. Be sure to adhere
# to the terms of the license when using this data set!
data_set_info = tp.io.conll.maybe_download_conll_data("outputs")
data_set_info
# In[3]:
gold_standard = tp.io.conll.conll_2003_to_dataframes(
    data_set_info["test"], ["pos", "phrase", "ent"], [False, True, True])
gold_standard = [
    df.drop(columns=["pos", "phrase_iob", "phrase_type"])
    for df in gold_standard
]
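# The parser returns a list of dataframes, one per document. As a quick sanity check, we can see how many documents the test fold contains.
# In[ ]:
len(gold_standard)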
# Once the dataset is downloaded and parsed, we can prepare a dataframe for visualization. Here we work with the tokens of the first document.
# In[4]:
tokens = gold_standard[0]
tokens
# In[5]:
# Convert the token-level IOB tags into one row per entity mention span.
entity_mentions = tp.io.conll.iob_to_spans(tokens)
entity_mentions.head()
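# As a quick check, we can see how many mentions of each entity type appear in this document.
# In[ ]:
entity_mentions["ent_type"].value_counts()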
# In[6]:
# Associate each entity mention with the sentence that contains it.
sentences = tokens["sentence"].unique()
entity_sentence_pairs = tp.spanner.contain_join(
    pd.Series(sentences), entity_mentions["span"], "sentence", "span")
entity_mentions = entity_mentions.merge(entity_sentence_pairs)
# Use each sentence's begin offset as a stable sentence identifier.
entity_mentions["sentence_id"] = entity_mentions["sentence"].array.begin
entity_mentions.head()
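# Now that each mention carries its containing sentence, we can, for example, count how many entity mentions each sentence contains.
# In[ ]:
entity_mentions.groupby("sentence_id").size().head()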
# We can take a closer look at what these spans look like in context by viewing the ```sentence``` column on its own as a ```SpanArray```.
# In[7]:
entity_mentions["sentence"].unique()
# We don't want to visualize every column of the dataframe, since we're only interested in the entity classifications, so the next step is to drop the columns we don't need.
# Now that the data is prepared, we can load it into the widget.
# In[8]:
widget = tp.jupyter.DataFrameWidget(entity_mentions.drop(columns=["sentence"]))
widget.display()
# If we want to interact with particular columns of this widget, we can pass the additional parameter ```interactive_columns``` with a list of the column names we want to become interactive widgets, as demonstrated below.
#
# One thing you may notice in the widget above is that the ```ent_type``` column is editable via a text box. This is fine, but there is a more appropriate way to interact with categorical data.
# In[9]:
# Convert ent_type to a pandas Categorical so the widget knows the fixed set
# of valid entity types.
categorical = pd.Categorical(
    entity_mentions["ent_type"], categories=["PER", "LOC", "ORG", "MISC"])
entity_mentions["ent_type"] = categorical
tp.jupyter.DataFrameWidget(
    entity_mentions.drop(columns=["sentence", "sentence_id"]),
    interactive_columns=["ent_type"]).display()
# In[10]:
# Apply a bulk correction: relabel the rows currently selected in the first
# widget (the `widget` variable) as ORG mentions, keeping the original labels
# alongside for comparison.
corrected_entities = entity_mentions.copy(True)
new_types = corrected_entities["ent_type"].copy()
new_types[widget.selected] = "ORG"
corrected_entities["new_type"] = new_types
corrected_entities
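# Since ```widget.selected``` reflects whatever rows were selected interactively, we can compare the original and corrected labels to see exactly which mentions changed (the result will be empty if no rows were selected).
# In[ ]:
corrected_entities[corrected_entities["ent_type"] != corrected_entities["new_type"]]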
# In[ ]: