Based on Market_Intelligence_Part1.ipynb
This notebook precomputes the model results used in parts 1 and 2 so that we don't need network access to run those parts.
# Import Python libraries
from typing import *
import json
import os
import ibm_watson
import ibm_watson.natural_language_understanding_v1 as nlu
import ibm_cloud_sdk_core
import pandas as pd
import text_extensions_for_pandas as tp
import transformers
if "IBM_API_KEY" not in os.environ:
raise ValueError("IBM_API_KEY environment variable not set. Please create "
"a free instance of IBM Watson Natural Language Understanding "
"(see https://www.ibm.com/cloud/watson-natural-language-understanding) "
"and set the IBM_API_KEY environment variable to your instance's "
"API key value.")
api_key = os.environ.get("IBM_API_KEY")
service_url = os.environ.get("IBM_SERVICE_URL")
# Initialize the Watson NLU Python API
natural_language_understanding = ibm_watson.NaturalLanguageUnderstandingV1(
    version="2021-01-01",
    authenticator=ibm_cloud_sdk_core.authenticators.IAMAuthenticator(api_key)
)
natural_language_understanding.set_service_url(service_url)
# Github notebook gists will be this wide: ------------------>
# Screenshots of this notebook should be this wide: ----------------------------->
# Show the document URL used
doc_url = "https://newsroom.ibm.com/2020-02-19-IBM-Power-Systems-Certified-for-SAP-HANA-R-Enterprise-Cloud-as-a-provider-for-large-SAP-HANA-systems"
doc_url
'https://newsroom.ibm.com/2020-02-19-IBM-Power-Systems-Certified-for-SAP-HANA-R-Enterprise-Cloud-as-a-provider-for-large-SAP-HANA-systems'
# Rerun NLU, capture all results, and convert to DataFrames
nlu_results = natural_language_understanding.analyze(
    url=doc_url,
    return_analyzed_text=True,
    features=nlu.Features(
        entities=nlu.EntitiesOptions(mentions=True),
        semantic_roles=nlu.SemanticRolesOptions())).get_result()
dataframes = tp.io.watson.nlu.parse_response(nlu_results)
nlu_results.keys()
dict_keys(['usage', 'semantic_roles', 'retrieved_url', 'language', 'entities', 'analyzed_text'])
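The parse_response() call above returns a dictionary with one DataFrame per NLU feature. As an optional sanity check, we can list which DataFrames were produced (just an inspection step; the exact keys depend on the features requested above):
# List the DataFrames parsed from the NLU response
sorted(dataframes.keys())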
# Generate the Watson version of person mentions as raw JSON
person_mentions_watson_json = [
    e for e in nlu_results["entities"] if e["type"] == "Person"]
person_mentions_watson_json
[{'type': 'Person', 'text': 'Christoph Herman', 'relevance': 0.217154, 'mentions': [{'text': 'Christoph Herman', 'location': [1213, 1229], 'confidence': 0.94435}], 'count': 1, 'confidence': 0.94435}, {'type': 'Person', 'text': 'Stephen Leonard', 'relevance': 0.136166, 'mentions': [{'text': 'Stephen Leonard', 'location': [2227, 2242], 'confidence': 0.989177}], 'disambiguation': {'name': 'Steve_Leonard', 'dbpedia_resource': 'http://dbpedia.org/resource/Steve_Leonard'}, 'count': 1, 'confidence': 0.989177}, {'type': 'Person', 'text': 'Sam Ponedal', 'relevance': 0.020711, 'mentions': [{'text': 'Sam Ponedal', 'location': [3574, 3585], 'confidence': 0.894298}], 'count': 1, 'confidence': 0.894298}]
# Generate the Watson Person output as a DataFrame
entity_mentions = dataframes["entity_mentions"]
person_mentions_watson = (
    entity_mentions[entity_mentions["type"] == "Person"]
    [["span", "confidence"]].rename(columns={"span": "person"}))
person_mentions_watson
| | person | confidence |
|---|---|---|
| 38 | [1213, 1229): 'Christoph Herman' | 0.944350 |
| 41 | [2227, 2242): 'Stephen Leonard' | 0.989177 |
| 48 | [3574, 3585): 'Sam Ponedal' | 0.894298 |
# Generate the filtered SRL output as a DataFrame and add offset info
semantic_roles = dataframes["semantic_roles"]
doc_text = entity_mentions["span"].array.document_text
quotes = semantic_roles[semantic_roles["action.normalized"] == "say"].reset_index(drop=True)
# Add location info that isn't present in the output of Watson NLU
for colname in ("subject", "object"):
begins = pd.Series([doc_text.index(s) for s in quotes[f"{colname}.text"]], dtype=int)
ends = begins + quotes[f"{colname}.text"].str.len()
quotes[colname] = tp.SpanArray(doc_text, begins, ends)
quotes = (quotes[["subject", "action.text", "object"]]
          .rename(columns={"action.text": "verb"}))
someone_said_something_df = quotes
someone_said_something_df
| | subject | verb | object |
|---|---|---|---|
| 0 | [1213, 1281): 'Christoph Herman, SVP and Head ... | said | [937, 1205): 'SAP HANA Enterprise Cloud on IBM... |
| 1 | [2227, 2519): 'Stephen Leonard, General Manage... | said | [2028, 2219): 'In June, IBM announced the avai... |
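Because the begin/end offsets above are reconstructed with str.index, which returns the first occurrence of each string, a small round-trip check is worthwhile. This is a minimal sketch that assumes text_extensions_for_pandas spans expose begin, end, and covered_text:
# Verify that each reconstructed span covers exactly the text it was built from
for colname in ("subject", "object"):
    for span in someone_said_something_df[colname]:
        assert doc_text[span.begin:span.end] == span.covered_text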
# Code for slides: Run the Watson NLU semantic_roles model
semantic_roles_results = (
    natural_language_understanding
    .analyze(url=doc_url, features=nlu.Features(
        semantic_roles=nlu.SemanticRolesOptions()))
    .get_result()["semantic_roles"])
someone_said_something = [r for r in semantic_roles_results
                          if r["action"]["normalized"] == "say"]
for s in someone_said_something:
    s["subject"]["begin"] = doc_text.find(s["subject"]["text"])
    s["subject"]["end"] = s["subject"]["begin"] + len(s["subject"]["text"])
someone_said_something_json = someone_said_something
someone_said_something_json
[{'subject': {'text': 'Christoph Herman, SVP and Head of SAP HANA Enterprise Cloud Delivery', 'begin': 1213, 'end': 1281}, 'sentence': ' "SAP HANA Enterprise Cloud on IBM Power Systems will help clients unlock the full value of SAP HANA in the cloud, with the possibility of enhancing the scalability and availability of mission critical SAP applications while moving workloads to SAP HANA and lowering TCO," said Christoph Herman, SVP and Head of SAP HANA Enterprise Cloud Delivery.', 'object': {'text': 'SAP HANA Enterprise Cloud on IBM Power Systems will help clients unlock the full value of SAP HANA in the cloud, with the possibility of enhancing the scalability and availability of mission critical SAP applications while moving workloads to SAP HANA and lowering TCO'}, 'action': {'verb': {'text': 'say', 'tense': 'past'}, 'text': 'said', 'normalized': 'say'}}, {'subject': {'text': 'Stephen Leonard, General Manager, IBM Cognitive Systems, "With the addition of IBM Power Systems in SAP HANA Enterprise Cloud, we\'re giving our clients more choices and greater flexibility to run their workloads where they want to across the hybrid cloud and accelerate digital transformation', 'begin': 2227, 'end': 2519}, 'sentence': ' "In June, IBM announced the availability of POWER9 in the IBM Cloud, taking the first step toward our goal of bringing IBM Cognitive Systems technology to our clients, no matter where they are," said Stephen Leonard, General Manager, IBM Cognitive Systems, "With the addition of IBM Power Systems in SAP HANA Enterprise Cloud, we\'re giving our clients more choices and greater flexibility to run their workloads where they want to across the hybrid cloud and accelerate digital transformation."', 'object': {'text': 'In June, IBM announced the availability of POWER9 in the IBM Cloud, taking the first step toward our goal of bringing IBM Cognitive Systems technology to our clients, no matter where they are'}, 'action': {'verb': {'text': 'say', 'tense': 'past'}, 'text': 'said', 'normalized': 'say'}}]
# Code for slides: Run the document text through the Hugging Face NER model
ner = transformers.pipeline("ner")
tagged_tokens = ner(doc_text)
model_results = ner.group_entities(tagged_tokens)
person_mentions = [d for d in model_results if d["entity_group"] == "PER"]
person_mentions
[{'entity_group': 'PER', 'score': 0.9996308088302612, 'word': 'Christoph Herman', 'start': 1213, 'end': 1229}]
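Calling group_entities() on the raw token tags is how older versions of transformers merge subword predictions into entity mentions; newer releases can do the grouping when the pipeline is built. The commented-out sketch below assumes a transformers 4.x-style API and is not run here:
# Alternative for newer transformers versions: request grouping up front
# ner = transformers.pipeline("ner", aggregation_strategy="simple")
# person_mentions = [d for d in ner(doc_text) if d["entity_group"] == "PER"]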
# Convert the Hugging Face model outputs to a DataFrame
person_mentions_df = pd.DataFrame({
    "person": tp.SpanArray(doc_text, [m["start"] for m in person_mentions],
                           [m["end"] for m in person_mentions]),
    "score": [m["score"] for m in person_mentions]
})
person_mentions_df
| | person | score |
|---|---|---|
| 0 | [1213, 1229): 'Christoph Herman' | 0.999631 |
# Write out all the data we've generated
output_dir = "./scipy_demo_data"
###################
# Inputs to Part 1
# Tweak the output of the Hugging Face model to be JSON-serializable
for m in person_mentions:
    m["start"] = int(m["start"])
    m["end"] = int(m["end"])
    m["score"] = float(m["score"])  # score may be a NumPy float, depending on the transformers version
with open(f"{output_dir}/person_mentions.json", "w") as f:
json.dump(person_mentions, f)
with open(f"{output_dir}/person_mentions_watson.json", "w") as f:
json.dump(person_mentions_watson_json, f)
with open(f"{output_dir}/someone_said_something.json", "w") as f:
json.dump(someone_said_something_json, f)
###################
# Inputs to Part 2
person_mentions_df.to_parquet(f"{output_dir}/person_mentions.parquet")
person_mentions_watson.to_parquet(f"{output_dir}/person_mentions_watson.parquet")
someone_said_something_df.to_parquet(f"{output_dir}/someone_said_something.parquet")
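For reference, the companion notebooks read these artifacts back in without any network access. A minimal sketch of what that load step might look like (hypothetical variable names, not the actual code from Parts 1 and 2; it assumes text_extensions_for_pandas is imported so that span columns round-trip through Parquet):
# Reload the precomputed results from disk
with open(f"{output_dir}/person_mentions.json", "r") as f:
    reloaded_person_mentions = json.load(f)
reloaded_person_mentions_df = pd.read_parquet(f"{output_dir}/person_mentions.parquet")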