# Import Python libraries
from typing import *
import os
#import ibm_watson
#import ibm_watson.natural_language_understanding_v1 as nlu
#import ibm_cloud_sdk_core
import pandas as pd
import spacy
import sys
from IPython.core.display import display, HTML
import textwrap
# And of course we need the text_extensions_for_pandas library itself.
_PROJECT_ROOT = "../.."
try:
    import text_extensions_for_pandas as tp
except ModuleNotFoundError as e:
    # If we're running from within the project source tree and the parent Python
    # environment doesn't have the text_extensions_for_pandas package, use the
    # version in the local source tree.
    if not os.getcwd().endswith("market"):
        raise e
    if _PROJECT_ROOT not in sys.path:
        sys.path.insert(0, _PROJECT_ROOT)
    import text_extensions_for_pandas as tp

# Download the SpaCy model if necessary
try:
    spacy.load("en_core_web_trf")
except IOError:
    raise IOError("SpaCy dependency parser not found. Please run "
                  "'python -m spacy download en_core_web_trf', then "
                  "restart JupyterLab.")

if "IBM_API_KEY" not in os.environ:
    raise ValueError("IBM_API_KEY environment variable not set. Please create "
                     "a free instance of IBM Watson Natural Language Understanding "
                     "(see https://www.ibm.com/cloud/watson-natural-language-understanding) "
                     "and set the IBM_API_KEY environment variable to your instance's "
                     "API key value.")
api_key = os.environ.get("IBM_API_KEY")
service_url = os.environ.get("IBM_SERVICE_URL")
# natural_language_understanding = ibm_watson.NaturalLanguageUnderstandingV1(
# version="2021-01-01",
# authenticator=ibm_cloud_sdk_core.authenticators.IAMAuthenticator(api_key)
# )
# natural_language_understanding.set_service_url(service_url)
# Github notebook gists will be this wide: ------------------>
# Screenshots of this notebook should be this wide: ----------------------------->
# Code from the Github gist at https://gist.github.com/frreiss/038ac63ef20eed323a5637f9ddb2de8d
# Be sure to update this cell if the gist changes!
import pandas as pd
import text_extensions_for_pandas as tp
import ibm_watson
import ibm_watson.natural_language_understanding_v1 as nlu
import ibm_cloud_sdk_core
def find_persons_quoted_by_name(doc_url, api_key, service_url) -> pd.DataFrame:
    # Ask Watson Natural Language Understanding to run its "semantic_roles"
    # and "entities" models.
    natural_language_understanding = ibm_watson.NaturalLanguageUnderstandingV1(
        version="2021-01-01",
        authenticator=ibm_cloud_sdk_core.authenticators.IAMAuthenticator(api_key)
    )
    natural_language_understanding.set_service_url(service_url)
    nlu_results = natural_language_understanding.analyze(
        url=doc_url,
        return_analyzed_text=True,
        features=nlu.Features(
            entities=nlu.EntitiesOptions(mentions=True),
            semantic_roles=nlu.SemanticRolesOptions())).get_result()
    # Convert the output of Watson Natural Language Understanding to DataFrames.
    dataframes = tp.io.watson.nlu.parse_response(nlu_results)
    entity_mentions_df = dataframes["entity_mentions"]
    semantic_roles_df = dataframes["semantic_roles"]
    # Extract mentions of person names and company names
    person_mentions_df = entity_mentions_df[entity_mentions_df["type"] == "Person"]
    # Extract instances of subjects that made statements
    quotes_df = semantic_roles_df[semantic_roles_df["action.normalized"] == "say"]
    # Retrieve the full document text from the entity mentions output.
    doc_text = entity_mentions_df["span"].array.document_text
    # Filter down to just the rows and columns we're interested in
    subjects_df = quotes_df[["subject.text"]].copy().reset_index(drop=True)
    # Use String.index() to find where the strings in "subject.text" begin
    subjects_df["begin"] = pd.Series(
        [doc_text.index(s) for s in subjects_df["subject.text"]], dtype=int)
    # Compute end offsets and wrap the <begin, end, text> triples in a SpanArray column
    subjects_df["end"] = subjects_df["begin"] + subjects_df["subject.text"].str.len()
    subjects_df["span"] = tp.SpanArray(doc_text, subjects_df["begin"], subjects_df["end"])
    # Align subjects with person names
    execs_df = tp.spanner.contain_join(subjects_df["span"],
                                       person_mentions_df["span"],
                                       "subject", "person")
    # Add on the document URL.
    execs_df["url"] = doc_url
    return execs_df[["person", "url"]]
In this article, we show how to use Pandas DataFrames to extract useful structure from the parse trees of English-language sentences.
Dependency parsing is a natural language processing technique that identifies the relationships between the words that make up a sentence. We can treat these relationships as the edges of a graph.
For example, here's the graph that a dependency parser produces for the sentence, "I like natural language processing":
# Do not include this cell in the blog post.
# Code to generate the above image
import spacy
spacy_language_model = spacy.load("en_core_web_trf")
token_features = tp.io.spacy.make_tokens_and_features(
    "I like natural language processing.", spacy_language_model)
tp.io.spacy.render_parse_tree(token_features)
This graph is always a tree, so we call it the dependency-based parse tree of the sentence. We often shorten the phrase "dependency-based parse tree" to dependency parse or parse tree.
Every word in the sentence (including the period at the end) becomes a node of the parse tree:
The most important verb in the sentence becomes the root of the tree. We call this root node the head node. In this example, the head node is the verb "like".
Edges in the tree connect pairs of related words:
Each edge is tagged with information about why the words are related. For example, the first two words in the sentence, "I" and "like", have an `nsubj` relationship: the pronoun "I" is the subject of the verb "like".
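If you want to read these relationships programmatically, SpaCy exposes them directly on each token. Here's a minimal sketch; it loads the same en_core_web_trf model used elsewhere in this notebook, and the printout format is just for illustration:
import spacy

nlp = spacy.load("en_core_web_trf")
doc = nlp("I like natural language processing.")
# Each token records its dependency label (dep_) and the head token it attaches to.
for token in doc:
    print(f"{token.text:12} --{token.dep_}--> {token.head.text}")
# "I" attaches to "like" with the label "nsubj"; "like" is the root and is its own head.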
Dependency parsing is useful because it lets you solve problems with very little code. The parser acts as a universal machine learning model, extracting many facts at once from the text. Pattern matching over the parse tree lets you filter this set of facts down to the ones that are relevant to your application.
In a previous article, we showed how to use Watson Natural Language Understanding to find places where a press release quotes an executive by name. In this article, we'll use dependency parsing to associate those names with job titles.
A person's job title is a valuable piece of context. The title can tell you whether the person is an important decision maker. Titles can tell you the relationship between different employees at a company. By looking at how titles change over time, you can reconstruct a person's job history.
# Don't include this cell in the blog
# Code to generate parse tree of entire sentence
# Take a screenshot at 25% to create the png version.
quote_text = '''\
"By combining the power of AI with the flexibility and agility of hybrid cloud, \
our clients are driving innovation and digitizing their operations at a fast \
pace," said Daniel Hernandez, general manager, Data and AI, IBM.'''
tokens = tp.io.spacy.make_tokens_and_features(quote_text, spacy_language_model)
print(f"{len(tokens.index)} tokens")
tp.io.spacy.render_parse_tree(tokens)
45 tokens
Here's an example of how names and job titles can appear in press releases. This example is from an IBM press release from December 2020:
This sentence is 45 words long, so the entire parse tree is a bit daunting...
...but if we zoom in on just the phrase, "Daniel Hernandez, general manager, Data and AI, IBM," some structure becomes clear:
The arrows in the diagram point "downwards", from root to leaf. The entire job title is a child of the name. There's a single edge from the head (highest) node of Daniel Hernandez's name to the head node of his job title.
The edge types in this parse tree come from the Universal Dependencies framework. The edge between the name and job title has the type `appos`, which is short for "appositional modifier", or appositive. An appositive is a noun that describes another noun. In this case, the noun phrase "general manager, Data and AI, IBM" describes the noun phrase "Daniel Hernandez".
The pattern in the picture above occurs whenever a person's job title is an appositive for that person's name. The title will be below the name in the tree, and the head nodes of the name and title will be connected by an `appos` edge. We can use this pattern to find the job title via a three-step process:

1. Find the parse tree nodes that make up the person's name.
2. Find an `appos` edge coming out of any of the parse tree nodes for the name.
3. Find every node that can be reached from the far end of that `appos` edge by following edges away from the root.

Remember that each node represents a word. Once you know all the nodes that make up the job title, you know all the words in the title.
Step 3 here requires a transitive closure operation: start with the node at the far end of the `appos` edge, repeatedly add every node that is reachable from the nodes already in the set, and stop when the set no longer grows. We can implement this algorithm with Pandas DataFrames.
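To make the closure step concrete, here's a minimal sketch of that loop over a made-up set of edges. The node IDs below are hypothetical; in the rest of this article the edges come from the dependency parser:
import pandas as pd

# Toy parse-tree edges: each row says "node `id` hangs off node `head`".
toy_edges = pd.DataFrame({"id":   [1, 2, 3, 4, 5],
                          "head": [0, 1, 1, 3, 9]})
# Start from a single node (say, the target of an appos edge) and keep
# following edges downward until no new nodes appear.
selected = pd.DataFrame({"id": [1]})
previous_size = 0
while len(selected.index) > previous_size:
    previous_size = len(selected.index)
    children = selected.merge(toy_edges, left_on="id", right_on="head",
                              suffixes=["_parent", ""])[["id"]]
    selected = pd.concat([selected, children]).drop_duplicates()
print(selected["id"].sort_values().tolist())  # [1, 2, 3, 4]; node 5 is not reachable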
We're going to use Pandas to match person names with job titles. The first thing we'll need is the locations of the person names. In our previous post, we created a function `find_persons_quoted_by_name()` that finds all the people that a news article quotes by name. If you're curious, you can find the source code here. The function produces a DataFrame with the location of each person name. Here's the output when you run the function over an example press release:
doc_url = "https://newsroom.ibm.com/2020-12-02-IBM-Named-a-Leader-in-the-2020-IDC-MarketScape-For-Worldwide-Advanced-Machine-Learning-Software-Platform"
persons = find_persons_quoted_by_name(doc_url, api_key,
                                      service_url)
persons
| | person | url |
---|---|---|
0 | [1288, 1304): 'Daniel Hernandez' | https://newsroom.ibm.com/2020-12-02-IBM-Named-... |
1 | [1838, 1849): 'Curren Katz' | https://newsroom.ibm.com/2020-12-02-IBM-Named-... |
2 | [2476, 2486): 'Ritu Jyoti' | https://newsroom.ibm.com/2020-12-02-IBM-Named-... |
The second thing we will need is a parse tree. We'll use the dependency parser from the SpaCy NLP library. Our open source library Text Extensions for Pandas can convert the output of this parser into a DataFrame:
import spacy
import text_extensions_for_pandas as tp
# The original document had HTML tags. Get the detagged text.
doc_text = persons["person"].array.document_text
# Run dependency parsing and convert the parse to a DataFrame.
spacy_language_model = spacy.load("en_core_web_trf")
all_token_features = tp.io.spacy.make_tokens_and_features(
    doc_text, spacy_language_model)
# Drop the columns we won't need for this analysis.
tokens = all_token_features[["id", "span", "dep", "head",
                             "sentence"]]
tokens
| | id | span | dep | head | sentence |
---|---|---|---|---|---|
0 | 0 | [0, 6): 'ARMONK' | ROOT | 0 | [0, 42): 'ARMONK, N.Y., Dec. 2, 2020 /PRNewswi... |
1 | 1 | [6, 7): ',' | punct | 0 | [0, 42): 'ARMONK, N.Y., Dec. 2, 2020 /PRNewswi... |
2 | 2 | [8, 12): 'N.Y.' | appos | 0 | [0, 42): 'ARMONK, N.Y., Dec. 2, 2020 /PRNewswi... |
3 | 3 | [12, 13): ',' | punct | 0 | [0, 42): 'ARMONK, N.Y., Dec. 2, 2020 /PRNewswi... |
4 | 4 | [14, 18): 'Dec.' | npadvmod | 0 | [0, 42): 'ARMONK, N.Y., Dec. 2, 2020 /PRNewswi... |
... | ... | ... | ... | ... | ... |
761 | 761 | [4248, 4266): 'tballen@us.ibm.com' | appos | 751 | [4196, 4278): 'Media Contact: Tyler Allen IBM ... |
762 | 762 | [4266, 4267): '' | punct | 751 | [4196, 4278): 'Media Contact: Tyler Allen IBM ... |
763 | 763 | [4267, 4273): 'SOURCE' | appos | 751 | [4196, 4278): 'Media Contact: Tyler Allen IBM ... |
764 | 764 | [4274, 4277): 'IBM' | appos | 763 | [4196, 4278): 'Media Contact: Tyler Allen IBM ... |
765 | 765 | [4277, 4278): '' | punct | 751 | [4196, 4278): 'Media Contact: Tyler Allen IBM ... |
766 rows × 5 columns
This `tokens` DataFrame contains one row for every token in the document. The term "token" here refers to a part of the document that is a word, an abbreviation, or a piece of punctuation. The columns "id", "dep" and "head" encode the edges of the parse tree.
Since we're going to be analyzing the parse tree, it's more convenient to have the nodes and edges in separate DataFrames. So let's split `tokens` into DataFrames of nodes and edges:
nodes = tokens[["id", "span"]].reset_index(drop=True)
edges = tokens[["id", "head", "dep"]].reset_index(drop=True)
nodes
| | id | span |
---|---|---|
0 | 0 | [0, 6): 'ARMONK' |
1 | 1 | [6, 7): ',' |
2 | 2 | [8, 12): 'N.Y.' |
3 | 3 | [12, 13): ',' |
4 | 4 | [14, 18): 'Dec.' |
... | ... | ... |
761 | 761 | [4248, 4266): 'tballen@us.ibm.com' |
762 | 762 | [4266, 4267): '' |
763 | 763 | [4267, 4273): 'SOURCE' |
764 | 764 | [4274, 4277): 'IBM' |
765 | 765 | [4277, 4278): '' |
766 rows × 2 columns
edges
| | id | head | dep |
---|---|---|---|
0 | 0 | 0 | ROOT |
1 | 1 | 0 | punct |
2 | 2 | 0 | appos |
3 | 3 | 0 | punct |
4 | 4 | 0 | npadvmod |
... | ... | ... | ... |
761 | 761 | 751 | appos |
762 | 762 | 751 | punct |
763 | 763 | 751 | appos |
764 | 764 | 763 | appos |
765 | 765 | 751 | punct |
766 rows × 3 columns
We will start with the nodes that are part of person names. To find these nodes, we need to match the person names in `persons` with tokens in `nodes`.
The "person" column of `persons` and the "span" column of `nodes` both hold span data. Spans are a common concept in natural language processing. A span represents a region of the document, usually as begin and end offsets plus a reference to the document's text. The span data in these two DataFrames is stored using the `SpanDtype` extension type from Text Extensions for Pandas.
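If you haven't used `SpanDtype` before, here's a minimal sketch of building a span column by hand. The string and offsets below are made up for illustration; in this article, the spans come from Watson Natural Language Understanding and SpaCy:
import pandas as pd
import text_extensions_for_pandas as tp

text = "Daniel Hernandez, general manager, Data and AI, IBM"
# A SpanArray stores begin/end offsets plus a reference to the target text.
spans = tp.SpanArray(text, [0, 18], [16, 33])
df = pd.DataFrame({"span": spans})
print(df["span"])                      # [0, 16): 'Daniel Hernandez', [18, 33): 'general manager'
print(df["span"].array.document_text)  # the original string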
Text Extensions for Pandas also includes functions for manipulating span data. We can use one of these functions, `overlap_join()`, to find all the places where a token from `nodes` overlaps with a person name from `persons`:
person_nodes = (
    tp.spanner.overlap_join(persons["person"], nodes["span"],
                            "person", "span")
    .merge(nodes)
)
person_nodes
| | person | span | id |
---|---|---|---|
0 | [1288, 1304): 'Daniel Hernandez' | [1288, 1294): 'Daniel' | 233 |
1 | [1288, 1304): 'Daniel Hernandez' | [1295, 1304): 'Hernandez' | 234 |
2 | [1838, 1849): 'Curren Katz' | [1838, 1844): 'Curren' | 335 |
3 | [1838, 1849): 'Curren Katz' | [1845, 1849): 'Katz' | 336 |
4 | [2476, 2486): 'Ritu Jyoti' | [2476, 2480): 'Ritu' | 462 |
5 | [2476, 2486): 'Ritu Jyoti' | [2481, 2486): 'Jyoti' | 463 |
This set of nodes defines a starting point for navigating the parse tree. Now we need to look for nodes that are on the other side of an `appos` link. Since the nodes and edges of our graph are Pandas DataFrames, we can use the Pandas `merge()` method to match edges with nodes and walk the graph. Here's a function that finds all the nodes that are one edge away from the nodes in its argument `start_nodes`:
def traverse_edges_once(start_nodes: pd.DataFrame,
                        edges: pd.DataFrame,
                        metadata_cols = ["person"]) -> pd.DataFrame:
    return (
        start_nodes[["person", "id"]]  # Propagate original "person" span
        .merge(edges, left_on="id", right_on="head",
               suffixes=["_head", ""])[["person", "id"]]
        .merge(nodes)
    )
Now we can find all the nodes that are reachable by traversing an `appos` link downward from part of a person name:
appos_targets = \
    traverse_edges_once(person_nodes,
                        edges[edges["dep"] == "appos"])
appos_targets
| | person | id | span |
---|---|---|---|
0 | [1288, 1304): 'Daniel Hernandez' | 237 | [1314, 1321): 'manager' |
1 | [1838, 1849): 'Curren Katz' | 338 | [1851, 1859): 'Director' |
2 | [2476, 2486): 'Ritu Jyoti' | 467 | [2501, 2510): 'president' |
Each element of the "span" column of `appos_targets` holds the head node of a person's title. To find the remaining nodes of the titles, we'll do the transitive closure operation we described earlier. We use a Pandas DataFrame to store our set of selected nodes and the `traverse_edges_once()` function to perform each step of walking the tree. Then we use `pd.concat()` and `DataFrame.drop_duplicates()` to add the new nodes to our selected set of nodes. The entire algorithm looks like this:
# Start with the root nodes of the titles.
selected_nodes = appos_targets.copy()
# Transitive closure.
# Keep going as long as the previous round enlarged our set.
previous_num_nodes = 0
while len(selected_nodes.index) > previous_num_nodes:
    # Find all the nodes that are directly reachable from
    # the selected set.
    addl_nodes = traverse_edges_once(selected_nodes, edges)
    # Merge the new nodes into the selected set.
    previous_num_nodes = len(selected_nodes.index)
    selected_nodes = (pd.concat([selected_nodes, addl_nodes])
                      .drop_duplicates())
selected_nodes
| | person | id | span |
---|---|---|---|
0 | [1288, 1304): 'Daniel Hernandez' | 237 | [1314, 1321): 'manager' |
1 | [1838, 1849): 'Curren Katz' | 338 | [1851, 1859): 'Director' |
2 | [2476, 2486): 'Ritu Jyoti' | 467 | [2501, 2510): 'president' |
0 | [1288, 1304): 'Daniel Hernandez' | 236 | [1306, 1313): 'general' |
1 | [1288, 1304): 'Daniel Hernandez' | 238 | [1321, 1322): ',' |
2 | [1288, 1304): 'Daniel Hernandez' | 239 | [1323, 1327): 'Data' |
3 | [1288, 1304): 'Daniel Hernandez' | 242 | [1334, 1335): ',' |
4 | [1288, 1304): 'Daniel Hernandez' | 243 | [1336, 1339): 'IBM' |
5 | [1838, 1849): 'Curren Katz' | 339 | [1860, 1862): 'of' |
6 | [1838, 1849): 'Curren Katz' | 343 | [1879, 1880): ',' |
7 | [1838, 1849): 'Curren Katz' | 345 | [1890, 1896): 'Health' |
8 | [2476, 2486): 'Ritu Jyoti' | 465 | [2488, 2495): 'program' |
9 | [2476, 2486): 'Ritu Jyoti' | 466 | [2496, 2500): 'vice' |
10 | [2476, 2486): 'Ritu Jyoti' | 468 | [2510, 2511): ',' |
11 | [2476, 2486): 'Ritu Jyoti' | 470 | [2515, 2523): 'research' |
12 | [2476, 2486): 'Ritu Jyoti' | 471 | [2524, 2528): 'with' |
13 | [1288, 1304): 'Daniel Hernandez' | 240 | [1328, 1331): 'and' |
14 | [1288, 1304): 'Daniel Hernandez' | 241 | [1332, 1334): 'AI' |
15 | [1838, 1849): 'Curren Katz' | 342 | [1876, 1879): 'R&D' |
16 | [1838, 1849): 'Curren Katz' | 344 | [1881, 1889): 'Highmark' |
17 | [2476, 2486): 'Ritu Jyoti' | 469 | [2512, 2514): 'AI' |
18 | [2476, 2486): 'Ritu Jyoti' | 479 | [2573, 2581): 'practice' |
19 | [1838, 1849): 'Curren Katz' | 341 | [1868, 1875): 'Science' |
20 | [2476, 2486): 'Ritu Jyoti' | 472 | [2529, 2532): 'IDC' |
21 | [2476, 2486): 'Ritu Jyoti' | 476 | [2551, 2559): 'research' |
22 | [1838, 1849): 'Curren Katz' | 340 | [1863, 1867): 'Data' |
23 | [2476, 2486): 'Ritu Jyoti' | 473 | [2532, 2534): ''s' |
24 | [2476, 2486): 'Ritu Jyoti' | 474 | [2535, 2543): 'software' |
25 | [2476, 2486): 'Ritu Jyoti' | 475 | [2544, 2550): 'market' |
26 | [2476, 2486): 'Ritu Jyoti' | 477 | [2560, 2563): 'and' |
27 | [2476, 2486): 'Ritu Jyoti' | 478 | [2564, 2572): 'advisory' |
Now we know the spans of all the words that make up each job title. The "addition" operation for spans is defined as:
span1 + span2 = smallest span that contains both span1 and span2
For example, adding the spans [1306, 1313): 'general' and [1336, 1339): 'IBM' produces [1306, 1339): 'general manager, Data and AI, IBM'. We can recover the span of the entire title by "adding" spans with Pandas' `groupby()` method:
# Aggregate the nodes of each title to find the span of the
# entire title.
titles = (
    selected_nodes
    .groupby("person")
    .aggregate({"span": "sum"})
    .reset_index()
    .rename(columns={"span": "title"})
)
titles
| | person | title |
---|---|---|
0 | [1288, 1304): 'Daniel Hernandez' | [1306, 1339): 'general manager, Data and AI, IBM' |
1 | [1838, 1849): 'Curren Katz' | [1851, 1896): 'Director of Data Science R&D, H... |
2 | [2476, 2486): 'Ritu Jyoti' | [2488, 2581): 'program vice president, AI rese... |
Now we have found a job title for each of the executive names in this document!
Let's put all of the code we've presented so far into a single function.
# Keep the contents of this cell synchronized with the gist at
# https://gist.github.com/frreiss/a731438dda4ac948beca85d3fe167ff3
import pandas as pd
import text_extensions_for_pandas as tp
def find_titles_of_persons(persons: pd.DataFrame,
                           spacy_language_model) -> pd.DataFrame:
    """
    :param persons: DataFrame containing information about person names.
    :param spacy_language_model: Loaded SpaCy language model with dependency
     parsing support.
    :returns: A DataFrame with a row for every title identified and two columns,
     "person" and "title".
    """
    def traverse_edges_once(start_nodes: pd.DataFrame, edges: pd.DataFrame,
                            metadata_cols = ["person"]) -> pd.DataFrame:
        return (
            start_nodes[["person", "id"]]  # Propagate original "person" span
            .merge(edges, left_on="id", right_on="head",
                   suffixes=["_head", ""])[["person", "id"]]
            .merge(nodes)
        )
    if len(persons.index) == 0:
        # Special case: Empty input --> empty output
        return pd.DataFrame({
            "person": pd.Series([], dtype=tp.SpanDtype()),
            "title": pd.Series([], dtype=tp.SpanDtype()),
        })
    # Retrieve the document text from the person spans.
    doc_text = persons["person"].array.document_text
    # Run dependency parsing on the text and convert the parse to a DataFrame.
    all_token_features = tp.io.spacy.make_tokens_and_features(doc_text, spacy_language_model)
    # Drop the columns we won't need for this analysis.
    tokens = all_token_features[["id", "span", "tag", "dep", "head", "sentence"]]
    # Split the parse tree into nodes and edges and filter the edges.
    nodes = tokens[["id", "span", "tag"]].reset_index(drop=True)
    edges = tokens[["id", "head", "dep"]].reset_index(drop=True)
    # Start with the nodes that are inside person names.
    person_nodes = (
        tp.spanner.overlap_join(persons["person"], nodes["span"],
                                "person", "span")
        .merge(nodes)
    )
    # Step 1: Follow `appos` edges from the person names
    appos_targets = traverse_edges_once(person_nodes,
                                        edges[edges["dep"] == "appos"])
    # Step 2: Transitive closure to find all tokens in the titles
    selected_nodes = appos_targets.copy()
    previous_num_nodes = 0
    while len(selected_nodes.index) > previous_num_nodes:
        # Find all the nodes that are directly reachable from our selected set.
        addl_nodes = traverse_edges_once(selected_nodes, edges)
        # Merge the new nodes into the selected set
        previous_num_nodes = len(selected_nodes.index)
        selected_nodes = (pd.concat([selected_nodes, addl_nodes])
                          .drop_duplicates())
    # Aggregate the nodes of each title to find the span of the entire title.
    titles = (
        selected_nodes
        .groupby("person")
        .aggregate({"span": "sum"})
        .reset_index()
        .rename(columns={"span": "title"})
    )
    # As of Pandas 1.2.1, groupby() over extension types downgrades them to object
    # dtype. Cast back up to the extension type.
    titles["person"] = titles["person"].astype(tp.SpanDtype())
    return titles
If we combine this `find_titles_of_persons()` function with the `find_persons_quoted_by_name()` function we created in our previous post, we can build a data mining pipeline. This pipeline finds the names and titles of executives in corporate press releases. Here's the output that we get if we pass a year's worth of IBM press releases through this pipeline:
# Don't include this cell in the blog post.
# Load press release URLs from a file
with open("ibm_press_releases.txt", "r") as f:
    lines = [l.strip() for l in f.readlines()]
    ibm_press_release_urls = [l for l in lines if len(l) > 0 and l[0] != "#"]
to_concat = []
for url in ibm_press_release_urls:
    persons = find_persons_quoted_by_name(url, api_key,
                                          service_url)
    titles = find_titles_of_persons(persons,
                                    spacy_language_model)
    titles["url"] = url
    to_concat.append(titles)
all_titles = pd.concat(to_concat).reset_index(drop=True)
all_titles
| | person | title | url |
---|---|---|---|
0 | [1977, 1991): 'Wendi Whitmore' | [1993, 2040): 'Vice President, IBM X-Force Thr... | https://newsroom.ibm.com/2020-02-11-IBM-X-Forc... |
1 | [1281, 1292): 'Rob DiCicco' | [1294, 1348): 'PharmD, Deputy Chief Health Off... | https://newsroom.ibm.com/2020-02-18-IBM-Study-... |
2 | [1213, 1229): 'Christoph Herman' | [1231, 1281): 'SVP and Head of SAP HANA Enterp... | https://newsroom.ibm.com/2020-02-19-IBM-Power-... |
3 | [2227, 2242): 'Stephen Leonard' | [2244, 2282): 'General Manager, IBM Cognitive ... | https://newsroom.ibm.com/2020-02-19-IBM-Power-... |
4 | [2289, 2297): 'Bob Lord' | [2299, 2375): 'IBM Senior Vice President of Co... | https://newsroom.ibm.com/2020-02-26-2020-Call-... |
... | ... | ... | ... |
254 | [3114, 3124): 'Mike Doran' | [3126, 3157): 'Worldwide Sales Director at IBM' | https://newsroom.ibm.com/2021-01-25-OVHcloud-t... |
255 | [3155, 3169): 'Howard Boville' | [3171, 3210): 'Senior Vice President, IBM Hybr... | https://newsroom.ibm.com/2021-01-26-Luminor-Ba... |
256 | [3114, 3126): 'Samuel Brack' | [3127, 3152): 'Co-Founder and CTO at DIA' | https://newsroom.ibm.com/2021-01-26-DIA-Levera... |
257 | [3509, 3523): 'Hillery Hunter' | [3525, 3556): 'IBM Fellow, VP & CTO, IBM Cloud' | https://newsroom.ibm.com/2021-01-26-DIA-Levera... |
258 | [1487, 1497): 'Ana Zamper' | [1499, 1534): 'Ecosystem Leader, IBM Latin Ame... | https://newsroom.ibm.com/2021-01-26-Latin-Amer... |
259 rows × 3 columns
Our pipeline has processed 191 press releases, and it found the names and titles of 259 executives!
To find out more about the extensions to Pandas that made this possible, check out Text Extensions for Pandas here.
# Don't include this cell in the blog.
# Check the last 50 rows
all_titles[-50:]
| | person | title | url |
---|---|---|---|
209 | [1449, 1466): 'Justin Youngblood' | [1468, 1495): 'Vice President IBM Security' | https://newsroom.ibm.com/2020-12-01-IBM-Named-... |
210 | [1133, 1149): 'Daniel Hernandez' | [1151, 1184): 'General Manager, Data and AI, IBM' | https://newsroom.ibm.com/2020-12-02-IBM-Positi... |
211 | [2035, 2048): 'Vitaly Tsivin' | [2050, 2086): 'Executive Vice President of Bus... | https://newsroom.ibm.com/2020-12-02-IBM-Positi... |
212 | [1288, 1304): 'Daniel Hernandez' | [1306, 1339): 'general manager, Data and AI, IBM' | https://newsroom.ibm.com/2020-12-02-IBM-Named-... |
213 | [1838, 1849): 'Curren Katz' | [1851, 1896): 'Director of Data Science R&D, H... | https://newsroom.ibm.com/2020-12-02-IBM-Named-... |
214 | [2476, 2486): 'Ritu Jyoti' | [2488, 2581): 'program vice president, AI rese... | https://newsroom.ibm.com/2020-12-02-IBM-Named-... |
215 | [813, 825): 'Daniel Stumm' | [827, 861): 'ABB's Head of Indirect Procurement' | https://newsroom.ibm.com/2020-12-03-IBM-Helps-... |
216 | [2802, 2816): 'Neil McCormack' | [2818, 2866): 'managing partner - Geo Leader, ... | https://newsroom.ibm.com/2020-12-03-IBM-Helps-... |
217 | [3453, 3465): 'Luigi Menzio' | [3467, 3505): 'Services Executive Partner, IBM... | https://newsroom.ibm.com/2020-12-03-Piaggio-Gr... |
218 | [2164, 2180): 'Daniel Hernandez' | [2182, 2217): 'General Manager of Data and AI,... | https://newsroom.ibm.com/2020-12-09-IBM-Launch... |
219 | [2933, 2945): 'André Tamers' | [2947, 2976): 'owner of De Maison Selections' | https://newsroom.ibm.com/2020-12-10-eProvenanc... |
220 | [3508, 3526): 'Robin Grumman-Vogt' | [3528, 3546): 'CEO of eProvenance' | https://newsroom.ibm.com/2020-12-10-eProvenanc... |
221 | [1219, 1231): 'Aarti Borkar' | [1233, 1261): 'Vice President, IBM Security' | https://newsroom.ibm.com/2020-12-10-IBM-Collab... |
222 | [1410, 1422): 'Nick Kolesch' | [1424, 1482): 'Vice President for Projects, Al... | https://newsroom.ibm.com/2020-12-15-IBM-and-Th... |
223 | [1999, 2012): 'Manish Chawla' | [2013, 2092): 'Global Industry Managing Direct... | https://newsroom.ibm.com/2020-12-15-IBM-and-Th... |
224 | [793, 807): 'Michael Jacobs' | [809, 845): 'IBM Offering Manager, Sustainabil... | https://newsroom.ibm.com/2020-12-15-IBM-Launch... |
225 | [2738, 2753): 'Matt Larsen-Daw' | [2755, 2800): 'Education Manager, World Wide F... | https://newsroom.ibm.com/2020-12-15-IBM-Launch... |
226 | [3452, 3464): 'Tom Ackerman' | [3466, 3521): 'Vice President for Education, C... | https://newsroom.ibm.com/2020-12-15-IBM-Launch... |
227 | [441, 452): 'Mark Foster' | [454, 489): 'Senior Vice President, IBM Services' | https://newsroom.ibm.com/2020-12-15-IBM-Acquir... |
228 | [3126, 3141): 'Jacques Leblanc' | [3143, 3170): 'CEO and founder of Expertus' | https://newsroom.ibm.com/2020-12-15-IBM-Acquir... |
229 | [1396, 1411): 'Sridhar Muppidi' | [1413, 1451): 'Chief Technology Officer, IBM S... | https://newsroom.ibm.com/2020-12-17-IBM-Helps-... |
230 | [1323, 1332): 'Paul Roma' | [1334, 1368): 'General Manager, IBM Watson Hea... | https://newsroom.ibm.com/2020-12-18-IBM-and-Sa... |
231 | [1790, 1804): 'Bill Patterson' | [1806, 1848): 'EVP and GM, CRM Applications at... | https://newsroom.ibm.com/2020-12-18-IBM-and-Sa... |
232 | [1119, 1131): 'John Granger' | [1133, 1242): 'Senior Vice President, Cloud Ap... | https://newsroom.ibm.com/2020-12-21-IBM-to-Acq... |
233 | [1738, 1754): 'Fernando Herrera' | [1756, 1787): 'Chairman and Founder, Nordcloud' | https://newsroom.ibm.com/2020-12-21-IBM-to-Acq... |
234 | [1543, 1557): 'Jay Bellissimo' | [1559, 1612): 'IBM's General Manager, U.S. Pub... | https://newsroom.ibm.com/2020-12-21-IBM-Select... |
235 | [1022, 1036): 'Jay Bellissimo' | [1038, 1098): 'IBM's General Manager, U.S. Pub... | https://newsroom.ibm.com/2020-12-22-USDA-Taps-... |
236 | [1775, 1794): 'Archana Vemulapalli' | [1796, 1860): 'General Manager, IBM Infrastruc... | https://newsroom.ibm.com/2021-01-04-IBM-Study-... |
237 | [864, 885): 'Dr. Corey S. Bradford' | [887, 934): 'Sr., president of Harris-Stowe St... | https://newsroom.ibm.com/2021-01-05-IBM-Provid... |
238 | [1681, 1704): 'Valinda Scarbro Kennedy' | [1706, 1755): 'HBCU Program Lead, IBM Global U... | https://newsroom.ibm.com/2021-01-05-IBM-Provid... |
239 | [2157, 2172): 'Bashir Bseirani' | [2174, 2188): 'CEO at Avertra' | https://newsroom.ibm.com/2021-01-06-IBM-and-Av... |
240 | [1574, 1594): 'Oran Vongsuraphichet' | [1596, 1662): 'Chief Executive Officer of Thai... | https://newsroom.ibm.com/2021-01-06-Thai-Re-la... |
241 | [2153, 2170): 'Patama Chantaruck' | [2172, 2221): 'VP for Indochina Expansion and ... | https://newsroom.ibm.com/2021-01-06-Thai-Re-la... |
242 | [952, 966): 'Arvind Krishna' | [968, 991): 'Chairman and CEO of IBM' | https://newsroom.ibm.com/2021-01-07-IBM-Appoin... |
243 | [596, 607): 'Mark Foster' | [609, 644): 'Senior Vice President, IBM Services' | https://newsroom.ibm.com/2021-01-11-IBM-Acquir... |
244 | [1912, 1924): 'Tyler Prince' | [1926, 1996): 'Executive Vice President, World... | https://newsroom.ibm.com/2021-01-11-IBM-Acquir... |
245 | [3546, 3560): 'Paul Stillmank' | [3562, 3568): 'CEO of' | https://newsroom.ibm.com/2021-01-11-IBM-Acquir... |
246 | [647, 656): 'Darío Gil' | [658, 708): 'Senior Vice President and Directo... | https://newsroom.ibm.com/2021-01-12-IBM-Tops-U... |
247 | [3178, 3193): 'Alistair Rennie' | [3195, 3228): 'General Manager of IBM Blockchain' | https://newsroom.ibm.com/2021-01-13-Covalent-T... |
248 | [497, 509): 'John Granger' | [511, 620): 'Senior Vice President, Cloud Appl... | https://newsroom.ibm.com/2021-01-14-IBM-Boosts... |
249 | [2375, 2386): 'Hamilton Yu' | [2388, 2399): 'CEO of Taos' | https://newsroom.ibm.com/2021-01-14-IBM-Boosts... |
250 | [2164, 2180): 'Nourdine Bihmane' | [2182, 2255): 'Head of Decarbonization Busines... | https://newsroom.ibm.com/2021-01-19-Atos-and-I... |
251 | [2723, 2731): 'Bob Lord' | [2733, 2779): 'SVP Cognitive Applications and ... | https://newsroom.ibm.com/2021-01-19-Atos-and-I... |
252 | [315, 329): 'Arvind Krishna' | [331, 371): 'IBM chairman and chief executive ... | https://newsroom.ibm.com/2021-01-21-IBM-Report... |
253 | [2116, 2131): 'James Kavanaugh' | [2133, 2186): 'IBM senior vice president and c... | https://newsroom.ibm.com/2021-01-21-IBM-Report... |
254 | [3114, 3124): 'Mike Doran' | [3126, 3157): 'Worldwide Sales Director at IBM' | https://newsroom.ibm.com/2021-01-25-OVHcloud-t... |
255 | [3155, 3169): 'Howard Boville' | [3171, 3210): 'Senior Vice President, IBM Hybr... | https://newsroom.ibm.com/2021-01-26-Luminor-Ba... |
256 | [3114, 3126): 'Samuel Brack' | [3127, 3152): 'Co-Founder and CTO at DIA' | https://newsroom.ibm.com/2021-01-26-DIA-Levera... |
257 | [3509, 3523): 'Hillery Hunter' | [3525, 3556): 'IBM Fellow, VP & CTO, IBM Cloud' | https://newsroom.ibm.com/2021-01-26-DIA-Levera... |
258 | [1487, 1497): 'Ana Zamper' | [1499, 1534): 'Ecosystem Leader, IBM Latin Ame... | https://newsroom.ibm.com/2021-01-26-Latin-Amer... |