In this notebook, we will guide you to the process of get data from Wikidata SparQL service and store it in a ChunkMapper, so that you can get that information at scale inside Finance NLP.
! pip install -q johnsnowlabs
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 74.8/74.8 KB 7.4 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 570.6/570.6 KB 48.7 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 212.4/212.4 MB 6.0 MB/s eta 0:00:00 Preparing metadata (setup.py) ... done ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 453.8/453.8 KB 20.3 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 95.4/95.4 KB 11.3 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 198.6/198.6 KB 21.4 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 66.9/66.9 KB 7.1 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 82.3/82.3 KB 9.0 MB/s eta 0:00:00 Preparing metadata (setup.py) ... done ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 29.9 MB/s eta 0:00:00 Building wheel for pyspark (setup.py) ... done Building wheel for databricks-cli (setup.py) ... done
Using my.johnsnowlabs.com SSO
from johnsnowlabs import nlp, legal
# nlp.install(force_browser=True)
If you are not registered in my.johnsnowlabs.com, you received a license via e-email or you are using Safari, you may need to do a manual update of the license.
from google.colab import files
print('Please Upload your John Snow Labs License using the button below')
license_keys = files.upload()
Please Upload your John Snow Labs License using the button below
Saving 4.2.8.json to 4.2.8.json
nlp.install()
👌 Detected license file /content/4.2.8.json 📋 Stored John Snow Labs License in /root/.johnsnowlabs/licenses/license_number_0_for_Spark-Healthcare_Spark-OCR.json 👷 Setting up John Snow Labs home in /root/.johnsnowlabs, this might take a few minutes. Downloading 🐍+🚀 Python Library spark_nlp-4.2.8-py2.py3-none-any.whl Downloading 🐍+💊 Python Library spark_nlp_jsl-4.2.8-py3-none-any.whl Downloading 🫘+🚀 Java Library spark-nlp-assembly-4.2.8.jar Downloading 🫘+💊 Java Library spark-nlp-jsl-4.2.8.jar 🙆 JSL Home setup in /root/.johnsnowlabs 👌 Detected license file /content/4.2.8.json Installing /root/.johnsnowlabs/py_installs/spark_nlp_jsl-4.2.8-py3-none-any.whl to /usr/bin/python3 Installed 1 products: 💊 Spark-Healthcare==4.2.8 installed! ✅ Heal the planet with NLP!
spark = nlp.start()
👌 Detected license file /content/4.2.8.json
👌 Launched cpu optimized session with with: 🚀Spark-NLP==4.2.8, 💊Spark-Healthcare==4.2.8, running on ⚡ PySpark==3.1.2
You probably know Wikipedia is built on top of one of the biggest ontologies, Wikidata.
Wikidata is based on triplets (subject, relation/verb, object), and it can be queried using SparQL (a triplets - RDF - OWL query language).
Don't confuse SparQL (the Query Language for ontologies) with Spark NLP.
The aim of this notebook is not to teach you SparQL (to do that, you can use resources as W3.org tutorial), but how to integrate the data you get with your SparQL queries on Wikidata into Finance NLP.
📚For this example, we are going to ask in Wikidata for:
(wd:Q783794
)...?relationship = wdt:P355 || ?relationship = wdt:P749 || ?relationship = wdt:P108
)...[pq:P249 ?ticker; ps:P414 ?exchange ]
)...SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
)FILTER (lang(?itemLabel) = 'en') FILTER (lang(?relatedCompanyLabel) = 'en')
)LIMIT 100
)SELECT DISTINCT ?id ?itemLabel ?exchangeLabel ?tickerLabel ?relationship ?relatedCompany ?relatedCompanyLabel
)📚This is the resulting query:
SELECT DISTINCT ?id ?itemLabel ?exchangeLabel ?tickerLabel ?relationship ?relatedCompany ?relatedCompanyLabel
WHERE {
?id wdt:P31/wdt:P279* wd:Q783794 ; p:P414 [pq:P249 ?ticker; ps:P414 ?exchange ] .
?id ?relationship ?relatedCompany .
?relatedCompany rdfs:label ?relatedCompanyLabel .
?id rdfs:label ?itemLabel .
FILTER ((?relationship = wdt:P355 || ?relationship = wdt:P749 || ?relationship = wdt:P108))
FILTER (lang(?itemLabel) = 'en')
FILTER (lang(?relatedCompanyLabel) = 'en')
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
LIMIT 100
If you run this query in the Wikidata SparQL service, you can visualize the results in many different ways, including tables, graphs, charts...
There are two ways you can access the results:
Download
Code
.It even automatically puts the query you are inspecting in the code, so that you only copy/paste the code and retrieve the results.
!pip install sparqlwrapper
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Collecting sparqlwrapper Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB) Collecting rdflib>=6.1.1 Downloading rdflib-6.2.0-py3-none-any.whl (500 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 500.3/500.3 KB 23.7 MB/s eta 0:00:00 Requirement already satisfied: setuptools in /usr/local/lib/python3.8/dist-packages (from rdflib>=6.1.1->sparqlwrapper) (57.4.0) Collecting isodate Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 41.7/41.7 KB 3.0 MB/s eta 0:00:00 Requirement already satisfied: pyparsing in /usr/local/lib/python3.8/dist-packages (from rdflib>=6.1.1->sparqlwrapper) (3.0.9) Requirement already satisfied: six in /usr/local/lib/python3.8/dist-packages (from isodate->rdflib>=6.1.1->sparqlwrapper) (1.15.0) Installing collected packages: isodate, rdflib, sparqlwrapper Successfully installed isodate-0.6.1 rdflib-6.2.0 sparqlwrapper-2.0.0
import sys
from SPARQLWrapper import SPARQLWrapper, JSON
import json
endpoint_url = "https://query.wikidata.org/sparql"
query = """SELECT DISTINCT ?id ?itemLabel ?exchangeLabel ?tickerLabel ?relationship ?relatedCompany ?relatedCompanyLabel
WHERE {
?id wdt:P31/wdt:P279* wd:Q783794 ; p:P414 [pq:P249 ?ticker; ps:P414 ?exchange ] .
?id ?relationship ?relatedCompany .
?relatedCompany rdfs:label ?relatedCompanyLabel .
?id rdfs:label ?itemLabel .
FILTER ((?relationship = wdt:P355 || ?relationship = wdt:P749 || ?relationship = wdt:P108))
FILTER (lang(?itemLabel) = 'en')
FILTER (lang(?relatedCompanyLabel) = 'en')
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
LIMIT 100"""
def get_results(endpoint_url, query):
user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
# TODO adjust user agent; see https://w.wiki/CX6
sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
return sparql.query().convert()
results = get_results(endpoint_url, query)
for result in results["results"]["bindings"]:
print(result)
{'id': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q1144929'}, 'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Lafarge'}, 'relationship': {'type': 'uri', 'value': 'http://www.wikidata.org/prop/direct/P355'}, 'relatedCompany': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q16995829'}, 'relatedCompanyLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Lafarge Tarmac'}, 'exchangeLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Euronext'}, 'tickerLabel': {'type': 'literal', 'value': 'LG'}} {'id': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q790686'}, 'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Avex Group'}, 'relationship': {'type': 'uri', 'value': 'http://www.wikidata.org/prop/direct/P355'}, 'relatedCompany': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q3266796'}, 'relatedCompanyLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'cutting edge'}, 'exchangeLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Tokyo Stock Exchange'}, 'tickerLabel': {'type': 'literal', 'value': '7860'}} {'id': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q1344736'}, 'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Entertainment One'}, 'relationship': {'type': 'uri', 'value': 'http://www.wikidata.org/prop/direct/P355'}, 'relatedCompany': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q2964928'}, 'relatedCompanyLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Christal Films'}, 'exchangeLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'London Stock Exchange'}, 'tickerLabel': {'type': 'literal', 'value': 'ETO'}} {'id': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q219203'}, 'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'NEC'}, 'relationship': {'type': 'uri', 'value': 'http://www.wikidata.org/prop/direct/P749'}, 'relatedCompany': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q717318'}, 'relatedCompanyLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Sumitomo Group'}, 'exchangeLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Tokyo Stock Exchange'}, 'tickerLabel': {'type': 'literal', 'value': '6701'}} {'id': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q207784'}, 'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Square Enix'}, 'relationship': {'type': 'uri', 'value': 'http://www.wikidata.org/prop/direct/P355'}, 'relatedCompany': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q105730233'}, 'relatedCompanyLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'SQEX Novel'}, 'exchangeLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'OTC Link ATS - OTC Markets'}, 'tickerLabel': {'type': 'literal', 'value': 'SQNXF'}} {'id': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q170614'}, 'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Ryanair'}, 'relationship': {'type': 'uri', 'value': 'http://www.wikidata.org/prop/direct/P355'}, 'relatedCompany': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q55165235'}, 'relatedCompanyLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Buzz'}, 'exchangeLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'NASDAQ - All Markets'}, 'tickerLabel': {'type': 'literal', 'value': 'RYAAY'}} {'id': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q170614'}, 'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Ryanair'}, 'relationship': {'type': 'uri', 'value': 'http://www.wikidata.org/prop/direct/P355'}, 'relatedCompany': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q472129'}, 'relatedCompanyLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Laudamotion GmbH'}, 'exchangeLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Irish Stock Exchange - All Market'}, 'tickerLabel': {'type': 'literal', 'value': 'RYA'}} {'id': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q217493'}, 'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Paradox Interactive'}, 'relationship': {'type': 'uri', 'value': 'http://www.wikidata.org/prop/direct/P355'}, 'relatedCompany': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q1433597'}, 'relatedCompanyLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'White Wolf Publishing'}, 'exchangeLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'OTC Link ATS - OTC Markets'}, 'tickerLabel': {'type': 'literal', 'value': 'PRXXF'}} {'id': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q159433'}, 'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': '3M'}, 'relationship': {'type': 'uri', 'value': 'http://www.wikidata.org/prop/direct/P355'}, 'relatedCompany': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q30289694'}, 'relatedCompanyLabel': {'xml:lang': 'en', 'type': 'literal', 'value': '3M (United Kingdom)'}, 'exchangeLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'New York Stock Exchange, Inc.'}, 'tickerLabel': {'type': 'literal', 'value': 'MMM'}} {'id': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q170614'}, 'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Ryanair'}, 'relationship': {'type': 'uri', 'value': 'http://www.wikidata.org/prop/direct/P355'}, 'relatedCompany': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q55165235'}, 'relatedCompanyLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Buzz'}, 'exchangeLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'London Stock Exchange'}, 'tickerLabel': {'type': 'literal', 'value': 'RYA'}}
with open('results.json', 'w') as f:
json.dump(results['results']['bindings'], f)
Ok, now we need to transform to the format of Chunk Mappers, which is our Finance NLP Annotator allowing us to store data and map it to, for example, NER entities we have extracted.
with open('results.json', 'r') as f:
company_json = json.load(f)
company_json[:5]
[{'id': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q1144929'}, 'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Lafarge'}, 'relationship': {'type': 'uri', 'value': 'http://www.wikidata.org/prop/direct/P355'}, 'relatedCompany': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q16995829'}, 'relatedCompanyLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Lafarge Tarmac'}, 'exchangeLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Euronext'}, 'tickerLabel': {'type': 'literal', 'value': 'LG'}}, {'id': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q790686'}, 'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Avex Group'}, 'relationship': {'type': 'uri', 'value': 'http://www.wikidata.org/prop/direct/P355'}, 'relatedCompany': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q3266796'}, 'relatedCompanyLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'cutting edge'}, 'exchangeLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Tokyo Stock Exchange'}, 'tickerLabel': {'type': 'literal', 'value': '7860'}}, {'id': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q1344736'}, 'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Entertainment One'}, 'relationship': {'type': 'uri', 'value': 'http://www.wikidata.org/prop/direct/P355'}, 'relatedCompany': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q2964928'}, 'relatedCompanyLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Christal Films'}, 'exchangeLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'London Stock Exchange'}, 'tickerLabel': {'type': 'literal', 'value': 'ETO'}}, {'id': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q219203'}, 'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'NEC'}, 'relationship': {'type': 'uri', 'value': 'http://www.wikidata.org/prop/direct/P749'}, 'relatedCompany': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q717318'}, 'relatedCompanyLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Sumitomo Group'}, 'exchangeLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Tokyo Stock Exchange'}, 'tickerLabel': {'type': 'literal', 'value': '6701'}}, {'id': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q207784'}, 'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Square Enix'}, 'relationship': {'type': 'uri', 'value': 'http://www.wikidata.org/prop/direct/P355'}, 'relatedCompany': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q105730233'}, 'relatedCompanyLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'SQEX Novel'}, 'exchangeLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'OTC Link ATS - OTC Markets'}, 'tickerLabel': {'type': 'literal', 'value': 'SQNXF'}}]
ChunkMappers need an array of dictionaries.
Each position in the array will be a company. A position in the array should have a key
and relations
.
The key
will be the name of a company. And the relations, a series of has_
relations as has_ticker
, has_stock_exchange
, etc., including all the subsidiary/other companies it's parent of.
data_for_chunkmapper = {'mappings': []}
for t in company_json:
org = t['itemLabel']['value']
c_dict = dict()
c_dict['key'] = org
c_dict['relations'] = list()
#Let's add the relations!
# First, we keep the URI to link the ORG to wikidata
uri = t['id']['value']
c_dict['relations'].append({'key': 'has_uri', 'values': [uri]})
# Then, language
lan = t['itemLabel']['xml:lang']
c_dict['relations'].append({'key': 'has_language', 'values': [lan]})
# Then, ticker
ticker = t['tickerLabel']['value']
c_dict['relations'].append({'key': 'has_ticker', 'values': [ticker]})
# Then, Stock Exchange
exchange = t['exchangeLabel']['value']
c_dict['relations'].append({'key': 'has_exchange', 'values': [exchange]})
# The related Company Label
rcl = t['relatedCompanyLabel']['value']
c_dict['relations'].append({'key': 'related_company', 'values': [rcl]})
# Let's add the dictionary to our array for training the ChunkMapper
data_for_chunkmapper['mappings'].append(c_dict)
Wikidata may return many rows per ORG. For example, imagine several subsidiaries of one company.
Here you will find the code to merge all the subsidiaries in 1 entry (required by the ChunkMapper to be unique).
data_for_chunkmapper = {'mappings': []}
for t in company_json:
org = t['itemLabel']['value']
# Is the entry for ORG new?
entry = next(filter(lambda x: x['key'] == org, data_for_chunkmapper['mappings']), None)
if entry is None:
c_dict = dict()
c_dict['key'] = org
c_dict['relations'] = list()
#Let's add the relations!
# First, we keep the URI to link the ORG to wikidata
uri = t['id']['value']
c_dict['relations'].append({'key': 'has_uri', 'values': [uri]})
# Then, language
lan = t['itemLabel']['xml:lang']
c_dict['relations'].append({'key': 'has_language', 'values': [lan]})
# Then, ticker
ticker = t['tickerLabel']['value']
c_dict['relations'].append({'key': 'has_ticker', 'values': [ticker]})
# Then, Stock Exchange
exchange = t['exchangeLabel']['value']
c_dict['relations'].append({'key': 'has_exchange', 'values': [exchange]})
# The related Company Label
rcl = t['relatedCompanyLabel']['value']
c_dict['relations'].append({'key': 'has_related_company_label', 'values': [rcl]})
# Let's add the dictionary to our array for training the ChunkMapper
data_for_chunkmapper['mappings'].append(c_dict)
else:
# The company entry exists, I just add a new related company
rcu = t['relatedCompany']['value']
for r in entry['relations']:
if r['key'] == 'has_related_company_label':
if rcl not in r['values']:
r['values'].append(rcl)
data_for_chunkmapper
{'mappings': [{'key': 'Lafarge', 'relations': [{'key': 'has_uri', 'values': ['http://www.wikidata.org/entity/Q1144929']}, {'key': 'has_language', 'values': ['en']}, {'key': 'has_ticker', 'values': ['LG']}, {'key': 'has_exchange', 'values': ['Euronext']}, {'key': 'has_related_company_label', 'values': ['Lafarge Tarmac']}]}, {'key': 'Avex Group', 'relations': [{'key': 'has_uri', 'values': ['http://www.wikidata.org/entity/Q790686']}, {'key': 'has_language', 'values': ['en']}, {'key': 'has_ticker', 'values': ['7860']}, {'key': 'has_exchange', 'values': ['Tokyo Stock Exchange']}, {'key': 'has_related_company_label', 'values': ['cutting edge']}]}, {'key': 'Entertainment One', 'relations': [{'key': 'has_uri', 'values': ['http://www.wikidata.org/entity/Q1344736']}, {'key': 'has_language', 'values': ['en']}, {'key': 'has_ticker', 'values': ['ETO']}, {'key': 'has_exchange', 'values': ['London Stock Exchange']}, {'key': 'has_related_company_label', 'values': ['Christal Films']}]}, {'key': 'NEC', 'relations': [{'key': 'has_uri', 'values': ['http://www.wikidata.org/entity/Q219203']}, {'key': 'has_language', 'values': ['en']}, {'key': 'has_ticker', 'values': ['6701']}, {'key': 'has_exchange', 'values': ['Tokyo Stock Exchange']}, {'key': 'has_related_company_label', 'values': ['Sumitomo Group']}]}, {'key': 'Square Enix', 'relations': [{'key': 'has_uri', 'values': ['http://www.wikidata.org/entity/Q207784']}, {'key': 'has_language', 'values': ['en']}, {'key': 'has_ticker', 'values': ['SQNXF']}, {'key': 'has_exchange', 'values': ['OTC Link ATS - OTC Markets']}, {'key': 'has_related_company_label', 'values': ['SQEX Novel']}]}, {'key': 'Ryanair', 'relations': [{'key': 'has_uri', 'values': ['http://www.wikidata.org/entity/Q170614']}, {'key': 'has_language', 'values': ['en']}, {'key': 'has_ticker', 'values': ['RYAAY']}, {'key': 'has_exchange', 'values': ['NASDAQ - All Markets']}, {'key': 'has_related_company_label', 'values': ['Buzz', '3M (United Kingdom)']}]}, {'key': 'Paradox Interactive', 'relations': [{'key': 'has_uri', 'values': ['http://www.wikidata.org/entity/Q217493']}, {'key': 'has_language', 'values': ['en']}, {'key': 'has_ticker', 'values': ['PRXXF']}, {'key': 'has_exchange', 'values': ['OTC Link ATS - OTC Markets']}, {'key': 'has_related_company_label', 'values': ['White Wolf Publishing']}]}, {'key': '3M', 'relations': [{'key': 'has_uri', 'values': ['http://www.wikidata.org/entity/Q159433']}, {'key': 'has_language', 'values': ['en']}, {'key': 'has_ticker', 'values': ['MMM']}, {'key': 'has_exchange', 'values': ['New York Stock Exchange, Inc.']}, {'key': 'has_related_company_label', 'values': ['3M (United Kingdom)']}]}]}
with open('training.json', 'w') as f:
json.dump(data_for_chunkmapper, f)
with open('training.json', 'r') as f:
training_data = json.load(f)
training_data
{'mappings': [{'key': 'Lafarge', 'relations': [{'key': 'has_uri', 'values': ['http://www.wikidata.org/entity/Q1144929']}, {'key': 'has_language', 'values': ['en']}, {'key': 'has_ticker', 'values': ['LG']}, {'key': 'has_exchange', 'values': ['Euronext']}, {'key': 'has_related_company_label', 'values': ['Lafarge Tarmac']}]}, {'key': 'Avex Group', 'relations': [{'key': 'has_uri', 'values': ['http://www.wikidata.org/entity/Q790686']}, {'key': 'has_language', 'values': ['en']}, {'key': 'has_ticker', 'values': ['7860']}, {'key': 'has_exchange', 'values': ['Tokyo Stock Exchange']}, {'key': 'has_related_company_label', 'values': ['cutting edge']}]}, {'key': 'Entertainment One', 'relations': [{'key': 'has_uri', 'values': ['http://www.wikidata.org/entity/Q1344736']}, {'key': 'has_language', 'values': ['en']}, {'key': 'has_ticker', 'values': ['ETO']}, {'key': 'has_exchange', 'values': ['London Stock Exchange']}, {'key': 'has_related_company_label', 'values': ['Christal Films']}]}, {'key': 'NEC', 'relations': [{'key': 'has_uri', 'values': ['http://www.wikidata.org/entity/Q219203']}, {'key': 'has_language', 'values': ['en']}, {'key': 'has_ticker', 'values': ['6701']}, {'key': 'has_exchange', 'values': ['Tokyo Stock Exchange']}, {'key': 'has_related_company_label', 'values': ['Sumitomo Group']}]}, {'key': 'Square Enix', 'relations': [{'key': 'has_uri', 'values': ['http://www.wikidata.org/entity/Q207784']}, {'key': 'has_language', 'values': ['en']}, {'key': 'has_ticker', 'values': ['SQNXF']}, {'key': 'has_exchange', 'values': ['OTC Link ATS - OTC Markets']}, {'key': 'has_related_company_label', 'values': ['SQEX Novel']}]}, {'key': 'Ryanair', 'relations': [{'key': 'has_uri', 'values': ['http://www.wikidata.org/entity/Q170614']}, {'key': 'has_language', 'values': ['en']}, {'key': 'has_ticker', 'values': ['RYAAY']}, {'key': 'has_exchange', 'values': ['NASDAQ - All Markets']}, {'key': 'has_related_company_label', 'values': ['Buzz', '3M (United Kingdom)']}]}, {'key': 'Paradox Interactive', 'relations': [{'key': 'has_uri', 'values': ['http://www.wikidata.org/entity/Q217493']}, {'key': 'has_language', 'values': ['en']}, {'key': 'has_ticker', 'values': ['PRXXF']}, {'key': 'has_exchange', 'values': ['OTC Link ATS - OTC Markets']}, {'key': 'has_related_company_label', 'values': ['White Wolf Publishing']}]}, {'key': '3M', 'relations': [{'key': 'has_uri', 'values': ['http://www.wikidata.org/entity/Q159433']}, {'key': 'has_language', 'values': ['en']}, {'key': 'has_ticker', 'values': ['MMM']}, {'key': 'has_exchange', 'values': ['New York Stock Exchange, Inc.']}, {'key': 'has_related_company_label', 'values': ['3M (United Kingdom)']}]}]}
example_parent_org = training_data['mappings'][0]['key']
example_parent_org
'Lafarge'
all_rels = [x['key'] for x in training_data['mappings'][0]['relations']]
all_rels
['has_uri', 'has_language', 'has_ticker', 'has_exchange', 'has_related_company_label']
from johnsnowlabs import finance
# To allow searches by Levenshtein Distance, we enable Fuzzy Matching
chunkerMapper = finance.ChunkMapperApproach()\
.setInputCols(["ner_chunk"])\
.setOutputCol("mappings")\
.setDictionary("training.json")\
.setEnableFuzzyMatching(True)\
.setRels(all_rels)
empty_dataset = spark.createDataFrame([[]])
fitCM = chunkerMapper.fit(empty_dataset)
fitCM.write().overwrite().save('finmapper_example')
Now, let's see what happens after finding an NER chunk which is in our Wikidata.
documentAssembler = nlp.DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")
textSplitter = finance.TextSplitter()\
.setInputCols(["document"])\
.setOutputCol("sentence")
tokenizer = nlp.Tokenizer()\
.setInputCols(["sentence"])\
.setOutputCol("token")
embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") \
.setInputCols(["sentence", "token"]) \
.setOutputCol("embeddings")
ner_model = finance.NerModel.pretrained('finner_orgs_prods_alias', 'en', 'finance/models')\
.setInputCols(["sentence", "token", "embeddings"])\
.setOutputCol("ner")
ner_converter = nlp.NerConverter()\
.setInputCols(["sentence","token","ner"])\
.setOutputCol("ner_chunk")
CM = finance.ChunkMapperModel()\
.load('finmapper_example')\
.setEnableFuzzyMatching(True)\
.setInputCols(["ner_chunk"])\
.setOutputCol("mappings")
nlpPipeline = nlp.Pipeline(stages=[
documentAssembler,
textSplitter,
tokenizer,
embeddings,
ner_model,
ner_converter,
CM
])
text = f"""{example_parent_org} is an American multinational corporation that is engaged in the design, development, manufacturing, and worldwide marketing and sales of footwear,
apparel, equipment, accessories, and services"""
test_data = spark.createDataFrame([[text]]).toDF("text")
model = nlpPipeline.fit(test_data)
res= model.transform(test_data)
bert_embeddings_sec_bert_base download started this may take some time. Approximate size to download 390.4 MB [OK!] finner_orgs_prods_alias download started this may take some time. [OK!]
res.select('ner_chunk.result', 'mappings.result').show()
+---------+--------------------+ | result| result| +---------+--------------------+ |[Lafarge]|[http://www.wikid...| +---------+--------------------+
collected = res.collect()[0]
for row in collected['mappings']:
print(f"{row['metadata']['entity']} -> {row['metadata']['relation']} -> {row.result}")
Lafarge -> has_uri -> http://www.wikidata.org/entity/Q1144929 Lafarge -> has_language -> en Lafarge -> has_ticker -> LG Lafarge -> has_exchange -> Euronext Lafarge -> has_related_company_label -> Lafarge Tarmac
This could help us get track, query using Graph Engines and visualize our results.
import plotly.graph_objects as go
import random
def get_nodes_from_graph(graph, pos, node_color):
"""Extracts the nodes from a networkX dataframe in Plotly Scatterplot format"""
node_x = []
node_y = []
texts = []
hovers = []
for node in graph.nodes():
entity = graph.nodes[node]['attr_dict']['entity']
x, y = pos[node]
node_x.append(x)
node_y.append(y)
texts.append(node)
hovers.append(entity)
node_trace = go.Scatter(
x=node_x, y=node_y, text=texts, hovertext=hovers,
mode='markers+text',
hoverinfo='text',
marker=dict(
color=node_color,
size=40,
line_width=2))
return node_trace
def get_edges_from_graph(graph, pos, edge_color):
"""Extracts the edges from a networkX dataframe in Plotly Scatterplot format"""
edge_x = []
edge_y = []
hovers = []
xtext = []
ytext = []
for edge in graph.edges():
relation = graph.edges[edge]['attr_dict']['relation']
x0, y0 = pos[edge[0]]
x1, y1 = pos[edge[1]]
edge_x.append(x0)
edge_x.append(x1)
edge_x.append(None)
edge_y.append(y0)
edge_y.append(y1)
edge_y.append(None)
hovers.append(relation)
xtext.append((x0+x1)/2)
ytext.append((y0+y1)/2)
edge_trace = go.Scatter(
x=edge_x, y=edge_y,
line=dict(width=2, color=edge_color),
mode='lines')
labels_trace = go.Scatter(x=xtext,y= ytext, mode='text',
textfont = {'color': edge_color},
marker_size=0.5,
text=hovers,
textposition='top center',
hovertemplate='weight: %{text}')
return edge_trace, labels_trace
def show_graph_in_plotly(graph, node_color='white', edge_color='grey'):
"""Shows Plotly graph in Databricks"""
pos = nx.spring_layout(graph)
node_trace = get_nodes_from_graph(graph, pos, node_color)
edge_trace, labels_trace = get_edges_from_graph(graph, pos, edge_color)
fig = go.Figure(data=[edge_trace, node_trace, labels_trace],
layout=go.Layout(
title='Visualization',
titlefont_size=16,
showlegend=False,
width=1600,
height=1000,
xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
)
fig.update_traces(marker=dict(size=12,
line=dict(width=2,
color='DarkSlateGrey')),
selector=dict(mode='markers'))
fig.show()
import networkx as nx
G = nx.Graph()
G.clear()
G.nodes()
NodeView(())
for row in collected['mappings']:
from_node = row['metadata']['entity']
to_node = row.result
relation = row['metadata']['relation']
to_node_type = relation.replace('has_', '')
G.add_node(from_node, attr_dict={'entity': 'ORG'}) # Will ignore if exists
G.add_node(to_node, attr_dict={'entity': to_node_type}) # Will ignore if exists
G.add_edge(from_node, to_node, attr_dict={'relation': relation})
show_graph_in_plotly(G)