! pip install -q johnsnowlabs
Using my.johnsnowlabs.com SSO
from johnsnowlabs import nlp, legal
# nlp.install(force_browser=True)
If you are not registered at my.johnsnowlabs.com, received your license via e-mail, or are using Safari, you may need to update the license manually.
# Manual license fallback: upload the John Snow Labs license JSON via the
# Colab file-upload widget, then install the licensed libraries and start
# a licensed Spark session.
from google.colab import files
print('Please Upload your John Snow Labs License using the button below')
license_keys = files.upload()
nlp.install()
spark = nlp.start()
Data Augmentation is the process of enriching an extracted datapoint with information from external sources.
For example, let's suppose I work with a document which mentions the company Amazon. We could be talking about stock prices, or some legal litigations, or just a commercial agreement with a provider, among others.
In the document, we can extract a company name using NER as an Organization, but that's all the information available about the company in that document.
Well, with Data Augmentation, we can use external sources, such as SEC EDGAR, Crunchbase, Nasdaq or even Wikipedia, to enrich the company with much more information, allowing us to make better decisions.
Let's see how to do it.
Here, we will train a ChunkMapper model with a 1,000-company sample.
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/legal-nlp/data/sample_openedgar.json
import json

# Load the OpenEDGAR sample dictionary. Pass encoding='utf-8' explicitly:
# without it, open() uses the platform's default locale encoding, which can
# break the load on non-UTF-8 systems (JSON files are UTF-8 by spec).
with open('sample_openedgar.json', 'r', encoding='utf-8') as f:
    company_json = json.load(f)

# Peek at one mapping entry: a company key plus its list of relations.
company_json['mappings'][8]
{'key': 'AWA Group LP', 'relations': [{'key': 'name', 'values': ['AWA Group LP']}, {'key': 'sic', 'values': ['INVESTMENT ADVICE [6282]']}, {'key': 'sic_code', 'values': [6282, 0]}, {'key': 'irs_number', 'values': [371785232, 0]}, {'key': 'fiscal_year_end', 'values': [630, 1231, 0]}, {'key': 'state_location', 'values': ['NC']}, {'key': 'state_incorporation', 'values': ['DE']}, {'key': 'business_street', 'values': ['116 SOUTH FRANKLIN STREET']}, {'key': 'business_city', 'values': ['ROCKY MOUNT']}, {'key': 'business_state', 'values': ['NC']}, {'key': 'business_zip', 'values': ['27804']}, {'key': 'business_phone', 'values': ['952-446-6678']}, {'key': 'former_name', 'values': ['']}, {'key': 'former_name_date', 'values': ['']}, {'key': 'date', 'values': ['2017-01-23', '2017-03-16', '2016-01-22', '2016-01-19', '2015-06-30', '2016-04-14', '2016-07-27', '2016-10-28', '2015-06-26', '2015-09-02', '2015-09-29', '2015-12-31']}, {'key': 'company_id', 'values': [1645148]}]}
# Look up one specific company in the mappings dictionary by name.
target = 'Rayton Solar Inc.'
for entry in company_json['mappings']:
    if target in entry['key']:
        print(entry)
{'key': 'Rayton Solar Inc.', 'relations': [{'key': 'name', 'values': ['Rayton Solar Inc.']}, {'key': 'sic', 'values': ['SEMICONDUCTORS & RELATED DEVICES [3674]']}, {'key': 'sic_code', 'values': [3674]}, {'key': 'irs_number', 'values': [0]}, {'key': 'fiscal_year_end', 'values': [1231]}, {'key': 'state_location', 'values': ['CA']}, {'key': 'state_incorporation', 'values': ['DE']}, {'key': 'business_street', 'values': ['920 COLORADO AVE.']}, {'key': 'business_city', 'values': ['SANTA MONICA']}, {'key': 'business_state', 'values': ['CA']}, {'key': 'business_zip', 'values': ['90401']}, {'key': 'business_phone', 'values': ['(661) 259-4786']}, {'key': 'former_name', 'values': ['']}, {'key': 'former_name_date', 'values': ['']}, {'key': 'date', 'values': ['2017-01-10', '2017-01-20', '2017-01-06', '2017-05-15', '2017-09-28', '2016-11-29', '2016-12-20', '2016-12-22', '2022-09-21', '2019-06-27', '2018-03-22', '2018-04-30', '2018-12-10', '2021-09-22', '2020-06-08', '2020-09-28']}, {'key': 'company_id', 'values': [1654124]}]}
# Collect every relation name available for a company, taken from the
# first mapping entry (all entries share the same relation schema here).
all_rels = [relation['key'] for relation in company_json['mappings'][0]['relations']]
all_rels
['name', 'sic', 'sic_code', 'irs_number', 'fiscal_year_end', 'state_location', 'state_incorporation', 'business_street', 'business_city', 'business_state', 'business_zip', 'business_phone', 'former_name', 'former_name_date', 'date', 'company_id']
# Train a ChunkMapper: it maps an incoming company-name chunk ("ner_chunk")
# to all the relations listed for that key in the JSON dictionary.
chunkerMapper = legal.ChunkMapperApproach()\
    .setInputCols(["ner_chunk"])\
    .setOutputCol("mappings")\
    .setDictionary("sample_openedgar.json")\
    .setRels(all_rels)
# Fitting only ingests the dictionary, so an empty DataFrame is enough.
empty_dataset = spark.createDataFrame([[""]]).toDF("text")
fit_CM = chunkerMapper.fit(empty_dataset)
# Save model to disk so it can be reloaded later with ChunkMapperModel.load().
fit_CM.write().overwrite().save('openedgar_2000_2022_company_mapper')
# Sample sentence mentioning a company we expect to find in the dictionary.
text = [""" AWA Group LP intends to pay dividends on the Common Units on a quarterly basis at an annual rate of 8.00% of the Offering Price. """]
We get the company name from the sample text
# NER pipeline: detect organization mentions (ORG) in raw text.
documentAssembler = nlp.DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")
# Multilingual ("xx") deep-learning sentence splitter.
sentenceDetector = nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl","xx")\
    .setInputCols(["document"])\
    .setOutputCol("sentence")
tokenizer = nlp.Tokenizer()\
    .setInputCols(["sentence"])\
    .setOutputCol("token")
# BERT embeddings pretrained on SEC filings (financial/legal domain).
embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") \
    .setInputCols(["sentence", "token"]) \
    .setOutputCol("embeddings")
# Legal NER model tagging organizations, people, roles and dates.
ner_model = legal.NerModel.pretrained("legner_org_per_role_date", "en", "legal/models")\
    .setInputCols(["sentence", "token", "embeddings"])\
    .setOutputCol("ner")
# Merge IOB tags into chunks, keeping organizations only.
ner_converter = nlp.NerConverter()\
    .setInputCols(["sentence","token","ner"])\
    .setOutputCol("ner_chunk")\
    .setWhiteList(["ORG"]) # Return only ORG entities
nlpPipeline = nlp.Pipeline(stages=[
    documentAssembler,
    sentenceDetector,
    tokenizer,
    embeddings,
    ner_model,
    ner_converter])
# All stages are pretrained, so fitting on an empty DataFrame is a no-op;
# LightPipeline gives fast in-memory annotation for single texts.
empty_data = spark.createDataFrame([[""]]).toDF("text")
model = nlpPipeline.fit(empty_data)
light_model = nlp.LightPipeline(model)
sentence_detector_dl download started this may take some time. Approximate size to download 514.9 KB [OK!] bert_embeddings_sec_bert_base download started this may take some time. Approximate size to download 390.4 MB [OK!] legner_org_per_role_date download started this may take some time. [OK!]
# We extract the company name (ORG entity) from the sample text.
ner_result = light_model.fullAnnotate(text)
ner_result
[{'document': [Annotation(document, 0, 129, AWA Group LP intends to pay dividends on the Common Units on a quarterly basis at an annual rate of 8.00% of the Offering Price. , {})], 'ner_chunk': [Annotation(chunk, 1, 12, AWA Group LP, {'entity': 'ORG', 'sentence': '0', 'chunk': '0', 'confidence': '0.9788'})], 'token': [Annotation(token, 1, 3, AWA, {'sentence': '0'}), Annotation(token, 5, 9, Group, {'sentence': '0'}), Annotation(token, 11, 12, LP, {'sentence': '0'}), Annotation(token, 14, 20, intends, {'sentence': '0'}), Annotation(token, 22, 23, to, {'sentence': '0'}), Annotation(token, 25, 27, pay, {'sentence': '0'}), Annotation(token, 29, 37, dividends, {'sentence': '0'}), Annotation(token, 39, 40, on, {'sentence': '0'}), Annotation(token, 42, 44, the, {'sentence': '0'}), Annotation(token, 46, 51, Common, {'sentence': '0'}), Annotation(token, 53, 57, Units, {'sentence': '0'}), Annotation(token, 59, 60, on, {'sentence': '0'}), Annotation(token, 62, 62, a, {'sentence': '0'}), Annotation(token, 64, 72, quarterly, {'sentence': '0'}), Annotation(token, 74, 78, basis, {'sentence': '0'}), Annotation(token, 80, 81, at, {'sentence': '0'}), Annotation(token, 83, 84, an, {'sentence': '0'}), Annotation(token, 86, 91, annual, {'sentence': '0'}), Annotation(token, 93, 96, rate, {'sentence': '0'}), Annotation(token, 98, 99, of, {'sentence': '0'}), Annotation(token, 101, 105, 8.00%, {'sentence': '0'}), Annotation(token, 107, 108, of, {'sentence': '0'}), Annotation(token, 110, 112, the, {'sentence': '0'}), Annotation(token, 114, 121, Offering, {'sentence': '0'}), Annotation(token, 123, 127, Price, {'sentence': '0'}), Annotation(token, 128, 128, ., {'sentence': '0'})], 'ner': [Annotation(named_entity, 1, 3, B-ORG, {'word': 'AWA', 'confidence': '1.0', 'sentence': '0'}), Annotation(named_entity, 5, 9, I-ORG, {'word': 'Group', 'confidence': '0.9371', 'sentence': '0'}), Annotation(named_entity, 11, 12, I-ORG, {'word': 'LP', 'confidence': '0.9993', 'sentence': '0'}), 
Annotation(named_entity, 14, 20, O, {'word': 'intends', 'confidence': '0.9983', 'sentence': '0'}), Annotation(named_entity, 22, 23, O, {'word': 'to', 'confidence': '1.0', 'sentence': '0'}), Annotation(named_entity, 25, 27, O, {'word': 'pay', 'confidence': '0.9992', 'sentence': '0'}), Annotation(named_entity, 29, 37, O, {'word': 'dividends', 'confidence': '0.9991', 'sentence': '0'}), Annotation(named_entity, 39, 40, O, {'word': 'on', 'confidence': '0.999', 'sentence': '0'}), Annotation(named_entity, 42, 44, O, {'word': 'the', 'confidence': '0.9993', 'sentence': '0'}), Annotation(named_entity, 46, 51, O, {'word': 'Common', 'confidence': '0.9864', 'sentence': '0'}), Annotation(named_entity, 53, 57, O, {'word': 'Units', 'confidence': '0.961', 'sentence': '0'}), Annotation(named_entity, 59, 60, O, {'word': 'on', 'confidence': '1.0', 'sentence': '0'}), Annotation(named_entity, 62, 62, O, {'word': 'a', 'confidence': '1.0', 'sentence': '0'}), Annotation(named_entity, 64, 72, O, {'word': 'quarterly', 'confidence': '1.0', 'sentence': '0'}), Annotation(named_entity, 74, 78, O, {'word': 'basis', 'confidence': '1.0', 'sentence': '0'}), Annotation(named_entity, 80, 81, O, {'word': 'at', 'confidence': '1.0', 'sentence': '0'}), Annotation(named_entity, 83, 84, O, {'word': 'an', 'confidence': '1.0', 'sentence': '0'}), Annotation(named_entity, 86, 91, O, {'word': 'annual', 'confidence': '1.0', 'sentence': '0'}), Annotation(named_entity, 93, 96, O, {'word': 'rate', 'confidence': '0.9995', 'sentence': '0'}), Annotation(named_entity, 98, 99, O, {'word': 'of', 'confidence': '0.9988', 'sentence': '0'}), Annotation(named_entity, 101, 105, O, {'word': '8.00%', 'confidence': '0.998', 'sentence': '0'}), Annotation(named_entity, 107, 108, O, {'word': 'of', 'confidence': '0.9996', 'sentence': '0'}), Annotation(named_entity, 110, 112, O, {'word': 'the', 'confidence': '0.9999', 'sentence': '0'}), Annotation(named_entity, 114, 121, O, {'word': 'Offering', 'confidence': '0.9987', 'sentence': 
'0'}), Annotation(named_entity, 123, 127, O, {'word': 'Price', 'confidence': '0.9873', 'sentence': '0'}), Annotation(named_entity, 128, 128, O, {'word': '.', 'confidence': '0.9999', 'sentence': '0'})], 'embeddings': [Annotation(word_embeddings, 1, 3, AWA, {'isOOV': 'false', 'pieceId': '101', 'isWordStart': 'true', 'token': 'AWA', 'sentence': '0'}), Annotation(word_embeddings, 5, 9, Group, {'isOOV': 'false', 'pieceId': '101', 'isWordStart': 'true', 'token': 'Group', 'sentence': '0'}), Annotation(word_embeddings, 11, 12, LP, {'isOOV': 'false', 'pieceId': '101', 'isWordStart': 'true', 'token': 'LP', 'sentence': '0'}), Annotation(word_embeddings, 14, 20, intends, {'isOOV': 'false', 'pieceId': '4255', 'isWordStart': 'true', 'token': 'intends', 'sentence': '0'}), Annotation(word_embeddings, 22, 23, to, {'isOOV': 'false', 'pieceId': '631', 'isWordStart': 'true', 'token': 'to', 'sentence': '0'}), Annotation(word_embeddings, 25, 27, pay, {'isOOV': 'false', 'pieceId': '936', 'isWordStart': 'true', 'token': 'pay', 'sentence': '0'}), Annotation(word_embeddings, 29, 37, dividends, {'isOOV': 'false', 'pieceId': '1919', 'isWordStart': 'true', 'token': 'dividends', 'sentence': '0'}), Annotation(word_embeddings, 39, 40, on, {'isOOV': 'false', 'pieceId': '666', 'isWordStart': 'true', 'token': 'on', 'sentence': '0'}), Annotation(word_embeddings, 42, 44, the, {'isOOV': 'false', 'pieceId': '612', 'isWordStart': 'true', 'token': 'the', 'sentence': '0'}), Annotation(word_embeddings, 46, 51, Common, {'isOOV': 'false', 'pieceId': '101', 'isWordStart': 'true', 'token': 'Common', 'sentence': '0'}), Annotation(word_embeddings, 53, 57, Units, {'isOOV': 'false', 'pieceId': '101', 'isWordStart': 'true', 'token': 'Units', 'sentence': '0'}), Annotation(word_embeddings, 59, 60, on, {'isOOV': 'false', 'pieceId': '666', 'isWordStart': 'true', 'token': 'on', 'sentence': '0'}), Annotation(word_embeddings, 62, 62, a, {'isOOV': 'false', 'pieceId': '143', 'isWordStart': 'true', 'token': 'a', 'sentence': 
'0'}), Annotation(word_embeddings, 64, 72, quarterly, {'isOOV': 'false', 'pieceId': '2181', 'isWordStart': 'true', 'token': 'quarterly', 'sentence': '0'}), Annotation(word_embeddings, 74, 78, basis, {'isOOV': 'false', 'pieceId': '1277', 'isWordStart': 'true', 'token': 'basis', 'sentence': '0'}), Annotation(word_embeddings, 80, 81, at, {'isOOV': 'false', 'pieceId': '746', 'isWordStart': 'true', 'token': 'at', 'sentence': '0'}), Annotation(word_embeddings, 83, 84, an, {'isOOV': 'false', 'pieceId': '620', 'isWordStart': 'true', 'token': 'an', 'sentence': '0'}), Annotation(word_embeddings, 86, 91, annual, {'isOOV': 'false', 'pieceId': '1207', 'isWordStart': 'true', 'token': 'annual', 'sentence': '0'}), Annotation(word_embeddings, 93, 96, rate, {'isOOV': 'false', 'pieceId': '1072', 'isWordStart': 'true', 'token': 'rate', 'sentence': '0'}), Annotation(word_embeddings, 98, 99, of, {'isOOV': 'false', 'pieceId': '619', 'isWordStart': 'true', 'token': 'of', 'sentence': '0'}), Annotation(word_embeddings, 101, 105, 8.00%, {'isOOV': 'false', 'pieceId': '128', 'isWordStart': 'true', 'token': '8.00%', 'sentence': '0'}), Annotation(word_embeddings, 107, 108, of, {'isOOV': 'false', 'pieceId': '619', 'isWordStart': 'true', 'token': 'of', 'sentence': '0'}), Annotation(word_embeddings, 110, 112, the, {'isOOV': 'false', 'pieceId': '612', 'isWordStart': 'true', 'token': 'the', 'sentence': '0'}), Annotation(word_embeddings, 114, 121, Offering, {'isOOV': 'false', 'pieceId': '101', 'isWordStart': 'true', 'token': 'Offering', 'sentence': '0'}), Annotation(word_embeddings, 123, 127, Price, {'isOOV': 'false', 'pieceId': '101', 'isWordStart': 'true', 'token': 'Price', 'sentence': '0'}), Annotation(word_embeddings, 128, 128, ., {'isOOV': 'false', 'pieceId': '118', 'isWordStart': 'true', 'token': '.', 'sentence': '0'})], 'sentence': [Annotation(document, 1, 128, AWA Group LP intends to pay dividends on the Common Units on a quarterly basis at an annual rate of 8.00% of the Offering Price., 
{'sentence': '0'})]}]
# Take the first ORG chunk detected in the text.
ORG = ner_result[0]["ner_chunk"][0].result
ORG
'AWA Group LP'
# Entity-linking pipeline: embed the extracted name and resolve it to an
# official SEC EDGAR company name.
embeddings = nlp.UniversalSentenceEncoder.pretrained("tfhub_use", "en") \
    .setInputCols("document") \
    .setOutputCol("sentence_embeddings")
resolver = legal.SentenceEntityResolverModel.pretrained("legel_edgar_company_name", "en", "legal/models")\
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("resolution")\
    .setDistanceFunction("EUCLIDEAN")
# PipelineModel (not Pipeline): every stage is already fitted, no fit() needed.
pipelineModel = nlp.PipelineModel(
    stages = [
        documentAssembler,
        embeddings,
        resolver])
lp_res = nlp.LightPipeline(pipelineModel)
tfhub_use download started this may take some time. Approximate size to download 923.7 MB [OK!] legel_edgar_company_name download started this may take some time. [OK!]
# We normalize the extracted company name against EDGAR's registry.
el_res = lp_res.annotate(ORG)
el_res
{'document': ['AWA Group LP'], 'sentence_embeddings': ['AWA Group LP'], 'resolution': ['AWA Group LP']}
# Normalized (official EDGAR) company name — used as the mapper lookup key.
NORM_ORG = el_res["resolution"][0]
NORM_ORG
'AWA Group LP'
documentAssembler = nlp.DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")
# Doc2Chunk promotes the whole document to a single chunk so the mapper can
# match it directly against the dictionary keys (company names).
chunkAssembler = nlp.Doc2Chunk() \
    .setInputCols("document") \
    .setOutputCol("chunk") \
    .setIsArray(False)
# Reload the ChunkMapper trained and saved earlier in this notebook.
CM = legal.ChunkMapperModel().load("openedgar_2000_2022_company_mapper")\
    .setInputCols(["chunk"])\
    .setOutputCol("mappings")
cm_pipeline = nlp.Pipeline(stages=[documentAssembler,
    chunkAssembler,
    CM])
fit_cm_pipeline = cm_pipeline.fit(empty_data)
# LightPipelines don't support Doc2Chunk, so we use the regular transform() instead.
df = spark.createDataFrame([[NORM_ORG]]).toDF("text")
df.show()
+------------+ | text| +------------+ |AWA Group LP| +------------+
# Run the mapper pipeline over the normalized company name.
res = fit_cm_pipeline.transform(df)
res.show()
+------------+--------------------+--------------------+--------------------+ | text| document| chunk| mappings| +------------+--------------------+--------------------+--------------------+ |AWA Group LP|[{document, 0, 11...|[{chunk, 0, 11, A...|[{labeled_depende...| +------------+--------------------+--------------------+--------------------+
# Show the flat list of mapped values (one per relation) without truncation.
res.select("mappings.result").show(truncate=False)
+----------------------------------------------------------------------------------------------------------------------------------------------------------------+ |result | +----------------------------------------------------------------------------------------------------------------------------------------------------------------+ |[AWA Group LP, INVESTMENT ADVICE [6282], 6282, 371785232, 630, NC, DE, 116 SOUTH FRANKLIN STREET, ROCKY MOUNT, NC, 27804, 952-446-6678, , , 2017-01-23, 1645148]| +----------------------------------------------------------------------------------------------------------------------------------------------------------------+
# Collect the mapping annotations to the driver for post-processing.
r = res.select("mappings").collect()
r
[Row(mappings=[Row(annotatorType='labeled_dependency', begin=0, end=11, result='AWA Group LP', metadata={'sentence': '0', 'ops': '0.0', 'distance': '-2.220446049250313E-16', 'all_relations': '', 'chunk': '0', '__trained__': 'AWA Group LP', '__distance_function__': 'cosine', '__relation_name__': 'name', 'entity': 'AWA Group LP', 'relation': 'name'}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=11, result='INVESTMENT ADVICE [6282]', metadata={'sentence': '0', 'ops': '0.0', 'distance': '-2.220446049250313E-16', 'all_relations': '', 'chunk': '0', '__trained__': 'AWA Group LP', '__distance_function__': 'cosine', '__relation_name__': 'sic', 'entity': 'AWA Group LP', 'relation': 'sic'}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=11, result='6282', metadata={'sentence': '0', 'ops': '0.0', 'distance': '-2.220446049250313E-16', 'all_relations': '0', 'chunk': '0', '__trained__': 'AWA Group LP', '__distance_function__': 'cosine', '__relation_name__': 'sic_code', 'entity': 'AWA Group LP', 'relation': 'sic_code'}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=11, result='371785232', metadata={'sentence': '0', 'ops': '0.0', 'distance': '-2.220446049250313E-16', 'all_relations': '0', 'chunk': '0', '__trained__': 'AWA Group LP', '__distance_function__': 'cosine', '__relation_name__': 'irs_number', 'entity': 'AWA Group LP', 'relation': 'irs_number'}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=11, result='630', metadata={'sentence': '0', 'ops': '0.0', 'distance': '-2.220446049250313E-16', 'all_relations': '1231:::0', 'chunk': '0', '__trained__': 'AWA Group LP', '__distance_function__': 'cosine', '__relation_name__': 'fiscal_year_end', 'entity': 'AWA Group LP', 'relation': 'fiscal_year_end'}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=11, result='NC', metadata={'sentence': '0', 'ops': '0.0', 'distance': '-2.220446049250313E-16', 'all_relations': '', 'chunk': 
'0', '__trained__': 'AWA Group LP', '__distance_function__': 'cosine', '__relation_name__': 'state_location', 'entity': 'AWA Group LP', 'relation': 'state_location'}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=11, result='DE', metadata={'sentence': '0', 'ops': '0.0', 'distance': '-2.220446049250313E-16', 'all_relations': '', 'chunk': '0', '__trained__': 'AWA Group LP', '__distance_function__': 'cosine', '__relation_name__': 'state_incorporation', 'entity': 'AWA Group LP', 'relation': 'state_incorporation'}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=11, result='116 SOUTH FRANKLIN STREET', metadata={'sentence': '0', 'ops': '0.0', 'distance': '-2.220446049250313E-16', 'all_relations': '', 'chunk': '0', '__trained__': 'AWA Group LP', '__distance_function__': 'cosine', '__relation_name__': 'business_street', 'entity': 'AWA Group LP', 'relation': 'business_street'}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=11, result='ROCKY MOUNT', metadata={'sentence': '0', 'ops': '0.0', 'distance': '-2.220446049250313E-16', 'all_relations': '', 'chunk': '0', '__trained__': 'AWA Group LP', '__distance_function__': 'cosine', '__relation_name__': 'business_city', 'entity': 'AWA Group LP', 'relation': 'business_city'}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=11, result='NC', metadata={'sentence': '0', 'ops': '0.0', 'distance': '-2.220446049250313E-16', 'all_relations': '', 'chunk': '0', '__trained__': 'AWA Group LP', '__distance_function__': 'cosine', '__relation_name__': 'business_state', 'entity': 'AWA Group LP', 'relation': 'business_state'}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=11, result='27804', metadata={'sentence': '0', 'ops': '0.0', 'distance': '-2.220446049250313E-16', 'all_relations': '', 'chunk': '0', '__trained__': 'AWA Group LP', '__distance_function__': 'cosine', '__relation_name__': 'business_zip', 'entity': 'AWA Group LP', 
'relation': 'business_zip'}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=11, result='952-446-6678', metadata={'sentence': '0', 'ops': '0.0', 'distance': '-2.220446049250313E-16', 'all_relations': '', 'chunk': '0', '__trained__': 'AWA Group LP', '__distance_function__': 'cosine', '__relation_name__': 'business_phone', 'entity': 'AWA Group LP', 'relation': 'business_phone'}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=11, result='', metadata={'sentence': '0', 'ops': '0.0', 'distance': '-2.220446049250313E-16', 'all_relations': '', 'chunk': '0', '__trained__': 'AWA Group LP', '__distance_function__': 'cosine', '__relation_name__': 'former_name', 'entity': 'AWA Group LP', 'relation': 'former_name'}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=11, result='', metadata={'sentence': '0', 'ops': '0.0', 'distance': '-2.220446049250313E-16', 'all_relations': '', 'chunk': '0', '__trained__': 'AWA Group LP', '__distance_function__': 'cosine', '__relation_name__': 'former_name_date', 'entity': 'AWA Group LP', 'relation': 'former_name_date'}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=11, result='2017-01-23', metadata={'sentence': '0', 'ops': '0.0', 'distance': '-2.220446049250313E-16', 'all_relations': '2017-03-16:::2016-01-22:::2016-01-19:::2015-06-30:::2016-04-14:::2016-07-27:::2016-10-28:::2015-06-26:::2015-09-02:::2015-09-29:::2015-12-31', 'chunk': '0', '__trained__': 'AWA Group LP', '__distance_function__': 'cosine', '__relation_name__': 'date', 'entity': 'AWA Group LP', 'relation': 'date'}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=11, result='1645148', metadata={'sentence': '0', 'ops': '0.0', 'distance': '-2.220446049250313E-16', 'all_relations': '', 'chunk': '0', '__trained__': 'AWA Group LP', '__distance_function__': 'cosine', '__relation_name__': 'company_id', 'entity': 'AWA Group LP', 'relation': 'company_id'}, embeddings=[])])]
import json

# Flatten the mapper output into a {relation: first_value} dict
# and pretty-print it as sorted, indented JSON.
json_dict = {ann.metadata['relation']: str(ann.result) for ann in r[0]['mappings']}
print(json.dumps(json_dict, indent=4, sort_keys=True))
{ "business_city": "ROCKY MOUNT", "business_phone": "952-446-6678", "business_state": "NC", "business_street": "116 SOUTH FRANKLIN STREET", "business_zip": "27804", "company_id": "1645148", "date": "2017-01-23", "fiscal_year_end": "630", "former_name": "", "former_name_date": "", "irs_number": "371785232", "name": "AWA Group LP", "sic": "INVESTMENT ADVICE [6282]", "sic_code": "6282", "state_incorporation": "DE", "state_location": "NC" }