# Install the John Snow Labs Python library (quiet mode; notebook shell magic).
! pip install -q johnsnowlabs
Using my.johnsnowlabs.com SSO
from johnsnowlabs import *
# nlp.install(force_browser=True)
If you are not registered on my.johnsnowlabs.com, if you received your license via e-mail, or if you are using Safari, you may need to upload the license manually.
# Colab-only setup: prompt for the John Snow Labs license file, install the
# licensed libraries, then start a Spark session with them on the classpath.
from google.colab import files
print('Please Upload your John Snow Labs License using the button below')
# files.upload() opens a browser file picker and returns {filename: bytes}.
license_keys = files.upload()
nlp.install()
from johnsnowlabs import *
# spark is reused by every pipeline below.
spark = nlp.start()
▒▒▒▒▒▒▒▒▒▒ 100% ᴄᴏᴍᴘʟᴇᴛᴇ!
import requests

# Raw-text sample contracts from the JohnSnowLabs spark-nlp-workshop repository.
_DATA_BASE = ("https://raw.githubusercontent.com/JohnSnowLabs/"
              "spark-nlp-workshop/master/legal-nlp/data")
URL = f"{_DATA_BASE}/commercial_lease_1.txt"
URL_2 = f"{_DATA_BASE}/commercial_lease_2.txt"
URL_3 = f"{_DATA_BASE}/credit_agreement_2.txt"
URL_4 = f"{_DATA_BASE}/loan_agreement.txt"

response = requests.get(URL)
response2 = requests.get(URL_2)
response3 = requests.get(URL_3)
response4 = requests.get(URL_4)
# Fail fast if any download returned an error page instead of the document;
# otherwise the classifiers below would silently classify HTML error text.
for _resp in (response, response2, response3, response4):
    _resp.raise_for_status()

commercial_lease = response.content.decode('utf-8')
commercial_lease_2 = response2.content.decode('utf-8')
credit_agreement = response3.content.decode('utf-8')
loan_agreement = response4.content.decode('utf-8')

# One single-column ([text]) row per document; the row order determines how the
# classifier outputs below line up with the documents.
documents = [[doc] for doc in (commercial_lease, credit_agreement,
                               loan_agreement, commercial_lease_2)]
# Document-level classification: tag each document as commercial lease or other.
document_assembler = nlp.DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")
# Sentence-BERT embeddings computed over each whole document.
embeddings = nlp.BertSentenceEmbeddings.pretrained("sent_bert_base_cased", "en")\
.setInputCols("document")\
.setOutputCol("sentence_embeddings")
# Pretrained legal classifier: labels are "commercial-lease" / "other"
# (see the output below).
doc_classifier = legal.ClassifierDLModel.pretrained("legclf_commercial_lease", "en", "legal/models")\
.setInputCols(["sentence_embeddings"])\
.setOutputCol("category")
nlpPipeline = nlp.Pipeline(stages=[
document_assembler,
embeddings,
doc_classifier])
# One row per document; all stages are pretrained models, so fit() just
# assembles the PipelineModel.
df = spark.createDataFrame(documents).toDF("text")
model = nlpPipeline.fit(df)
result = model.transform(df)
result.select('category.result').show(truncate=False)
sent_bert_base_cased download started this may take some time. Approximate size to download 389.1 MB [OK!] legclf_commercial_lease download started this may take some time. [OK!] +------------------+ |result | +------------------+ |[commercial-lease]| |[other] | |[other] | |[commercial-lease]| +------------------+
# Same document-classification pipeline, now with the loan-agreement classifier.
document_assembler = nlp.DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")
embeddings = nlp.BertSentenceEmbeddings.pretrained("sent_bert_base_cased", "en")\
.setInputCols("document")\
.setOutputCol("sentence_embeddings")
# Pretrained legal classifier: labels are "loan-agreement" / "other"
# (see the output below).
doc_classifier = legal.ClassifierDLModel.pretrained("legclf_loan_agreement_bert", "en", "legal/models")\
.setInputCols(["sentence_embeddings"])\
.setOutputCol("category")
nlpPipeline = nlp.Pipeline(stages=[
document_assembler,
embeddings,
doc_classifier])
df = spark.createDataFrame(documents).toDF("text")
model = nlpPipeline.fit(df)
result = model.transform(df)
result.select('category.result').show(truncate=False)
sent_bert_base_cased download started this may take some time. Approximate size to download 389.1 MB [OK!] legclf_loan_agreement_bert download started this may take some time. [OK!] +----------------+ |result | +----------------+ |[other] | |[other] | |[loan-agreement]| |[other] | +----------------+
📜 Explanation:
.setCustomBounds(["\r\n"]) sets an array of regular expressions that tell the annotator how to split the document (here we split by paragraph).
.setUseCustomBoundsOnly(True) makes the annotator ignore the default regexes ('\n', ...), since the default behaviour of SentenceDetector is sentence splitting.
.setExplodeSentences(True)
# .setExplodeSentences(True) creates one new row in the dataframe per split.
document_assembler = nlp.DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

# Split each document into "pages" on the custom boundary "\r\n\r\n " instead
# of the annotator's default sentence-splitting behaviour.
text_splitter = legal.TextSplitter() \
    .setInputCols(["document"]) \
    .setOutputCol("pages") \
    .setCustomBounds(["\r\n\r\n "]) \
    .setUseCustomBoundsOnly(True) \
    .setExplodeSentences(True)

nlp_pipeline = nlp.Pipeline(stages=[
    document_assembler,
    text_splitter])

# Fit once on one lease, then reuse the fitted model as a LightPipeline to
# annotate the second lease directly from a Python string.
sdf = spark.createDataFrame([[commercial_lease]]).toDF("text")
fit = nlp_pipeline.fit(sdf)
lp = nlp.LightPipeline(fit)
res = lp.annotate(commercial_lease_2)
pages = res['pages']
pages = [p for p in pages if p.strip() != '']  # We remove empty pages
len(pages)
87
# Classify each page produced by the TextSplitter above: introduction clause
# or other. Reuses the document_assembler defined in the splitting section.
embeddings = nlp.BertSentenceEmbeddings.pretrained("sent_bert_base_cased", "en")\
.setInputCols("document")\
.setOutputCol("sentence_embeddings")
# Pretrained clause classifier: labels are "introduction" / "other".
doc_classifier = legal.ClassifierDLModel.pretrained("legclf_introduction_clause", "en", "legal/models")\
.setInputCols(["sentence_embeddings"])\
.setOutputCol("category")
nlpPipeline = nlp.Pipeline(stages=[
document_assembler,
embeddings,
doc_classifier])
# One row per page.
texts = [[i] for i in pages]
df = spark.createDataFrame(texts).toDF("text")
model = nlpPipeline.fit(df)
result = model.transform(df)
result.select('category.result').show()
sent_bert_base_cased download started this may take some time. Approximate size to download 389.1 MB [OK!] legclf_introduction_clause download started this may take some time. [OK!] +--------------+ | result| +--------------+ |[introduction]| | [other]| |[introduction]| |[introduction]| | [other]| | [other]| | [other]| | [other]| | [other]| | [other]| | [other]| | [other]| | [other]| | [other]| | [other]| | [other]| |[introduction]| |[introduction]| | [other]| | [other]| +--------------+ only showing top 20 rows
# Keep only the pages whose top predicted category is not 'other' (i.e. the
# introduction clauses) and pull their raw text back to the driver.
introductory_clause = result.select('text').filter("category.result[0] != 'other'").collect()
print(introductory_clause[1][0])
THIS Lease Agreement , is made and entered into this _____day of May, 2006 by and between Global, Inc., (hereinafter called "Landlord"), and IMI Global, Inc., with a mailing address of ___, (hereinafter referred as "Tenant").
Spark NLP provides pretrained pipelines that have already been fitted with specific annotators and transformers for various use cases, so you don't have to create a pipeline from scratch. If you need to adjust the parameters of the Relation Extraction model, you can use the Relation Extraction pipeline shown below.
Next, we run a pretrained pipeline on the introductory clause
# Pretrained pipeline for detecting entities (NER) with an Introductory-Clause
# specific NER model, and then mapping the relations between them.
legal_pipeline = nlp.PretrainedPipeline("legpipe_ner_contract_doc_parties_alias_former", "en", "legal/models")

# Annotate the second introduction clause found above.
text = [introductory_clause[1][0]]
sdf = spark.createDataFrame([text]).toDF("text")
df = legal_pipeline.transform(sdf)

# fullAnnotate keeps annotation objects (with metadata), which the
# visualizer below needs.
result = legal_pipeline.fullAnnotate(text)[0]
result.keys()

from johnsnowlabs import viz

ner_viz = viz.NerVisualizer()
ner_viz.display(result, label_col='ner_chunk')
# NOTE(review): this import is unused below — nlp.PretrainedPipeline is called
# instead; kept as-is to avoid changing module behaviour.
from sparknlp.pretrained import PretrainedPipeline
# End-to-end pretrained NER + relation-extraction pipeline for contract parties.
pipeline = nlp.PretrainedPipeline("legpipe_re_contract_doc_parties_alias", "en", "legal/models")
legpipe_re_contract_doc_parties_alias download started this may take some time. Approx size to download 868 MB [OK!]
import pandas as pd
def get_relations_df(results, col='relations'):
    """Return a DataFrame with the relations extracted by Spark NLP.

    Parameters
    ----------
    results : list
        Output of ``fullAnnotate``; only ``results[0]`` is read.
    col : str, optional
        Name of the annotation column holding the relation annotations.

    Returns
    -------
    pandas.DataFrame
        One row per relation: relation label, both entities' labels, character
        spans and chunk texts, plus the model confidence (all from the
        annotation metadata, as strings).
    """
    rel_pairs = [
        (
            rel.result,
            rel.metadata['entity1'],
            rel.metadata['entity1_begin'],
            rel.metadata['entity1_end'],
            rel.metadata['chunk1'],
            rel.metadata['entity2'],
            rel.metadata['entity2_begin'],
            rel.metadata['entity2_end'],
            rel.metadata['chunk2'],
            rel.metadata['confidence'],
        )
        for rel in results[0][col]
    ]
    return pd.DataFrame(
        rel_pairs,
        columns=['relation', 'entity1', 'entity1_begin', 'entity1_end', 'chunk1',
                 'entity2', 'entity2_begin', 'entity2_end', 'chunk2', 'confidence'])
# Run the pretrained RE pipeline on the introduction clause and tabulate the
# predicted relations, hiding the 'other' (no-relation) rows.
result = pipeline.fullAnnotate(text)
rel_df = get_relations_df(result)
rel_df[rel_df["relation"] != "other"]
relation | entity1 | entity1_begin | entity1_end | chunk1 | entity2 | entity2_begin | entity2_end | chunk2 | confidence | |
---|---|---|---|---|---|---|---|---|---|---|
0 | dated_as | DOC | 0 | 19 | THIS Lease Agreement | EFFDATE | 62 | 73 | of May, 2006 | 0.9999546 |
1 | signed_by | DOC | 0 | 19 | THIS Lease Agreement | PARTY | 90 | 100 | Global, Inc | 0.9911765 |
2 | has_alias | PARTY | 90 | 100 | Global, Inc | ALIAS | 125 | 132 | Landlord | 0.9999889 |
3 | has_alias | PARTY | 141 | 155 | IMI Global, Inc | ALIAS | 216 | 221 | Tenant | 0.9999893 |
# NOTE(review): this import is unused below — viz.RelationExtractionVisualizer
# is used instead; kept as-is to avoid changing module behaviour.
from sparknlp_display import RelationExtractionVisualizer
# Render the extracted relations as arrows over the clause text.
re_vis = viz.RelationExtractionVisualizer()
re_vis.display(result = result[0], relation_col = "relations", document_col = "document", exclude_relations = ["other"], show_relations=True)
The visualizer above displays the extracted relations between the entities. For more information, see the model's page on the Models Hub:
https://nlp.johnsnowlabs.com/models?q=legre_contract_doc_parties&task=Relation+Extraction
# Manual (non-pretrained-pipeline) relation-extraction stack, stage by stage:
# document -> sentences -> tokens -> embeddings/POS/dependencies -> NER ->
# chunk filtering -> deep-learning relation extraction.
documentAssembler = nlp.DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

textSplitter = legal.TextSplitter()\
    .setInputCols(["document"])\
    .setOutputCol("sentence")

tokenizer = nlp.Tokenizer()\
    .setInputCols("sentence")\
    .setOutputCol("token")

# FIX: the original statement ended with a stray trailing backslash, which
# glued it to the pos_tagger assignment below and broke the script.
embeddings = nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base", "en") \
    .setInputCols("sentence", "token") \
    .setOutputCol("embeddings")

# POS tags and a dependency parse are required by RENerChunksFilter below.
# (pretrained() is called on the class — no throwaway instance needed.)
pos_tagger = nlp.PerceptronModel.pretrained("pos_clinical", "en", "clinical/models") \
    .setInputCols(["sentence", "token"])\
    .setOutputCol("pos_tags")

dependency_parser = nlp.DependencyParserModel.pretrained("dependency_conllu", "en")\
    .setInputCols(["sentence", "pos_tags", "token"])\
    .setOutputCol("dependencies")

ner_model = legal.NerModel.pretrained('legner_contract_doc_parties_lg', 'en', 'legal/models')\
    .setInputCols(["sentence", "token", "embeddings"])\
    .setOutputCol("ner1")

ner_converter = nlp.NerConverter()\
    .setInputCols(["sentence", "token", "ner1"])\
    .setOutputCol("ner_chunks")

# Keep only entity pairs within 7 dependency hops whose labels match one of
# the relation types the RE model can predict.
re_ner_chunk_filter = legal.RENerChunksFilter() \
    .setInputCols(["ner_chunks", "dependencies"])\
    .setOutputCol("re_ner_chunks")\
    .setMaxSyntacticDistance(7)\
    .setRelationPairs(["DOC-EFFDATE", "DOC-PARTY", "PARTY-FORMER_NAME", "ALIAS-PARTY", "PARTY-ALIAS"])

reDL = legal.RelationExtractionDLModel.pretrained('legre_contract_doc_parties_lg', 'en', 'legal/models')\
    .setPredictionThreshold(0.5)\
    .setInputCols(["re_ner_chunks", "sentence"])\
    .setOutputCol("relations")
# Assemble and run the manual relation-extraction pipeline.
nlpPipeline = nlp.Pipeline(stages=[
documentAssembler,
textSplitter,
tokenizer,
embeddings,
pos_tagger,
dependency_parser,
ner_model,
ner_converter,
re_ner_chunk_filter,
reDL
])
text = introductory_clause[1][0]
# All stages are pretrained, so fitting on an empty dataframe just builds
# the PipelineModel without training anything.
empty_df = spark.createDataFrame([[""]]).toDF("text")
model = nlpPipeline.fit(empty_df)
sdf = spark.createDataFrame([[text]]).toDF("text")
res = model.transform(sdf)
res.show(20,truncate=False)
import pyspark.sql.functions as F
# Explode the relations annotation array into one row per relation, pulling
# entity labels, chunk texts, confidence and syntactic distance out of the
# annotation metadata; drop the 'other' (no-relation) predictions.
result_df = res.select(F.explode(F.arrays_zip(res.relations.result,
res.relations.metadata)).alias("cols")) \
.select(
F.expr("cols['0']").alias("relations"),\
F.expr("cols['1']['entity1']").alias("relations_entity1"),\
F.expr("cols['1']['chunk1']" ).alias("relations_chunk1" ),\
F.expr("cols['1']['entity2']").alias("relations_entity2"),\
F.expr("cols['1']['chunk2']" ).alias("relations_chunk2" ),\
F.expr("cols['1']['confidence']" ).alias("confidence" ),\
F.expr("cols['1']['syntactic_distance']" ).alias("syntactic_distance" ),\
).filter("relations!='other'")
result_df.show()
+---------+-----------------+--------------------+-----------------+----------------+----------+------------------+ |relations|relations_entity1| relations_chunk1|relations_entity2|relations_chunk2|confidence|syntactic_distance| +---------+-----------------+--------------------+-----------------+----------------+----------+------------------+ | dated_as| DOC|THIS Lease Agreement| EFFDATE| of May, 2006| 0.9999546| 6| |signed_by| DOC|THIS Lease Agreement| PARTY| Global, Inc| 0.9911765| 7| |has_alias| PARTY| Global, Inc| ALIAS| Landlord| 0.9999889| 4| |has_alias| PARTY| IMI Global, Inc| ALIAS| Tenant| 0.9999893| 4| +---------+-----------------+--------------------+-----------------+----------------+----------+------------------+
# Wrap the fitted model in a LightPipeline for fast single-string annotation,
# then visualize the predicted relations (hiding the "no_rel" label).
light_model = nlp.LightPipeline(model)
result = light_model.fullAnnotate(text)
# from sparknlp_display import RelationExtractionVisualizer
re_vis = viz.RelationExtractionVisualizer()
re_vis.display(result = result[0],
relation_col = "relations",
document_col = "document",
exclude_relations = ["no_rel"],
show_relations=True
)
import json


def _write_rule(path, rule):
    """Serialize a ContextualParser rule definition to *path* as JSON."""
    with open(path, 'w') as f:
        json.dump(rule, f)


# Rule 1: any double-quoted span anywhere in the document is an ALIAS
# candidate (matched at sub-token level).
alias = {
    "entity": "ALIAS",
    "ruleScope": "document",
    "completeMatchRegex": "true",
    "regex": r'".*?"',
    "matchScope": "sub-token",
    "contextLength": 100
}
_write_rule('alias.json', alias)

# Rule 2: stricter variant — the quoted alias must appear inside parentheses,
# e.g. ("Landlord"). Raw string fixes the invalid '\(' escape of the original.
alias_2 = {
    "entity": "ALIAS",
    "ruleScope": "document",
    "completeMatchRegex": "true",
    "regex": r'\("(.*?)"\)',
    "matchScope": "sub-token",
    "contextLength": 100
}
_write_rule('alias_2.json', alias_2)
# Preprocessing for the merged NER pipeline: document -> sentences -> tokens,
# plus two rule-based ContextualParser stages driven by the JSON files
# written above (alias.json / alias_2.json).
documentAssembler = nlp.DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")
textSplitter = legal.TextSplitter()\
.setInputCols(["document"])\
.setOutputCol("sentence")
tokenizer = nlp.Tokenizer()\
.setInputCols("sentence")\
.setOutputCol("token")
# Case-insensitive quoted-span matcher (rule 1).
alias_parser = legal.ContextualParserApproach() \
.setInputCols(["document", "token"]) \
.setOutputCol("subheader")\
.setJsonPath("alias.json") \
.setPrefixAndSuffixMatch(False)\
.setOptionalContextRules(True)\
.setCaseSensitive(False)
# Case-sensitive parenthesized-alias matcher (rule 2).
alias_parser2 = legal.ContextualParserApproach() \
.setInputCols(["document", "token"]) \
.setOutputCol("subheader2")\
.setJsonPath("alias_2.json") \
.setCaseSensitive(True) \
.setPrefixAndSuffixMatch(False)\
.setOptionalContextRules(False)
# FIX: the original statement ended with a stray trailing backslash, which
# glued it to the ner_model assignment below and broke the script.
embeddings = nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base", "en") \
    .setInputCols("sentence", "token") \
    .setOutputCol("embeddings")

# Trained NER model for contract parties.
ner_model = legal.NerModel.pretrained('legner_contract_doc_parties_lg', 'en', 'legal/models')\
    .setInputCols(["sentence", "token", "embeddings"])\
    .setOutputCol("ner")

ner_converter = legal.NerConverterInternal()\
    .setInputCols(["sentence", "token", "ner"])\
    .setThreshold(0.7)\
    .setOutputCol("ner_chunk")

# Zero-shot NER: each entity is defined by natural-language questions rather
# than a trained label set.
zero_shot_ner = legal.ZeroShotNerModel.pretrained("legner_roberta_zeroshot", "en", "legal/models")\
    .setInputCols(["sentence", "token"])\
    .setOutputCol("zero_shot_ner")\
    .setPredictionThreshold(0.3)\
    .setEntityDefinitions(
        {
            "PARTY": ["which Inc?", "Which Ltd?","Which company?","Which party?"],
            "EFFDATE": ["What is the date?"],
            "ALIAS": ["Where is the location?","What Aliases are used to refer to the PARTY?","What Aliases are used to refer to the effdate?","What Aliases are used to refer to the DOC?"],
            "FORMER_NAME": ['Formerly known as?'],
            "ADDRESS":["What is the full location?","where is the address?","Where is the principal location of business?"],
            "DOC":["What agreement?"]
        })

ner_converter_zeroshot = legal.NerConverterInternal()\
    .setInputCols(["sentence", "token", "zero_shot_ner"])\
    .setOutputCol("ner_chunk_zeroshot")\
    .setGreedyMode(True)

# Merge chunks from the trained NER, the zero-shot NER and both contextual
# parsers into a single column.
chunk_merger = legal.ChunkMergeApproach()\
    .setInputCols("ner_chunk", "ner_chunk_zeroshot", "subheader", "subheader2")\
    .setOutputCol('merged_ner_chunks')
# Full merged-NER pipeline: rule-based parsers + trained NER + zero-shot NER,
# combined by the chunk merger.
nlpPipeline = nlp.Pipeline(stages=[
documentAssembler,
textSplitter,
tokenizer,
alias_parser,
alias_parser2,
embeddings,
ner_model,
ner_converter,
zero_shot_ner,
ner_converter_zeroshot,
chunk_merger
])
roberta_embeddings_legal_roberta_base download started this may take some time. Approximate size to download 447.2 MB [OK!] legner_contract_doc_parties_lg download started this may take some time. [OK!] legner_roberta_zeroshot download started this may take some time. [OK!]
# NOTE(review): StructType/StructField/StringType are not used below;
# import kept as-is to avoid changing module behaviour.
from pyspark.sql.types import StructType,StructField, StringType
# Fit on an empty dataframe (contextual parsers train on their JSON rules),
# then annotate the clause and render the merged NER chunks.
p_model = nlpPipeline.fit(spark.createDataFrame([[""]]).toDF("text"))
lp = nlp.LightPipeline(p_model)
# from sparknlp_display import NerVisualizer
visualiser = nlp.viz.NerVisualizer()
lp_res_1 = lp.fullAnnotate(text)
visualiser.display(lp_res_1[0], label_col='merged_ner_chunks', document_col='document')
from pyspark.sql import functions as F
# Tabulate the merged chunks: one row per chunk with its entity label,
# pulled from the annotation metadata.
df = spark.createDataFrame([[text]]).toDF("text")
result = p_model.transform(df)
result.select(F.explode(F.arrays_zip(result.merged_ner_chunks.result, result.merged_ner_chunks.metadata)).alias("cols")) \
.select(F.expr("cols['0']").alias("chunk"),
F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False)
+----------------------+---------+ |chunk |ner_label| +----------------------+---------+ |THIS Lease Agreement |DOC | |of May, 2006 |EFFDATE | |Global, Inc |PARTY | |"Landlord" |ALIAS | |IMI Global, Inc |PARTY | |mailing address of ___|ADDRESS | |"Tenant" |ALIAS | +----------------------+---------+