! pip install -q johnsnowlabs

from johnsnowlabs import nlp, finance, viz

# nlp.install(force_browser=True)
from google.colab import files

print('Please Upload your John Snow Labs License using the button below')
license_keys = files.upload()

nlp.install()
spark = nlp.start()

def get_generic_base_pipeline():
    """Common components used in all pipelines"""
    document_assembler = nlp.DocumentAssembler()\
        .setInputCol("text")\
        .setOutputCol("document")

    text_splitter = finance.TextSplitter()\
        .setInputCols(["document"])\
        .setOutputCol("sentence")

    tokenizer = nlp.Tokenizer()\
        .setInputCols(["sentence"])\
        .setOutputCol("token")

    embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base", "en")\
        .setInputCols(["sentence", "token"])\
        .setOutputCol("embeddings")

    base_pipeline = nlp.Pipeline(stages=[
        document_assembler,
        text_splitter,
        tokenizer,
        embeddings
    ])

    return base_pipeline

generic_base_pipeline = get_generic_base_pipeline()

# Text Classifier
def get_text_classification_pipeline(model):
    """Builds a pipeline that uses a classification model to decide whether an
    input text belongs to a specific class or not. It is used to find the first
    summary page of the SEC 10-K, the Acquisitions and Subsidiaries sections,
    and the passages where management roles and experience are mentioned."""
    document_assembler = nlp.DocumentAssembler()\
        .setInputCol("text")\
        .setOutputCol("document")

    embeddings = nlp.UniversalSentenceEncoder.pretrained()\
        .setInputCols("document")\
        .setOutputCol("sentence_embeddings")

    classifier = nlp.ClassifierDLModel.pretrained(model, "en", "finance/models")\
        .setInputCols(["sentence_embeddings"])\
        .setOutputCol("category")

    nlpPipeline = nlp.Pipeline(stages=[
        document_assembler,
        embeddings,
        classifier])

    return nlpPipeline

import pandas as pd

def get_relations_df(results, col='relations'):
    """Returns a DataFrame with the relations extracted by Spark NLP"""
    rel_pairs = []
    for rel in results[0][col]:
        rel_pairs.append((
            rel.result,
            rel.metadata['entity1'],
            rel.metadata['entity1_begin'],
            rel.metadata['entity1_end'],
            rel.metadata['chunk1'],
            rel.metadata['entity2'],
            rel.metadata['entity2_begin'],
            rel.metadata['entity2_end'],
            rel.metadata['chunk2'],
            rel.metadata['confidence']
        ))

    rel_df = pd.DataFrame(rel_pairs, columns=['relation', 'entity1', 'entity1_begin', 'entity1_end', 'chunk1',
                                              'entity2', 'entity2_begin', 'entity2_end', 'chunk2', 'confidence'])
    return rel_df

!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/finance-nlp/data/cdns-20220101.html.txt

with open('cdns-20220101.html.txt', 'r') as f:
    cadence_sec10k = f.read()

print(cadence_sec10k[:100])

# SEC filings repeat "Table of Contents" at the top of each page, so it works as a page separator
pages = [x for x in cadence_sec10k.split("Table of Contents") if x.strip() != '']

print(pages[0])

# Some example pages
candidates = [[pages[0]], [pages[1]], [pages[35]], [pages[67]]]

classification_pipeline = get_text_classification_pipeline('finclf_acquisitions_item')

df = spark.createDataFrame(candidates).toDF("text")

model = classification_pipeline.fit(df)

result = model.transform(df)

result.select('category.result').show()
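# Beyond spot-checking a few candidate pages, the same fitted classifier can
# sweep the whole filing and return the page indices it flags. A minimal
# sketch, assuming the `model` fitted above; the positive label name
# ('acquisitions_item') and the `acquisition_pages` variable are assumptions
# for illustration -- inspect the collected labels to confirm the real class name.
from pyspark.sql import functions as F

pages_df = spark.createDataFrame([[p] for p in pages]).toDF("text")
all_results = model.transform(pages_df)

# One classification annotation per page; grab its label string.
labels = [row['result'][0] if row['result'] else 'other'
          for row in all_results.select(F.col('category.result')).collect()]
acquisition_pages = [i for i, label in enumerate(labels) if label == 'acquisitions_item']
print(acquisition_pages)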
.setOutputCol("ner_orgs") ner_converter_org = nlp.NerConverter()\ .setInputCols(["sentence","token","ner_orgs"])\ .setOutputCol("ner_chunk_org")\ chunk_merger = finance.ChunkMergeApproach()\ .setInputCols('ner_chunk_org', "ner_chunk_date")\ .setOutputCol('ner_chunk') pos = nlp.PerceptronModel.pretrained()\ .setInputCols(["sentence", "token"])\ .setOutputCol("pos") dependency_parser = nlp.DependencyParserModel().pretrained("dependency_conllu", "en")\ .setInputCols(["sentence", "pos", "token"])\ .setOutputCol("dependencies") re_filter = finance.RENerChunksFilter()\ .setInputCols(["ner_chunk", "dependencies"])\ .setOutputCol("re_ner_chunk")\ .setRelationPairs(["ORG-ORG", "ORG-DATE"])\ .setMaxSyntacticDistance(10) reDL = finance.RelationExtractionDLModel().pretrained('finre_acquisitions_subsidiaries_md', 'en', 'finance/models')\ .setInputCols(["re_ner_chunk", "sentence"])\ .setOutputCol("relations_acq")\ .setPredictionThreshold(0.1) annotation_merger = finance.AnnotationMerger()\ .setInputCols("relations_acq", "relations_alias")\ .setOutputCol("relations") nlpPipeline = nlp.Pipeline(stages=[ generic_base_pipeline, ner_model_date, ner_converter_date, ner_model_org, ner_converter_org, chunk_merger, pos, dependency_parser, re_filter, reDL, annotation_merger]) empty_data = spark.createDataFrame([[""]]).toDF("text") model = nlpPipeline.fit(empty_data) light_model = nlp.LightPipeline(model) sample_text = pages[67].replace("“", "\"").replace("”", "\"") result = light_model.fullAnnotate(sample_text) rel_df = get_relations_df(result) rel_df rel_df = rel_df[(rel_df["relation"] != "other") & (rel_df["relation"] != "no_rel")] rel_df from sparknlp_display import RelationExtractionVisualizer re_vis = viz.RelationExtractionVisualizer() re_vis.display(result = result[0], relation_col = "relations", document_col = "document", exclude_relations = ["other", "no_rel"], show_relations=True)