# Install the johnsnowlabs library to access Spark-OCR and Spark-NLP for Healthcare, Finance, and Legal.
! pip install -q johnsnowlabs
from google.colab import files
print('Please Upload your John Snow Labs License using the button below')
license_keys = files.upload()
from johnsnowlabs import nlp, finance
# After uploading your license, run this to install all licensed Python wheels and pre-download the Jars for the Spark Session JVM
nlp.install()
from johnsnowlabs import nlp, finance
# Automatically load license data and start a session with all jars the user has access to
spark = nlp.start()
from pyspark.sql.types import StructType, IntegerType, StringType
import pyspark.sql.functions as F
DateNormalizer is an annotator that transforms chunk dates into a normalized date with the format YYYY/MM/DD. It identifies dates in chunk annotations and converts them to YYYY/MM/DD.
We will first build a pipeline that extracts DATE entities, and then create texts containing dates in different formats:
document_assembler = nlp.DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")
sentence_detector = nlp.SentenceDetector()\
.setInputCols(["document"])\
.setOutputCol("sentence")
tokenizer = nlp.Tokenizer()\
.setInputCols(["sentence"])\
.setOutputCol("token")
embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base", "en") \
.setInputCols("sentence", "token") \
.setOutputCol("embeddings")\
.setMaxSentenceLength(512)\
.setCaseSensitive(True)
ner_model = finance.NerModel.pretrained('finner_deid_sec', 'en', 'finance/models')\
.setInputCols(["sentence", "token", "embeddings"])\
.setOutputCol("ner")
ner_converter = finance.NerConverterInternal()\
.setInputCols(["sentence","token","ner"])\
.setOutputCol("date_chunk")\
.setWhiteList(["DATE"])\
date_normalizer = finance.DateNormalizer()\
.setInputCols('date_chunk')\
.setOutputCol('normalized_date')
nlpPipeline = nlp.Pipeline(stages=[
document_assembler,
sentence_detector,
tokenizer,
embeddings,
ner_model,
ner_converter,
date_normalizer])
empty_data = spark.createDataFrame([[""]]).toDF("text")
model = nlpPipeline.fit(empty_data)
bert_embeddings_sec_bert_base download started this may take some time.
Approximate size to download 390.4 MB
[OK!]
finner_deid_sec download started this may take some time.
[OK!]
dates = [
"""The bond's maturity date is set for Jan 31, 2024, at which point the principal amount will be repaid to the bondholders.""",
"""As of the close of business on October 5, 2023, the stock price of XYZ Company reached a new all-time high of $150 per share, reflecting a 20% increase from the previous day's closing price.""",
"""On January 15th, 2023, the company reported a record-breaking revenue of $10 million for the fourth quarter.""",
"""The annual shareholders' meeting is scheduled for 31/05/2023, where the company's financial performance will be discussed.""",
"""The stock market experienced a significant downturn on 21June2022, causing a sharp decline in investors' portfolios.""",
"""The quarterly dividend payment of $0.50 per share will be distributed to shareholders on 12/2023."""
]
df_dates = spark.createDataFrame(dates,StringType()).toDF('text')
df_dates.show(truncate=100)
+----------------------------------------------------------------------------------------------------+
|                                                                                                text|
+----------------------------------------------------------------------------------------------------+
|The bond's maturity date is set for Jan 31, 2024, at which point the principal amount will be rep...|
|As of the close of business on October 5, 2023, the stock price of XYZ Company reached a new all-...|
|On January 15th, 2023, the company reported a record-breaking revenue of $10 million for the four...|
|The annual shareholders' meeting is scheduled for 31/05/2023, where the company's financial perfo...|
|The stock market experienced a significant downturn on 21June2022, causing a sharp decline in inv...|
| The quarterly dividend payment of $0.50 per share will be distributed to shareholders on 12/2023.|
+----------------------------------------------------------------------------------------------------+
result = model.transform(df_dates)
We are going to show how the date is normalized.
result_df = result.select("text",F.explode(F.arrays_zip(result.date_chunk.result, result.normalized_date.result)).alias("cols")) \
.select("text",F.expr("cols['0']").alias("date_chunk"),
F.expr("cols['1']").alias("normalized_date"))
result_df.show(truncate=150)
+-------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+---------------+
|text                                                                                                                                                     |date_chunk        |normalized_date|
+-------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+---------------+
|The bond's maturity date is set for Jan 31, 2024, at which point the principal amount will be repaid to the bondholders.|Jan 31, 2024|2024/01/31|
|As of the close of business on October 5, 2023, the stock price of XYZ Company reached a new all-time high of $150 per share, reflecting a 20% incr...|October 5, 2023|2023/10/05|
|On January 15th, 2023, the company reported a record-breaking revenue of $10 million for the fourth quarter.|January 15th, 2023|2023/01/15|
|The annual shareholders' meeting is scheduled for 31/05/2023, where the company's financial performance will be discussed.|31/05/2023|2023/05/31|
|The stock market experienced a significant downturn on 21June2022, causing a sharp decline in investors' portfolios.|21June2022|2022/06/21|
|The quarterly dividend payment of $0.50 per share will be distributed to shareholders on 12/2023.|12/2023|2023/12/15|
+-------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+---------------+
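For a quick check on a single string, the fitted pipeline can also be wrapped in a LightPipeline. The snippet below is a minimal sketch; the sample sentence and the light_model / light_result names are illustrative, not part of the notebook above.
# Sketch: annotate one string without creating a Spark DataFrame
light_model = nlp.LightPipeline(model)
light_result = light_model.annotate(
    "The merger agreement was signed on March 3rd, 2023.")
# The detected chunk and its normalized form, keyed by the pipeline's output columns
print(light_result["date_chunk"], light_result["normalized_date"])
Next, we add the Replacer annotator so that the original date chunks in each sentence are substituted with their normalized values: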
date_normalizer = finance.DateNormalizer()\
.setInputCols('date_chunk')\
.setOutputCol('normalized_date')
replacer = finance.Replacer()\
.setInputCols(["normalized_date","sentence"])\
.setOutputCol("replaced_document")
nlpPipeline = nlp.Pipeline(stages=[
document_assembler,
sentence_detector,
tokenizer,
embeddings,
ner_model,
ner_converter,
date_normalizer,
replacer])
empty_data = spark.createDataFrame([[""]]).toDF("text")
model = nlpPipeline.fit(empty_data)
result = model.transform(df_dates)
result_df = result.select("text",F.explode(F.arrays_zip(result.date_chunk.result,
result.normalized_date.result,
result.replaced_document.result)).alias("cols")) \
.select("text",F.expr("cols['1']").alias("normalized_date"),
F.expr("cols['2']").alias("replaced_document"))
result_df.show(truncate=False)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                                                               |normalized_date|replaced_document                                                                                                                                                                          |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|The bond's maturity date is set for Jan 31, 2024, at which point the principal amount will be repaid to the bondholders.|2024/01/31|The bond's maturity date is set for 2024/01/31, at which point the principal amount will be repaid to the bondholders.|
|As of the close of business on October 5, 2023, the stock price of XYZ Company reached a new all-time high of $150 per share, reflecting a 20% increase from the previous day's closing price.|2023/10/05|As of the close of business on 2023/10/05, the stock price of XYZ Company reached a new all-time high of $150 per share, reflecting a 20% increase from the previous day's closing price.|
|On January 15th, 2023, the company reported a record-breaking revenue of $10 million for the fourth quarter.|2023/01/15|On 2023/01/15, the company reported a record-breaking revenue of $10 million for the fourth quarter.|
|The annual shareholders' meeting is scheduled for 31/05/2023, where the company's financial performance will be discussed.|2023/05/31|The annual shareholders' meeting is scheduled for 2023/05/31, where the company's financial performance will be discussed.|
|The stock market experienced a significant downturn on 21June2022, causing a sharp decline in investors' portfolios.|2022/06/21|The stock market experienced a significant downturn on 2022/06/21, causing a sharp decline in investors' portfolios.|
|The quarterly dividend payment of $0.50 per share will be distributed to shareholders on 12/2023.|2023/12/15|The quarterly dividend payment of $0.50 per share will be distributed to shareholders on 2023/12/15.|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
With the setOutputDateformat parameter of DateNormalizer, date outputs can be customized in the us (MM/DD/YYYY) or eu (DD/MM/YYYY) format.
date_normalizer_us = finance.DateNormalizer()\
.setInputCols('date_chunk')\
.setOutputCol('normalized_date_us')\
.setOutputDateformat('us')
date_normalizer_eu = finance.DateNormalizer()\
.setInputCols('date_chunk')\
.setOutputCol('normalized_date_eu')\
.setOutputDateformat('eu')
nlpPipeline = nlp.Pipeline(stages=[
document_assembler,
sentence_detector,
tokenizer,
embeddings,
ner_model,
ner_converter,
date_normalizer_us,
date_normalizer_eu
])
empty_data = spark.createDataFrame([[""]]).toDF("text")
model = nlpPipeline.fit(empty_data)
result = model.transform(df_dates)
result_df = result.select("text",F.explode(F.arrays_zip(result.date_chunk.result,
result.normalized_date_us.result,
result.normalized_date_eu.result)).alias("cols")) \
.select("text",F.expr("cols['0']").alias("date_chunk"),
F.expr("cols['1']").alias("normalized_date_us"),
F.expr("cols['2']").alias("normalized_date_eu"))
result_df.show(truncate=False)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+------------------+------------------+
|text                                                                                                                                                                                               |date_chunk        |normalized_date_us|normalized_date_eu|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+------------------+------------------+
|The bond's maturity date is set for Jan 31, 2024, at which point the principal amount will be repaid to the bondholders.|Jan 31, 2024|01/31/2024|31/01/2024|
|As of the close of business on October 5, 2023, the stock price of XYZ Company reached a new all-time high of $150 per share, reflecting a 20% increase from the previous day's closing price.|October 5, 2023|10/05/2023|05/10/2023|
|On January 15th, 2023, the company reported a record-breaking revenue of $10 million for the fourth quarter.|January 15th, 2023|01/15/2023|15/01/2023|
|The annual shareholders' meeting is scheduled for 31/05/2023, where the company's financial performance will be discussed.|31/05/2023|05/31/2023|31/05/2023|
|The stock market experienced a significant downturn on 21June2022, causing a sharp decline in investors' portfolios.|21June2022|06/21/2022|21/06/2022|
|The quarterly dividend payment of $0.50 per share will be distributed to shareholders on 12/2023.|12/2023|12/15/2023|15/12/2023|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+------------------+------------------+
If any of the day, month or year information is missing from the date, the following default values are used:
setDefaultReplacementDay: default value is 15
setDefaultReplacementMonth: default value is 06 (June)
setDefaultReplacementYear: default value is 2020
date_normalizer_us = finance.DateNormalizer()\
.setInputCols('date_chunk')\
.setOutputCol('normalized_date_us')\
.setOutputDateformat('us')\
.setDefaultReplacementDay(2)\
.setDefaultReplacementMonth(3)\
.setDefaultReplacementYear(2024)
nlpPipeline = nlp.Pipeline(stages=[
document_assembler,
sentence_detector,
tokenizer,
embeddings,
ner_model,
ner_converter,
date_normalizer_us
])
empty_data = spark.createDataFrame([[""]]).toDF("text")
model = nlpPipeline.fit(empty_data)
dates = [
'08/02',
'11/2018',
'03/2021',
'05 Jan',
'01/05',
'2022'
]
df_dates = spark.createDataFrame(dates,StringType()).toDF('text')
result = model.transform(df_dates)
result_df = result.select("text",F.explode(F.arrays_zip(result.date_chunk.result,
result.normalized_date_us.result)).alias("cols")) \
.select("text",F.expr("cols['0']").alias("date_chunk"),
F.expr("cols['1']").alias("normalized_date_us"))
result_df.show(truncate=False)
+-------+----------+------------------+
|text   |date_chunk|normalized_date_us|
+-------+----------+------------------+
|08/02  |08/02     |08/02/2024        |
|11/2018|11/2018   |11/02/2018        |
|03/2021|03/2021   |03/02/2021        |
|01/05  |01/05     |01/05/2024        |
+-------+----------+------------------+
We are going to create chunks containing dates in different formats:
dates = [
'08/02/2018',
'11/2018',
'11/01/2018',
'12Mar2021',
'Jan 30, 2018',
'13.04.1999',
'3April 2020',
'03/2021',
'05 Jan',
'01/05',
'2022'
]
df_dates = spark.createDataFrame(dates,StringType()).toDF('ner_chunk')
We are going to transform those texts into documents in Spark NLP.
document_assembler = nlp.DocumentAssembler().setInputCol('ner_chunk').setOutputCol('document')
documents_DF = document_assembler.transform(df_dates)
After that, we are going to transform those documents into chunks.
chunks_df = nlp.map_annotations_col(documents_DF.select("document","ner_chunk"),
lambda x: [nlp.Annotation('chunk', a.begin, a.end, a.result, a.metadata, a.embeddings) for a in x], "document",
"chunk_date", "chunk")
chunks_df.select('chunk_date').show(truncate=False)
+---------------------------------------------------+
|chunk_date                                         |
+---------------------------------------------------+
|[{chunk, 0, 9, 08/02/2018, {sentence -> 0}, []}]   |
|[{chunk, 0, 6, 11/2018, {sentence -> 0}, []}]      |
|[{chunk, 0, 9, 11/01/2018, {sentence -> 0}, []}]   |
|[{chunk, 0, 8, 12Mar2021, {sentence -> 0}, []}]    |
|[{chunk, 0, 11, Jan 30, 2018, {sentence -> 0}, []}]|
|[{chunk, 0, 9, 13.04.1999, {sentence -> 0}, []}]   |
|[{chunk, 0, 10, 3April 2020, {sentence -> 0}, []}] |
|[{chunk, 0, 6, 03/2021, {sentence -> 0}, []}]      |
|[{chunk, 0, 5, 05 Jan, {sentence -> 0}, []}]       |
|[{chunk, 0, 4, 01/05, {sentence -> 0}, []}]        |
|[{chunk, 0, 3, 2022, {sentence -> 0}, []}]         |
+---------------------------------------------------+
Now we are going to normalize those chunks using the DateNormalizer.
date_normalizer = finance.DateNormalizer().setInputCols('chunk_date').setOutputCol('date')
date_normalized_df = date_normalizer.transform(chunks_df)
We are going to show how the date is normalized.
dateNormalizedClean = date_normalized_df.selectExpr("ner_chunk","date.result as date_result","date.metadata as metadata")
dateNormalizedClean.withColumn("date_result", dateNormalizedClean["date_result"]
.getItem(0)).withColumn("metadata", dateNormalizedClean["metadata"]
.getItem(0)['normalized']).show(truncate=False)
+------------+-----------+--------+
|ner_chunk   |date_result|metadata|
+------------+-----------+--------+
|08/02/2018  |2018/08/02 |true    |
|11/2018     |2018/11/15 |true    |
|11/01/2018  |2018/11/01 |true    |
|12Mar2021   |2021/03/12 |true    |
|Jan 30, 2018|2018/01/30 |true    |
|13.04.1999  |1999/04/13 |true    |
|3April 2020 |2020/04/03 |true    |
|03/2021     |2021/03/15 |true    |
|05 Jan      |2020/01/05 |true    |
|01/05       |2020/01/05 |true    |
|2022        |2022/06/15 |true    |
+------------+-----------+--------+
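The normalized flag in the metadata indicates whether a chunk could be normalized. Below is a minimal sketch of how that flag could be used to keep only the chunks that failed normalization; it assumes the flag is set to false for chunks that cannot be parsed, and the failed_df name is illustrative.
# Sketch: filter date chunks by the 'normalized' metadata flag
failed_df = date_normalized_df\
    .selectExpr("ner_chunk",
                "date.result[0] as date_result",
                "date.metadata[0]['normalized'] as normalized")\
    .filter("normalized = 'false'")
failed_df.show(truncate=False)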
We can configure the anchorDateYear, anchorDateMonth and anchorDateDay parameters for relative dates.
rel_dates = [
'next monday',
'today',
'next week'
]
rel_dates_df = spark.createDataFrame(rel_dates,StringType()).toDF('ner_chunk')
rel_documents_DF = document_assembler.transform(rel_dates_df)
rel_chunks_df = nlp.map_annotations_col(rel_documents_DF.select("document","ner_chunk"),
lambda x: [nlp.Annotation('chunk', a.begin, a.end, a.result, a.metadata, a.embeddings) for a in x], "document",
"chunk_date", "chunk")
rel_chunks_df.select('chunk_date').show(truncate=False)
+--------------------------------------------------+
|chunk_date                                        |
+--------------------------------------------------+
|[{chunk, 0, 10, next monday, {sentence -> 0}, []}]|
|[{chunk, 0, 4, today, {sentence -> 0}, []}]       |
|[{chunk, 0, 8, next week, {sentence -> 0}, []}]   |
+--------------------------------------------------+
In the following example we will use 2021/02/16 as the anchor date. To make that possible we set anchorDateYear to 2021, anchorDateMonth to 2 and anchorDateDay to 16, as shown in the configuration below.
rel_date_normalizer = finance.DateNormalizer()\
.setInputCols('chunk_date')\
.setOutputCol('date')\
.setAnchorDateDay(16)\
.setAnchorDateMonth(2)\
.setAnchorDateYear(2021)
rel_date_normalized_df = rel_date_normalizer.transform(rel_chunks_df)
relDateNormalizedClean = rel_date_normalized_df.selectExpr("ner_chunk","date.result as date_result","date.metadata as metadata")
relDateNormalizedClean.withColumn("date_result", relDateNormalizedClean["date_result"].getItem(0))\
.withColumn("metadata", relDateNormalizedClean["metadata"].getItem(0)['normalized']).show(truncate=False)
+-----------+-----------+--------+
|ner_chunk  |date_result|metadata|
+-----------+-----------+--------+
|next monday|2021/02/22 |true    |
|today      |2021/02/16 |true    |
|next week  |2021/02/23 |true    |
+-----------+-----------+--------+
As you can see, relative dates like next monday, today and next week take 2021/02/16 as the reference date.
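As a closing sketch, the anchored DateNormalizer can be combined with the Replacer and the finance NER pipeline defined earlier, so that relative dates found in full sentences are rewritten in place. The names prefixed with anchored_ and doc_assembler_text are illustrative, and the sketch assumes the NER model labels the relative expression as a DATE entity.
# Document assembler reading from the original 'text' column (as at the top of this notebook)
doc_assembler_text = nlp.DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")
# DateNormalizer anchored at 2021/02/16, as in the relative-date example above
anchored_normalizer = finance.DateNormalizer()\
    .setInputCols("date_chunk")\
    .setOutputCol("normalized_date")\
    .setAnchorDateYear(2021)\
    .setAnchorDateMonth(2)\
    .setAnchorDateDay(16)
# Replacer writes the normalized dates back into the sentences
anchored_replacer = finance.Replacer()\
    .setInputCols(["normalized_date", "sentence"])\
    .setOutputCol("replaced_document")
anchored_pipeline = nlp.Pipeline(stages=[
    doc_assembler_text,
    sentence_detector,
    tokenizer,
    embeddings,
    ner_model,
    ner_converter,
    anchored_normalizer,
    anchored_replacer])
anchored_model = anchored_pipeline.fit(spark.createDataFrame([[""]]).toDF("text"))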