This notebooks aims to showcase how to use the Deidentification module in johnsnowlabs
library as a helper to carry out all deidentification tasks without any low code.
! pip install -q johnsnowlabs
Using my.johnsnowlabs.com SSO
from johnsnowlabs import nlp, legal
# nlp.install(force_browser=True)
If you are not registered in my.johnsnowlabs.com, you received a license via e-email or you are using Safari, you may need to do a manual update of the license.
from google.colab import files
print('Please Upload your John Snow Labs License using the button below')
license_keys = files.upload()
nlp.install()
spark = nlp.start()
import pandas as pd
from pyspark.sql import DataFrame
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pyspark.sql as SQL
from pyspark import keyword_only
Description of Parameters:
custom_pipeline
: Sparknlp PipelineModel, optional
custom PipelineModel to be used for deidentification, by default None
ner_chunk
: str, optional
final chunk column name of custom pipeline that will be deidentified, by default "ner_chunk"
fields
: dict, optional
fields to be deidentified and their deidentification modes, by default {"text": "mask"}
sentence
: str, optional
sentence column name of the given custom pipeline, by default "sentence"
token
: str, optional
token column name of the given custom pipeline, by default "token"
document
: str, optional
document column name of the given custom pipeline, by default "document"
masking_policy
: str, optional
masking policy, by default "entity_labels"
fixed_mask_length
: int, optional
fixed mask length, by default 4
obfuscate_date
: bool, optional
obfuscate date, by default True
obfuscate_ref_source
: str, optional
obfuscate reference source, by default "faker"
obfuscate_ref_file_path
: str, optional
obfuscate reference file path, by default None
age_group_obfuscation
: bool, optional
age group obfuscation, by default False
age_ranges
: list, optional
age ranges for obfuscation, by default [1, 4, 12, 20, 40, 60, 80]
shift_days
: bool, optional
shift days, by default False
number_of_days
: int, optional
number of days, by default None
documentHashCoder_col_name
: str, optional
document hash coder column name, by default "documentHash"
date_tag
: str, optional
date tag, by default "DATE"
language
: str, optional
language, by default "en"
region
: str, optional
region, by default "us"
unnormalized_date
: bool, optional
unnormalized date, by default False
unnormalized_mode
: str, optional
unnormalized mode, by default "mask"
id_column_name
: str, optional
ID column name, by default "id"
date_shift_column_name
: str, optional
date shift column name, by default "date_shift"
separator
: str, optional
separator of input csv file, by default "\t"
input_file_path
: str, optional
input file path, by default None
output_file_path
: str, optional
output file path, by default 'deidentified.csv'
Returns
Spark DataFrame: Spark DataFrame with deidentified text
csv/json file: A deidentified file.
text= """EMPLOYMENT AGREEMENT, effective as of June 1, 2013 between Synergy Resources Corporation, a Colorado corporation (the "Company"), and John E. Smith (the "Employee").
This First Amendment (Amendment) to the Employment Agreement between Service 1st Bank located in Stockton, California (Bank) and John E. Smith (Executive) is adopted, effective as of August 21, 2008, as set forth below."""
df = spark.createDataFrame([[text]]).toDF("text")
df_pd = df.toPandas()
df_pd.to_csv("deid_data.csv", sep='@', index=False)
documentAssembler = nlp.DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")
sentenceDetector = nlp.SentenceDetector()\
.setInputCols(["document"])\
.setOutputCol("sentence")
tokenizer = nlp.Tokenizer()\
.setInputCols(["sentence"])\
.setOutputCol("token")
embeddings = nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base","en") \
.setInputCols(["sentence", "token"]) \
.setOutputCol("embeddings")
legal_ner = legal.NerModel.pretrained("legner_contract_doc_parties_lg", "en", "legal/models")\
.setInputCols(["sentence", "token", "embeddings"]) \
.setOutputCol("ner")
#.setLabelCasing("upper")
ner_converter = legal.NerConverterInternal() \
.setInputCols(["sentence", "token", "ner"])\
.setOutputCol("ner_chunk")
nlpPipeline = nlp.Pipeline(stages=[
documentAssembler,
sentenceDetector,
tokenizer,
embeddings,
legal_ner,
ner_converter])
empty_data = spark.createDataFrame([[""]]).toDF("text")
model = nlpPipeline.fit(empty_data)
roberta_embeddings_legal_roberta_base download started this may take some time. Approximate size to download 447.2 MB [OK!] legner_contract_doc_parties_lg download started this may take some time. [OK!]
result = model.transform(spark.createDataFrame([[text]]).toDF("text"))
result.show()
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ | text| document| sentence| token| embeddings| ner| ner_chunk| +--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ |EMPLOYMENT AGREEM...|[{document, 0, 39...|[{document, 0, 17...|[{token, 0, 9, EM...|[{word_embeddings...|[{named_entity, 0...|[{chunk, 0, 19, E...| +--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
result.select(F.explode('sentence')).show(truncate=50)
+--------------------------------------------------+ | col| +--------------------------------------------------+ |{document, 0, 176, EMPLOYMENT AGREEMENT, effe...| |{document, 178, 396, This First Amendment (Amen...| +--------------------------------------------------+
from pyspark.sql import functions as F
result_df = result.select(F.explode(F.arrays_zip(result.token.result,
result.ner.result)).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
F.expr("cols['1']").alias("ner_label"))
result_df.show()
+-----------+---------+ | token|ner_label| +-----------+---------+ | EMPLOYMENT| B-DOC| | AGREEMENT| I-DOC| | ,| O| | effective| O| | as| O| | of| O| | June|B-EFFDATE| | 1|I-EFFDATE| | ,|I-EFFDATE| | 2013|I-EFFDATE| | between| O| | Synergy| B-PARTY| | Resources| I-PARTY| |Corporation| I-PARTY| | ,| O| | a| O| | Colorado| O| |corporation| O| | (| O| | the| O| +-----------+---------+ only showing top 20 rows
result_df.select("token", "ner_label").groupBy('ner_label').count().orderBy('count', ascending=False).show(truncate=False)
+---------+-----+ |ner_label|count| +---------+-----+ |O |49 | |I-PARTY |10 | |I-EFFDATE|6 | |B-ALIAS |4 | |B-PARTY |4 | |B-DOC |2 | |I-DOC |2 | |B-EFFDATE|2 | +---------+-----+
deid_implementor = legal.Deid(spark,
input_file_path="deid_data.csv",
output_file_path="deidentified.csv",
custom_pipeline=model,
separator='@')
res = deid_implementor.deidentify()
Deidentification process of the 'text' field has begun... Deidentification process of the 'text' field was completed... Deidentifcation successfully completed and the results saved as 'deidentified.csv' !
res.show(n=50, truncate=False)
+---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ |ID |text |text_deidentified | +---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ |0 |EMPLOYMENT AGREEMENT, effective as of June 1, 2013 between Synergy Resources Corporation, a Colorado corporation (the "Company"), and John E. Smith (the "Employee"). |<DOC>, effective as of <EFFDATE> between <PARTY>, a Colorado corporation (the "<ALIAS>"), and <PARTY> (the "<ALIAS>"). | |0 |This First Amendment (Amendment) to the Employment Agreement between Service 1st Bank located in Stockton, California (Bank) and John E. Smith (Executive) is adopted, effective as of August 21, 2008, as set forth below.|This First Amendment (Amendment) to the <DOC> between <PARTY> located in Stockton, California (<ALIAS>) and <PARTY> (<ALIAS>) is adopted, effective as of <EFFDATE>, as set forth below.| +---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
#checking saved output file
import pandas as pd
res_data = pd.read_csv("deidentified.csv")
res_data.head()
ID | text | text_deidentified | |
---|---|---|---|
0 | 0 | EMPLOYMENT AGREEMENT, effective as of Jun... | <DOC>, effective as of <EFFDATE> between... |
1 | 0 | This First Amendment (Amendment) to the Employ... | This First Amendment (Amendment) to the <DOC> ... |
deid_implementor = legal.Deid(spark,
input_file_path="deid_data.csv",
output_file_path="deidentified.csv",
custom_pipeline=model,
fields={"text": "mask"}, masking_policy="same_length_chars")
res = deid_implementor.deidentify()
Deidentification process of the 'text' field has begun... Deidentification process of the 'text' field was completed... Deidentifcation successfully completed and the results saved as 'deidentified.csv' !
res.show(truncate=120)
+---+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+ | ID| text| text_deidentified| +---+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+ | 0|EMPLOYMENT AGREEMENT, effective as of June 1, 2013 between Synergy Resources Corporation, a Colorado corpo...|[******************], effective as of [************] between [****************************], a Colorado corpo...| | 0|This First Amendment (Amendment) to the Employment Agreement between Service 1st Bank located in Stockton, California...|This First Amendment (Amendment) to the [******************] between [**************] located in Stockton, California...| +---+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
deid_implementor = legal.Deid(spark,
input_file_path="deid_data.csv",
output_file_path="deidentified.csv",
custom_pipeline=model,
fields={"text": "mask"}, masking_policy="fixed_length_chars", fixed_mask_length=2)
res = deid_implementor.deidentify()
Deidentification process of the 'text' field has begun... Deidentification process of the 'text' field was completed... Deidentifcation successfully completed and the results saved as 'deidentified.csv' !
res.show(truncate=120)
+---+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+ | ID| text| text_deidentified| +---+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+ | 0|EMPLOYMENT AGREEMENT, effective as of June 1, 2013 between Synergy Resources Corporation, a Colorado corpo...| **, effective as of ** between **, a Colorado corporation (the "**"), and ** (the "**").| | 0|This First Amendment (Amendment) to the Employment Agreement between Service 1st Bank located in Stockton, California...|This First Amendment (Amendment) to the ** between ** located in Stockton, California (**) and ** (**) is adopted, ef...| +---+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
obs_lines = """John Snow Labs#PARTY
Amazon INC#PARTY
1st June, 2023#EFFDATE
23 of July, 2023#EFFDATE
Party 1#ALIAS
Party 2#ALIAS
PRIVATE AGREEMENT#DOC
CONTRACT#DOC
"""
with open ('obfuscation.txt', 'w') as f:
f.write(obs_lines)
df= spark.createDataFrame([[text]]).toDF("text")
df_pd= df.toPandas()
df_pd.to_csv("deid_obfs_data.csv", index=False)
deid_implementor = legal.Deid(spark,
input_file_path="deid_obfs_data.csv",
output_file_path="deidentified.csv",
custom_pipeline=model,
fields={"text": "obfuscate"}, obfuscate_ref_source="file",
obfuscate_ref_file_path="obfuscation.txt")
res = deid_implementor.deidentify()
Deidentification process of the 'text' field has begun... Deidentification process of the 'text' field was completed... Deidentifcation successfully completed and the results saved as 'deidentified.csv' !
res.show(truncate=120)
+---+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+ | ID| text| text_deidentified| +---+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+ | 0|EMPLOYMENT AGREEMENT, effective as of June 1, 2013 between Synergy Resources Corporation, a Colorado corpo...|CONTRACT, effective as of 1st June, 2023 between John Snow Labs, a Colorado corporation (the "Party 2"), and...| | 0|This First Amendment (Amendment) to the Employment Agreement between Service 1st Bank located in Stockton, California...|This First Amendment (Amendment) to the CONTRACT between Amazon INC located in Stockton, California (Party 1) and Ama...| +---+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
You can also use our internal faker library has its own vocabulary (with a predefined vocabulary for ORG, DOCUMENT TYPES, etc).
However, some entities may not be supported by faker, as the number of models increase in the Financial NLP library. If so, you will just see
In that case, please come back to a mixed or file-only approaches.
This option uses both internal faker library and the file.
deid_implementor = legal.Deid(spark,
input_file_path="deid_obfs_data.csv",
output_file_path="deidentified.csv",
custom_pipeline=model,
fields={"text": "obfuscate"}, obfuscate_ref_source="both",
obfuscate_ref_file_path="obfuscation.txt")
res = deid_implementor.deidentify()
Deidentification process of the 'text' field has begun... Deidentification process of the 'text' field was completed... Deidentifcation successfully completed and the results saved as 'deidentified.csv' !
res.show(truncate=120)
+---+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+ | ID| text| text_deidentified| +---+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+ | 0|EMPLOYMENT AGREEMENT, effective as of June 1, 2013 between Synergy Resources Corporation, a Colorado corpo...|CONTRACT, effective as of 1st June, 2023 between Amazon INC, a Colorado corporation (the "Party 2"), and Joh...| | 0|This First Amendment (Amendment) to the Employment Agreement between Service 1st Bank located in Stockton, California...|This First Amendment (Amendment) to the PRIVATE AGREEMENT between John Snow Labs located in Stockton, California (Par...| +---+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
import pandas as pd
data = pd.DataFrame(
{'clientID' : ['A001', 'A001', 'A002'],
'text' : ['EMPLOYMENT AGREEMENT, effective as of June 1, 2013',
'This First Amendment adopted, effective as of August 21, 2008',
'Amendment to the Employment Agreement between Service 1st Bank located in Stockton, California (Bank) and John E. Smith (Executive) by 01/06/2023'
]
}
)
my_input_df = spark.createDataFrame(data)
my_input_df.show(truncate = False)
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+ |clientID|text | +--------+-------------------------------------------------------------------------------------------------------------------------------------------------+ |A001 |EMPLOYMENT AGREEMENT, effective as of June 1, 2013 | |A001 |This First Amendment adopted, effective as of August 21, 2008 | |A002 |Amendment to the Employment Agreement between Service 1st Bank located in Stockton, California (Bank) and John E. Smith (Executive) by 01/06/2023| +--------+-------------------------------------------------------------------------------------------------------------------------------------------------+
data
clientID | text | |
---|---|---|
0 | A001 | EMPLOYMENT AGREEMENT, effective as of June 1, ... |
1 | A001 | This First Amendment adopted, effective as of ... |
2 | A002 | Amendment to the Employment Agreement between ... |
documentAssembler = nlp.DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
date = nlp.DateMatcher() \
.setInputCols("document") \
.setOutputCol("date") \
.setOutputFormat("dd/MM/yyyy")
#.setAnchorDateYear(2020) \ Use these if you want to stick with a specific month and range of days
#.setAnchorDateMonth(1) \
#.setAnchorDateDay(11) \
pipeline = nlp.Pipeline().setStages([
documentAssembler,
date
])
result = pipeline.fit(my_input_df).transform(my_input_df)
result.select("clientID", "text", "date.begin", "date.end", "date.result").show(truncate=False)
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+-----+------------+ |clientID|text |begin|end |result | +--------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+-----+------------+ |A001 |EMPLOYMENT AGREEMENT, effective as of June 1, 2013 |[38] |[49] |[01/06/2013]| |A001 |This First Amendment adopted, effective as of August 21, 2008 |[46] |[60] |[21/08/2008]| |A002 |Amendment to the Employment Agreement between Service 1st Bank located in Stockton, California (Bank) and John E. Smith (Executive) by 01/06/2023|[135]|[144]|[06/01/2023]| +--------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+-----+------------+
You can then use result
and text[begin:end]
to modify your original strings
We will normalize the dates first using a DateMatcher
import pandas as pd
data = pd.DataFrame(
{'clientID' : ['A001', 'A001', 'A002'],
'text' : ['EMPLOYMENT AGREEMENT, effective as of 01/06/2013',
'This First Amendment adopted, effective as of 21/08/2008',
'Amendment to the Employment Agreement between Service 1st Bank located in Stockton, California (Bank) and John E. Smith (Executive) by 06/01/2023'
]
}
)
my_input_df = spark.createDataFrame(data)
my_input_df.show(truncate = False)
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+ |clientID|text | +--------+-------------------------------------------------------------------------------------------------------------------------------------------------+ |A001 |EMPLOYMENT AGREEMENT, effective as of 01/06/2013 | |A001 |This First Amendment adopted, effective as of 21/08/2008 | |A002 |Amendment to the Employment Agreement between Service 1st Bank located in Stockton, California (Bank) and John E. Smith (Executive) by 06/01/2023| +--------+-------------------------------------------------------------------------------------------------------------------------------------------------+
df_pd = my_input_df.toPandas()
df_pd.to_csv("deid_id_data.csv", index=False)
Custom pipeline with DocumentHashCoder()
.
documentAssembler = nlp.DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")
documentHasher = legal.DocumentHashCoder()\
.setInputCols("document")\
.setOutputCol("document2")\
.setRangeDays(100)\
.setNewDateShift("shift_days")\
.setPatientIdColumn("clientID")\
.setSeed(100)
tokenizer = nlp.Tokenizer()\
.setInputCols(["document2"])\
.setOutputCol("token")
embeddings = nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base","en") \
.setInputCols(["document2", "token"]) \
.setOutputCol("embeddings")
ner_model = legal.NerModel.pretrained('legner_deid', "en", "legal/models")\
.setInputCols(["document2", "token", "embeddings"])\
.setOutputCol("ner")
ner_converter = nlp.NerConverter()\
.setInputCols(["document2","token","ner"])\
.setOutputCol("ner_chunk")
nlpPipeline = nlp.Pipeline().setStages([
documentAssembler,
documentHasher,
tokenizer,
embeddings,
ner_model,
ner_converter])
empty_data = spark.createDataFrame([["", ""]]).toDF("text", "clientID")
pipeline_model = nlpPipeline.fit(empty_data)
roberta_embeddings_legal_roberta_base download started this may take some time. Approximate size to download 447.2 MB [OK!] legner_deid download started this may take some time. [OK!]
deid_implementor = legal.Deid(spark,
input_file_path="deid_id_data.csv",
output_file_path="deidentified.csv",
custom_pipeline=pipeline_model,
fields={"text": "obfuscate"},
shift_days=True,
obfuscate_date=True,
ner_chunk="ner_chunk",
token="token",
documenthashcoder_col_name="document2",
separator=",",
unnormalized_date=False)
res = deid_implementor.deidentify()
Deidentification process of the 'text' field has begun... Deidentification process of the 'text' field was completed... Deidentifcation successfully completed and the results saved as 'deidentified.csv' !
res.show(truncate=False)
+---+-------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ |ID |text |text_deidentified | +---+-------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ |0 |EMPLOYMENT AGREEMENT, effective as of 01/06/2013 |EMPLOYMENT AGREEMENT, effective as of 01/01/2013 | |1 |This First Amendment adopted, effective as of 21/08/2008 |This First Amendment adopted, effective as of 16/08/2008 | |2 |Amendment to the Employment Agreement between Service 1st Bank located in Stockton, California (Bank) and John E. Smith (Executive) by 06/01/2023|Amendment to the Employment Agreement between <ORG> located in Conway, Oklahoma (Bank) and <PERSON> (Executive) by 06/30/2023| +---+-------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
data = pd.DataFrame(
{'clientID' : ['A001', 'A001', 'A002'],
'text' : ['EMPLOYMENT AGREEMENT, effective as of 10 June 2013',
'This First Amendment adopted, effective as of August 8th, 2008',
'Amendment to the Employment Agreement between Service 1st Bank located in Stockton, California (Bank) and John E. Smith (Executive) by 06/01/2023'
],
'dateshift' : ['10', '-2', '30']
}
)
my_input_df = spark.createDataFrame(data)
my_input_df.show(truncate=False)
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+---------+ |clientID|text |dateshift| +--------+-------------------------------------------------------------------------------------------------------------------------------------------------+---------+ |A001 |EMPLOYMENT AGREEMENT, effective as of 10 June 2013 |10 | |A001 |This First Amendment adopted, effective as of August 8th, 2008 |-2 | |A002 |Amendment to the Employment Agreement between Service 1st Bank located in Stockton, California (Bank) and John E. Smith (Executive) by 06/01/2023|30 | +--------+-------------------------------------------------------------------------------------------------------------------------------------------------+---------+
df_pd= my_input_df.toPandas()
df_pd.to_csv("deid_specific_data.csv", index=False)
documentAssembler = nlp.DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")
documentHasher = legal.DocumentHashCoder()\
.setInputCols("document")\
.setOutputCol("document2")\
.setDateShiftColumn("dateshift")\
tokenizer = nlp.Tokenizer()\
.setInputCols(["document2"])\
.setOutputCol("token")
embeddings = nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base","en") \
.setInputCols(["document2", "token"]) \
.setOutputCol("embeddings")
ner_model = legal.NerModel.pretrained('legner_deid', "en", "legal/models")\
.setInputCols(["document2", "token", "embeddings"])\
.setOutputCol("ner")
ner_converter = nlp.NerConverter()\
.setInputCols(["document2","token","ner"])\
.setOutputCol("ner_chunk")
nlpPipeline = nlp.Pipeline().setStages([
documentAssembler,
documentHasher,
tokenizer,
embeddings,
ner_model,
ner_converter])
empty_data = spark.createDataFrame([["", "", ""]]).toDF("clientID","text", "dateshift")
pipeline_col_model = nlpPipeline.fit(empty_data)
roberta_embeddings_legal_roberta_base download started this may take some time. Approximate size to download 447.2 MB [OK!] legner_deid download started this may take some time. [OK!]
deid_implementor = legal.Deid(spark,
input_file_path="deid_specific_data.csv",
separator=",",
output_file_path="deid_specific_data.csv",
custom_pipeline=pipeline_col_model,
fields={"text": "obfuscate"},
shift_days=True,
obfuscate_date=True,
ner_chunk="ner_chunk",
token="token",
documenthashcoder_col_name="document2")
res = deid_implementor.deidentify()
Deidentification process of the 'text' field has begun... Deidentification process of the 'text' field was completed... Deidentifcation successfully completed and the results saved as 'deid_specific_data.csv' !
res.show(truncate=False)
+---+-------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------+ |ID |text |text_deidentified | +---+-------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------+ |0 |EMPLOYMENT AGREEMENT, effective as of 10 June 2013 |EMPLOYMENT AGREEMENT, effective as of 20 June 2013 | |1 |This First Amendment adopted, effective as of August 8th, 2008 |This First Amendment adopted, effective as of August 6th, 2008 | |2 |Amendment to the Employment Agreement between Service 1st Bank located in Stockton, California (Bank) and John E. Smith (Executive) by 06/01/2023|Amendment to the Employment Agreement between <ORG> located in Oakland, South Carolina (Bank) and <PERSON> (Executive) by 07/01/2023| +---+-------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------+
import pandas as pd
data = pd.DataFrame(
{'clientID' : ['A001', 'A001', 'A002'],
'text' : ['EMPLOYMENT AGREEMENT, effective as of 3May2002',
'This First Amendment adopted, effective as of Agust 8th, 2008',
'Amendment to the Employment Agreement between Service 1st Bank located in Stockton, California (Bank) and John E. Smith (Executive) by 06/01/2023'
],
'dateshift' : ['10', '-2', '30']
}
)
my_input_df = spark.createDataFrame(data)
my_input_df.show(truncate=False)
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+---------+ |clientID|text |dateshift| +--------+-------------------------------------------------------------------------------------------------------------------------------------------------+---------+ |A001 |EMPLOYMENT AGREEMENT, effective as of 3May2002 |10 | |A001 |This First Amendment adopted, effective as of Agust 8th, 2008 |-2 | |A002 |Amendment to the Employment Agreement between Service 1st Bank located in Stockton, California (Bank) and John E. Smith (Executive) by 06/01/2023|30 | +--------+-------------------------------------------------------------------------------------------------------------------------------------------------+---------+
df_pd = my_input_df.toPandas()
df_pd.to_csv("deid_unnormalized_data.csv", index=False)
deid_implementor = legal.Deid(spark,
input_file_path="deid_unnormalized_data.csv",
output_file_path="deidentified.csv",
custom_pipeline=pipeline_col_model,
fields={"text": "obfuscate"},
shift_days=True,
obfuscate_date=True,
ner_chunk="ner_chunk",
token="token",
documenthashcoder_col_name="document2",
separator=",",
unnormalized_date=True,
unnormalized_mode="mask"
)
res = deid_implementor.deidentify()
Deidentification process of the 'text' field has begun... Deidentification process of the 'text' field was completed... Deidentifcation successfully completed and the results saved as 'deidentified.csv' !
res.show(truncate=False)
+---+-------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ |ID |text |text_deidentified | +---+-------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ |0 |EMPLOYMENT AGREEMENT, effective as of 3May2002 |EMPLOYMENT AGREEMENT, effective as of <DATE> | |1 |This First Amendment adopted, effective as of Agust 8th, 2008 |This First Amendment adopted, effective as of <DATE> | |2 |Amendment to the Employment Agreement between Service 1st Bank located in Stockton, California (Bank) and John E. Smith (Executive) by 06/01/2023|Amendment to the Employment Agreement between <ORG> located in Toruń, Wyoming (Bank) and <PERSON> (Executive) by 07/01/2023| +---+-------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
unnormalized_mode="obfuscate"
deid_implementor = legal.Deid(spark,
input_file_path="deid_unnormalized_data.csv",
output_file_path="deidentified1.csv",
custom_pipeline=pipeline_col_model,
fields={"text": "obfuscate"},
shift_days=True,
obfuscate_date=True,
ner_chunk="ner_chunk",
token="token",
documenthashcoder_col_name="document2",
separator=",",
unnormalized_date=True,
unnormalized_mode="obfuscate"
)
res = deid_implementor.deidentify()
Deidentification process of the 'text' field has begun... Deidentification process of the 'text' field was completed... Deidentifcation successfully completed and the results saved as 'deidentified1.csv' !
res.show(truncate=False)
+---+-------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------+ |ID |text |text_deidentified | +---+-------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------+ |0 |EMPLOYMENT AGREEMENT, effective as of 3May2002 |EMPLOYMENT AGREEMENT, effective as of 04-04-1987 | |1 |This First Amendment adopted, effective as of Agust 8th, 2008 |This First Amendment adopted, effective as of 11-03-2000 | |2 |Amendment to the Employment Agreement between Service 1st Bank located in Stockton, California (Bank) and John E. Smith (Executive) by 06/01/2023|Amendment to the Employment Agreement between <ORG> located in Cushing, Iowa (Bank) and <PERSON> (Executive) by 07/01/2023| +---+-------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------+
This pipeline does not include the masking of DOCUMENT TYPE.
deid_implementor = legal.Deid(spark,
ner_chunk = "merged_ner_chunks",
input_file_path="deid_data.csv",
output_file_path="deidentified_custompipe.csv",
domain="legal")
res.show(truncate=False)
+---+-------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------+ |ID |text |text_deidentified | +---+-------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------+ |0 |EMPLOYMENT AGREEMENT, effective as of 3May2002 |EMPLOYMENT AGREEMENT, effective as of 04-04-1987 | |1 |This First Amendment adopted, effective as of Agust 8th, 2008 |This First Amendment adopted, effective as of 11-03-2000 | |2 |Amendment to the Employment Agreement between Service 1st Bank located in Stockton, California (Bank) and John E. Smith (Executive) by 06/01/2023|Amendment to the Employment Agreement between <ORG> located in Cushing, Iowa (Bank) and <PERSON> (Executive) by 07/01/2023| +---+-------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------+
# sample data
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/finance-nlp/data/hipaa-table-001.txt
df = spark.read.format("csv") \
.option("sep", "\t") \
.option("inferSchema", "true") \
.option("header", "true") \
.load("hipaa-table-001.txt")
df.show(truncate=False)
+---------------+----------+---+----------------------------------------------------+-------+--------------+---+---+ |NAME |DOB |AGE|ADDRESS |ZIPCODE|TEL |SBP|DBP| +---------------+----------+---+----------------------------------------------------+-------+--------------+---+---+ |Cecilia Chapman|04/02/1935|83 |711-2880 Nulla St. Mankato Mississippi |69200 |(257) 563-7401|101|42 | |Iris Watson |03/10/2009|9 |P.O. Box 283 8562 Fusce Rd. Frederick Nebraska |20620 |(372) 587-2335|159|122| |Bryar Pitts |11/01/1921|98 |5543 Aliquet St. Fort Dodge GA |20783 |(717) 450-4729|149|52 | |Theodore Lowe |13/02/2002|16 |Ap #867-859 Sit Rd. Azusa New York |39531 |(793) 151-6230|134|115| |Calista Wise |20/08/1942|76 |7292 Dictum Av. San Antonio MI |47096 |(492) 709-6392|139|78 | |Kyla Olsen |12/05/1973|45 |Ap #651-8679 Sodales Av. Tamuning PA |10855 |(654) 393-5734|120|112| |Forrest Ray |11/01/1991|27 |191-103 Integer Rd. Corona New Mexico |8219 |(404) 960-3807|143|126| |Hiroko Potter |18/11/1937|81 |P.O. Box 887 2508 Dolor. Av. Muskegon KY |12482 |(314) 244-6306|147|75 | |Celeste Slater |12/05/1980|38 |606-3727 Ullamcorper. Street Roseville NH |11523 |(786) 713-8616|147|123| |Nyssa Vazquez |24/09/1956|62 |511-5762 At Rd. Chelsea MI |67708 |(947) 278-5929|129|50 | |Lawrence Moreno|26/12/1906|112|935-9940 Tortor. Street Santa Rosa MN |98804 |(684) 579-1879|133|102| |Ina Moran |26/10/1983|35 |P.O. Box 929 4189 Nunc Road Lebanon KY |69409 |(389) 737-2852|101|67 | |Aaron Hawkins |26/09/2009|9 |5587 Nunc. Avenue Erie Rhode Island |24975 |(660) 663-4518|87 |81 | |Hedy Greene |03/10/1920|98 |Ap #696-3279 Viverra. Avenue Latrobe DE |38100 |(608) 265-2215|128|123| |Melvin Porter |14/08/1911|107|P.O. Box 132 1599 Curabitur Rd. Bandera South Dakota|45149 |(959) 119-8364|83 |43 | |Keefe Sellers |16/05/1937|81 |347-7666 Iaculis St. Woodruff SC |49854 |(468) 353-2641|148|109| |Joan Romero |08/12/2004|14 |666-4366 Lacinia Avenue Idaho Falls Ohio |19253 |(248) 675-4007|75 |53 | |Davis Patrick |09/01/1956|63 |P.O. Box 147 2546 Sociosqu Rd. Bethlehem Utah |2913 |(939) 353-1107|142|62 | |Leilani Boyer |18/10/1934|84 |557-6308 Lacinia Road San Bernardino ND |9289 |(570) 873-7090|137|48 | |Colby Bernard |02/10/1905|113|Ap #285-7193 Ullamcorper Avenue Amesbury HI |93373 |(302) 259-2375|84 |41 | +---------------+----------+---+----------------------------------------------------+-------+--------------+---+---+ only showing top 20 rows
obfuscator = legal.StructuredDeidentification(spark,{"NAME":"NAME","AGE":"AGE"}, obfuscateRefSource = "faker")
obfuscator_df = obfuscator.obfuscateColumns(df)
obfuscator_df.show(truncate=False)
+--------------------+----------+-----+----------------------------------------------------+-------+--------------+---+---+ |NAME |DOB |AGE |ADDRESS |ZIPCODE|TEL |SBP|DBP| +--------------------+----------+-----+----------------------------------------------------+-------+--------------+---+---+ |[Natasha Bence] |04/02/1935|[93] |711-2880 Nulla St. Mankato Mississippi |69200 |(257) 563-7401|101|42 | |[Karie Chimera] |03/10/2009|[5] |P.O. Box 283 8562 Fusce Rd. Frederick Nebraska |20620 |(372) 587-2335|159|122| |[Lucita Ferrara] |11/01/1921|[93] |5543 Aliquet St. Fort Dodge GA |20783 |(717) 450-4729|149|52 | |[Lorane Gell] |13/02/2002|[12] |Ap #867-859 Sit Rd. Azusa New York |39531 |(793) 151-6230|134|115| |[Lowell Guitar] |20/08/1942|[73] |7292 Dictum Av. San Antonio MI |47096 |(492) 709-6392|139|78 | |[Jimmey Ralph] |12/05/1973|[46] |Ap #651-8679 Sodales Av. Tamuning PA |10855 |(654) 393-5734|120|112| |[Gwendlyn Deutscher]|11/01/1991|[30] |191-103 Integer Rd. Corona New Mexico |8219 |(404) 960-3807|143|126| |[Doyle Askew] |18/11/1937|[84] |P.O. Box 887 2508 Dolor. Av. Muskegon KY |12482 |(314) 244-6306|147|75 | |[Volanda Napoleon] |12/05/1980|[37] |606-3727 Ullamcorper. Street Roseville NH |11523 |(786) 713-8616|147|123| |[Curt Bears] |24/09/1956|[72] |511-5762 At Rd. Chelsea MI |67708 |(947) 278-5929|129|50 | |[Johna Sheriff] |26/12/1906|[106]|935-9940 Tortor. Street Santa Rosa MN |98804 |(684) 579-1879|133|102| |[Nedra Hai] |26/10/1983|[22] |P.O. Box 929 4189 Nunc Road Lebanon KY |69409 |(389) 737-2852|101|67 | |[Lady Saucier] |26/09/2009|[5] |5587 Nunc. Avenue Erie Rhode Island |24975 |(660) 663-4518|87 |81 | |[Gracy Bruins] |03/10/1920|[93] |Ap #696-3279 Viverra. Avenue Latrobe DE |38100 |(608) 265-2215|128|123| |[Lennie Hummer] |14/08/1911|[103]|P.O. Box 132 1599 Curabitur Rd. Bandera South Dakota|45149 |(959) 119-8364|83 |43 | |[Harvel Ricks] |16/05/1937|[84] |347-7666 Iaculis St. Woodruff SC |49854 |(468) 353-2641|148|109| |[Rip Harbour] |08/12/2004|[16] |666-4366 Lacinia Avenue Idaho Falls Ohio |19253 |(248) 675-4007|75 |53 | |[Steve Rattler] |09/01/1956|[68] |P.O. Box 147 2546 Sociosqu Rd. Bethlehem Utah |2913 |(939) 353-1107|142|62 | |[Dell Ponto] |18/10/1934|[82] |557-6308 Lacinia Road San Bernardino ND |9289 |(570) 873-7090|137|48 | |[Jaclyn Prime] |02/10/1905|[118]|Ap #285-7193 Ullamcorper Avenue Amesbury HI |93373 |(302) 259-2375|84 |41 | +--------------------+----------+-----+----------------------------------------------------+-------+--------------+---+---+ only showing top 20 rows
obfuscator_unique_ref_test = '''Will Perry#NAME
John Smith#NAME
Marvin MARSHALL#NAME
Hubert GROGAN#NAME
ALTHEA COLBURN#NAME
Kalil AMIN#NAME
Inci FOUNTAIN#NAME
Jackson WILLE#NAME
Jack SANTOS#NAME
Mahmood ALBURN#NAME
Marnie MELINGTON#NAME
Aysha GHAZI#NAME
Maryland CODER#NAME
Darene GEORGIOUS#NAME
Shelly WELLBECK#NAME
Min Kun JAE#NAME
Thomson THOMAS#NAME
Christian SUDDINBURG#NAME
20#AGE
30#AGE
40#AGE
50#AGE
60#AGE
(901)111-2222#TEL
(109)333 1343#TEL
(570) 874-1112#TEL
(901)111-2222#TEL
(109)333 1343#TEL
(570) 874-1112#TEL
28450#ZIPCODE
49144#ZIPCODE
14412#ZIPCODE
10/10/1983#DOB
04/06/1990#DOB
03/11/2001#DOB
'''
with open('obfuscator_unique_ref_test.txt', 'w') as f:
f.write(obfuscator_unique_ref_test)
obfuscator = legal.StructuredDeidentification(spark,{"NAME":"NAME","AGE":"AGE"},
obfuscateRefFile = "/content/obfuscator_unique_ref_test.txt",
obfuscateRefSource = "file",
columnsSeed={"NAME": 23, "AGE": 23})
obfuscator_df = obfuscator.obfuscateColumns(df)
obfuscator_df.select("NAME","AGE").show(truncate=False)
+------------------+----+ |NAME |AGE | +------------------+----+ |[Inci FOUNTAIN] |[60]| |[Jack SANTOS] |[30]| |[Darene GEORGIOUS]|[30]| |[Shelly WELLBECK] |[40]| |[Hubert GROGAN] |[40]| |[Kalil AMIN] |[40]| |[ALTHEA COLBURN] |[60]| |[Thomson THOMAS] |[60]| |[Jack SANTOS] |[60]| |[Will Perry] |[20]| |[Jackson WILLE] |[60]| |[Shelly WELLBECK] |[40]| |[Kalil AMIN] |[30]| |[Marnie MELINGTON]|[30]| |[Min Kun JAE] |[30]| |[Marvin MARSHALL] |[60]| |[Marvin MARSHALL] |[50]| |[Min Kun JAE] |[30]| |[Maryland CODER] |[20]| |[Marnie MELINGTON]|[20]| +------------------+----+ only showing top 20 rows
# We can shift n days in the structured deidentification through "days" parameter when the column is a Date.
df = spark.createDataFrame([
["Juan García", "13/02/1977", "711 Nulla St.", "140", "673 431234"],
["Will Smith", "23/02/1977", "1 Green Avenue.", "140", "+23 (673) 431234"],
["Pedro Ximénez", "11/04/1900", "Calle del Libertador, 7", "100", "912 345623"]
]).toDF("NAME", "DOB", "ADDRESS", "SBP", "TEL")
df_pd= df.toPandas()
df_pd.to_csv("deid_dayshift_structured_data.csv", index=False)
df_pd.head()
NAME | DOB | ADDRESS | SBP | TEL | |
---|---|---|---|---|---|
0 | Juan García | 13/02/1977 | 711 Nulla St. | 140 | 673 431234 |
1 | Will Smith | 23/02/1977 | 1 Green Avenue. | 140 | +23 (673) 431234 |
2 | Pedro Ximénez | 11/04/1900 | Calle del Libertador, 7 | 100 | 912 345623 |
obfuscator = legal.StructuredDeidentification(spark,
columns = {"NAME": "ID", "DOB": "DATE"},
obfuscateRefSource = "faker",
columnsSeed={"NAME": 23, "AGE": 23},
days = 5)
obfuscator_df = obfuscator.obfuscateColumns(df)
obfuscator_df.show(truncate=False)
+----------+------------+-----------------------+---+----------------+ |NAME |DOB |ADDRESS |SBP|TEL | +----------+------------+-----------------------+---+----------------+ |[G9296129]|[18/02/1977]|711 Nulla St. |140|673 431234 | |[M9239301]|[28/02/1977]|1 Green Avenue. |140|+23 (673) 431234| |[H3156881]|[16/04/1900]|Calle del Libertador, 7|100|912 345623 | +----------+------------+-----------------------+---+----------------+