Modern NLP models work with a numerical representation of texts and their meaning. For token classification problems (inferring a class for each token, for example Named Entity Recognition), Word Embeddings are required. For sentence, paragraph, or document classification, we use Sentence Embeddings.
In this notebook, we obtain token embeddings using the Spark NLP Finance Word Embeddings model (bert_embeddings_sec_bert_base), and from these token embeddings we derive sentence embeddings with the Spark NLP SentenceEmbeddings annotator, to get numerical representations of the semantics of the texts. The result is a 768-dimensional embedding matrix, impossible to process by the human eye.
There are many techniques we can use to visualize those embeddings. We are using one of them — Principal Component Analysis, a dimensionality reduction process, carried out by Spark MLlib. Both embeddings have 768 dimensions, so we will reduce them from 768 to 3 (X, Y, Z) and use a color for the word / sentence legend.
! pip install johnsnowlabs
Using my.johnsnowlabs.com SSO
from johnsnowlabs import nlp, finance
# nlp.install(force_browser=True)
If you are not registered in my.johnsnowlabs.com, if you received a license via e-mail, or if you are using Safari, you may need to do a manual upload of the license.
# Upload the John Snow Labs license file from the local machine, then
# install the licensed libraries and start a licensed Spark session.
from google.colab import files
print('Please Upload your John Snow Labs License using the button below')
license_keys = files.upload()
nlp.install()
spark = nlp.start()
! pip install -q plotly
# Downloading sample datasets.
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/finance-nlp/data/finance_pca_samples.csv
import pandas as pd
# Load the sample finance texts (text, label columns) into pandas.
df = pd.read_csv("finance_pca_samples.csv")
# Create a Spark DataFrame so the Spark NLP pipeline can consume it
sdf = spark.createDataFrame(df)
sdf.show()
+--------------------+----------------+ | text| label| +--------------------+----------------+ |I called Huntingt...| Accounts| |I opened an citi ...| Accounts| |I have been a lon...| Credit Cards| |My credit limit w...| Credit Cards| |I am filing this ...|Credit Reporting| |The Credit Bureau...|Credit Reporting| |I noticed an arti...| Debt Collection| |A bank account wa...| Debt Collection| |I was contacted v...| Loans| |My husband recent...| Loans| |I wire transfered...| Money Transfers| |PayPal holds fund...| Money Transfers| |We have requested...| Mortgage| |I filled out a co...| Mortgage| +--------------------+----------------+
# We defined a generic pipeline for word and sentence embeddings
def generic_pipeline():
    """Build the shared document -> token -> word-embeddings pipeline.

    Returns an unfitted ``nlp.Pipeline`` whose stages turn the ``text``
    column into ``document``, ``token`` and ``word_embeddings`` columns.
    """
    assembler = (
        nlp.DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )
    token_stage = (
        nlp.Tokenizer()
        .setInputCols("document")
        .setOutputCol("token")
    )
    # SEC-BERT: BERT embeddings pretrained on financial (SEC filings) text.
    bert_stage = (
        nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en")
        .setInputCols(["document", "token"])
        .setOutputCol("word_embeddings")
    )
    return nlp.Pipeline(stages=[assembler, token_stage, bert_stage])
# Pool the token (word) embeddings into a single sentence embedding per
# document, using the Spark NLP SentenceEmbeddings annotator with AVERAGE pooling.
embeddings_sentence = nlp.SentenceEmbeddings()\
.setInputCols(["document", "word_embeddings"])\
.setOutputCol("sentence_embeddings")\
.setPoolingStrategy("AVERAGE")
# We use the Spark NLP SentenceEmbeddings annotator to get each sentence's embedding from the token embeddings
from pyspark.sql import DataFrame
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pyspark.sql as SQL
from pyspark import keyword_only
# This class extracts the embeddings from the Spark NLP Annotation object
# from pyspark import ml as ML
class EmbeddingsUDF(
    nlp.Transformer, nlp.ML.param.shared.HasInputCol, nlp.ML.param.shared.HasOutputCol,
    nlp.ML.util.DefaultParamsReadable, nlp.ML.util.DefaultParamsWritable
):
    """Custom Spark ML Transformer that extracts embeddings from the Spark NLP
    Annotation structure into MLlib ``Vector`` features.

    ``_transform`` adds to the incoming dataset:
      * ``embeddings`` - exploded array<float>, one row per sentence embedding
      * ``features``   - the same values as a dense MLlib ``Vector`` (PCA input)
      * ``emb_sum``    - sum of absolute component values, used to filter
                         out all-zero embeddings
    """

    @keyword_only
    def __init__(self):
        super(EmbeddingsUDF, self).__init__()
        self.udfs = {
            # Wrap the raw float array in a dense MLlib Vector.
            'convertToVectorUDF': F.udf(lambda vs: nlp.ML.linalg.Vectors.dense(vs), nlp.ML.linalg.VectorUDT()),
            # BUGFIX: sum of ABSOLUTE values. The previous plain sum could be
            # zero for a non-zero vector whose components cancel out, which
            # wrongly dropped that row in the zero-embedding filter below.
            'sumUDF': F.udf(lambda r: float(sum(abs(e) for e in r)), T.FloatType())
        }

    def _transform(self, dataset):
        """Explode sentence embeddings, vectorize them, and drop zero vectors."""
        results = dataset.select(
            "*", F.explode("sentence_embeddings.embeddings").alias("embeddings")
        )
        results = results.withColumn(
            "features",
            self.udfs['convertToVectorUDF'](F.col("embeddings"))
        )
        results = results.withColumn(
            "emb_sum",
            self.udfs['sumUDF'](F.col("embeddings"))
        )
        # Remove rows whose embeddings are all zeroes (so cosine distance stays defined)
        results = results.where(F.col("emb_sum")!=0.0)
        return results
# Instantiate the extractor and a Spark MLlib PCA stage that projects the
# 768-dimensional sentence embeddings down to 3 components (X, Y, Z).
embeddings_for_pca = EmbeddingsUDF()
DIMENSIONS = 3
pca = nlp.ML.feature.PCA(k=DIMENSIONS, inputCol="features", outputCol="pca_features")
# We do the whole process in one pipeline: embeddings, sentence pooling, vectorization, PCA
pipeline = nlp.Pipeline().setStages([generic_pipeline(), embeddings_sentence, embeddings_for_pca, pca])
bert_embeddings_sec_bert_base download started this may take some time. Approximate size to download 390.4 MB [OK!]
# Fit and run the full pipeline, then inspect the 3-D PCA features per label.
model = pipeline.fit(sdf)
result = model.transform(sdf)
result.select('pca_features', 'label').show(truncate=False)
+--------------------------------------------------------------+----------------+ |pca_features |label | +--------------------------------------------------------------+----------------+ |[3.39576448119276,-1.060361129782475,-1.568794006399417] |Accounts | |[2.3660850756971623,0.8591941003552866,-0.8066168807669747] |Accounts | |[0.6867735108170906,1.4823947144210112,0.006591220237646302] |Credit Cards | |[-0.28834125177427167,1.0031549697755784,-0.7963810505318434] |Credit Cards | |[-0.5037809008469382,-1.3771583372345915,0.4449701036930799] |Credit Reporting| |[1.039756950301059,-1.7194174825036457,1.8539366217014026] |Credit Reporting| |[2.7731701148109815,1.1680247656394984,1.3949448202984454] |Debt Collection | |[-0.45951034017887454,0.833969250052939,0.5051728405912744] |Debt Collection | |[0.2703079726928541,1.1069420631113542,-0.4247559623637921] |Loans | |[0.8662523064864315,1.1435249671794807,0.8703562689970329] |Loans | |[-0.7580966795506656,0.6312432474265479,0.6829074622939197] |Money Transfers | |[0.38557719496563764,-1.4420990245260328,-0.19825482305628117]|Money Transfers | |[2.45690397730987,0.33025601313067965,1.2981705024775965] |Mortgage | |[2.3553279838082126,0.8329467950039564,1.390405767602749] |Mortgage | +--------------------------------------------------------------+----------------+
# Collect the PCA results to pandas for plotting.
df = result.select('pca_features', 'label').toPandas()
df
# As you can see, the dimension values are still packed inside a list
pca_features | label | |
---|---|---|
0 | [3.39576448119276, -1.060361129782475, -1.5687... | Accounts |
1 | [2.3660850756971623, 0.8591941003552866, -0.80... | Accounts |
2 | [0.6867735108170906, 1.4823947144210112, 0.006... | Credit Cards |
3 | [-0.28834125177427167, 1.0031549697755784, -0.... | Credit Cards |
4 | [-0.5037809008469382, -1.3771583372345915, 0.4... | Credit Reporting |
5 | [1.039756950301059, -1.7194174825036457, 1.853... | Credit Reporting |
6 | [2.7731701148109815, 1.1680247656394984, 1.394... | Debt Collection |
7 | [-0.45951034017887454, 0.833969250052939, 0.50... | Debt Collection |
8 | [0.2703079726928541, 1.1069420631113542, -0.42... | Loans |
9 | [0.8662523064864315, 1.1435249671794807, 0.870... | Loans |
10 | [-0.7580966795506656, 0.6312432474265479, 0.68... | Money Transfers |
11 | [0.38557719496563764, -1.4420990245260328, -0.... | Money Transfers |
12 | [2.45690397730987, 0.33025601313067965, 1.2981... | Mortgage |
13 | [2.3553279838082126, 0.8329467950039564, 1.390... | Mortgage |
# We extract the dimension values out of the list into separate x/y/z columns
df["x"] = df["pca_features"].apply(lambda x: x[0])
df["y"] = df["pca_features"].apply(lambda x: x[1])
df["z"] = df["pca_features"].apply(lambda x: x[2])
df = df[["x", "y", "z", "label"]]
df
x | y | z | label | |
---|---|---|---|---|
0 | 3.395764 | -1.060361 | -1.568794 | Accounts |
1 | 2.366085 | 0.859194 | -0.806617 | Accounts |
2 | 0.686774 | 1.482395 | 0.006591 | Credit Cards |
3 | -0.288341 | 1.003155 | -0.796381 | Credit Cards |
4 | -0.503781 | -1.377158 | 0.444970 | Credit Reporting |
5 | 1.039757 | -1.719417 | 1.853937 | Credit Reporting |
6 | 2.773170 | 1.168025 | 1.394945 | Debt Collection |
7 | -0.459510 | 0.833969 | 0.505173 | Debt Collection |
8 | 0.270308 | 1.106942 | -0.424756 | Loans |
9 | 0.866252 | 1.143525 | 0.870356 | Loans |
10 | -0.758097 | 0.631243 | 0.682907 | Money Transfers |
11 | 0.385577 | -1.442099 | -0.198255 | Money Transfers |
12 | 2.456904 | 0.330256 | 1.298171 | Mortgage |
13 | 2.355328 | 0.832947 | 1.390406 | Mortgage |
import plotly.express as px
# 3-D scatter of the sentence embeddings after PCA, colored by label.
fig = px.scatter_3d(df, x = 'x', y = 'y', z = 'z', color = 'label', width=800, height=600)
fig.show()
We can also visualize the semantics of words, instead of full texts, by using Word Embeddings. We will add a Tokenizer and a WordEmbeddings model to get those embeddings, and then apply PCA as before. First, we split the pipeline in two to get all the token embeddings.
# Fit only the generic (word-embedding) part of the pipeline first.
model = generic_pipeline().fit(sdf)
bert_embeddings_sec_bert_base download started this may take some time. Approximate size to download 390.4 MB [OK!]
result = model.transform(sdf)
# Zip each token with its embedding and explode, producing one row per token
# with its label and its 768-dimensional embedding vector.
result_df = result.select("label", F.explode(F.arrays_zip(result.token.result, result.word_embeddings.embeddings)).alias("cols"))\
.select(F.expr("cols['0']").alias("token"),
"label",
F.expr("cols['1']").alias("embeddings"))
result_df.show(truncate = 80)
+----------+--------+--------------------------------------------------------------------------------+ | token| label| embeddings| +----------+--------+--------------------------------------------------------------------------------+ | I|Accounts|[-0.29679197, 0.80952483, 0.026026089, 0.08434192, 0.7434629, -0.02694758, -0...| | called|Accounts|[0.28905854, -0.29229686, -0.42990392, -0.3833449, 0.026178285, -0.12728442, ...| |Huntington|Accounts|[0.20684586, -0.010130149, -0.259025, -0.37558293, 0.45792142, 0.3114912, -0....| | Bank|Accounts|[-0.034710683, 0.46047488, -0.6221113, -0.011169381, 0.2938512, 0.31341088, -...| | to|Accounts|[-0.40457863, -0.3768647, -0.08015404, -0.58909655, -0.33856544, -0.39321256,...| | close|Accounts|[0.35089388, 0.9568475, 0.86328286, -0.4334402, 0.11386797, -0.48837784, -0.8...| | my|Accounts|[-0.36591864, 0.2655603, -0.32495034, -0.5081896, -0.39623818, -0.63347244, -...| | account|Accounts|[0.004639961, 0.5340125, 0.77567977, 0.23316649, -0.4303767, -0.2937901, -0.5...| | ,|Accounts|[0.17874305, -0.026907753, 0.19498396, -0.7929611, -0.26044437, -0.3964327, -...| | and|Accounts|[0.5011346, 0.6637548, 0.15587743, -0.79522926, -0.8198417, -0.24028614, -0.6...| | they|Accounts|[-0.2188998, 0.17353022, -0.3897713, -0.4219988, -0.66089946, -0.6682683, -0....| | refused|Accounts|[-0.71534324, 0.4092898, -0.58240926, 0.2768947, -0.7440806, -0.016842518, -0...| | to|Accounts|[-0.062417023, -0.30230471, 0.17689183, -0.36983997, 0.22308639, -0.20912732,...| | close|Accounts|[0.49901608, 0.93363476, 0.89050376, -0.20053658, 0.47381917, -0.24397722, -0...| | my|Accounts|[-0.11864859, 0.068643466, -0.47048938, -0.33866596, -0.1448204, -0.59992373,...| | account|Accounts|[-0.045060933, 0.55244875, 0.9458424, 0.3263075, -0.26439214, -0.14597315, -0...| | over|Accounts|[0.19230467, -0.47188944, 0.33582675, 0.008950032, 0.3479425, 0.107840315, -0...| | the|Accounts|[-0.44176129, -0.17911726, -0.9623183, 0.09716578, 0.19224198, 0.1584882, 0.5...| 
| phone|Accounts|[-0.44973916, -0.9114662, -0.06911273, -0.18094938, 0.10837507, -0.8229777, -...| | .|Accounts|[0.0502675, 0.32013232, 0.22356117, -0.6540274, 0.48769465, -0.81690645, -0.6...| +----------+--------+--------------------------------------------------------------------------------+ only showing top 20 rows
# Here we define a subclass of the EmbeddingsUDF class defined previously
class WordEmbeddingsUDF(EmbeddingsUDF):
    """Variant of ``EmbeddingsUDF`` for per-token (word) embeddings.

    Upstream processing has already exploded the embeddings to one row per
    token, so no explode step is needed here; only the ``token``, ``label``
    and ``embeddings`` columns are kept before vectorizing.
    """

    def _transform(self, dataset):
        # The embedding column is already exploded, so just project the
        # columns we need and reuse the parent's UDFs.
        projected = dataset.select('token', 'label', 'embeddings')
        vectorized = projected.withColumn(
            "features",
            self.udfs['convertToVectorUDF'](F.col("embeddings"))
        ).withColumn(
            "emb_sum",
            self.udfs['sumUDF'](F.col("embeddings"))
        )
        # Drop all-zero embeddings so cosine distance stays well-defined.
        return vectorized.where(F.col("emb_sum")!=0.0)
# Set up the word-level extractor plus a PCA stage (768 -> 3 dimensions).
embeddings_for_pca = WordEmbeddingsUDF()
DIMENSIONS = 3
pca = nlp.ML.feature.PCA(k=DIMENSIONS, inputCol="features", outputCol="pca_features")
# We run the second part of the pipeline. Here the 768 dimensions are reduced to 3
pipeline = nlp.Pipeline().setStages([embeddings_for_pca, pca])
model = pipeline.fit(result_df)
result = model.transform(result_df)
result.select("token", "label", "pca_features").show(truncate = 60)
+----------+--------+------------------------------------------------------------+ | token| label| pca_features| +----------+--------+------------------------------------------------------------+ | I|Accounts| [9.850468172808704,0.02182025684995559,1.7128883074588641]| | called|Accounts| [0.5703260311955864,0.346658149631252,-2.867726751670609]| |Huntington|Accounts| [8.635450770647445,0.8802312004740499,-0.8417105564124523]| | Bank|Accounts| [9.391061503515894,0.45066516018168057,-1.2157436459087525]| | to|Accounts| [-2.093784358504493,-1.1261933945050695,4.473374538741789]| | close|Accounts| [-2.897764751048121,-0.1633032944974737,2.6316552582800594]| | my|Accounts| [3.542237747747922,-2.721495573008954,2.847896218683586]| | account|Accounts|[-1.2533257167247633,0.006480340909400874,1.9023215773218...| | ,|Accounts| [-1.371343619695057,0.16043397738672746,2.236148062116737]| | and|Accounts| [0.2574783722223581,-0.39882523377542006,4.898577649457495]| | they|Accounts| [2.649181792582909,-2.0965602813943836,3.0047699978661027]| | refused|Accounts| [-1.447842544994814,-3.120728385057716,1.623718089120733]| | to|Accounts| [-3.476136836992586,-0.955126757467589,5.927975835938944]| | close|Accounts| [-3.22033000871663,-0.09380183797818464,2.502218213385302]| | my|Accounts| [3.0773967959997126,-2.732351171853666,3.4638980333557523]| | account|Accounts|[-1.7590956965611935,0.049200510343874765,2.0855752458205...| | over|Accounts| [-0.7852937839017823,-1.0837250583596254,2.77524528481479]| | the|Accounts| [-1.9152538017789913,-0.5845586090479645,5.449708419677918]| | phone|Accounts| [-3.7960455527563335,-2.527243794119921,1.563631667500876]| | .|Accounts|[-0.04212375650031434,-0.5960727710945473,0.4870793244043...| +----------+--------+------------------------------------------------------------+ only showing top 20 rows
# Collect the token-level PCA features to pandas for plotting.
df = result.select('token', 'label', 'pca_features').toPandas()
df
token | label | pca_features | |
---|---|---|---|
0 | I | Accounts | [9.850468172808704, 0.02182025684995559, 1.712... |
1 | called | Accounts | [0.5703260311955864, 0.346658149631252, -2.867... |
2 | Huntington | Accounts | [8.635450770647445, 0.8802312004740499, -0.841... |
3 | Bank | Accounts | [9.391061503515894, 0.45066516018168057, -1.21... |
4 | to | Accounts | [-2.093784358504493, -1.1261933945050695, 4.47... |
... | ... | ... | ... |
1364 | the | Mortgage | [0.20783178705004846, 1.2121685298369587, 2.34... |
1365 | company | Mortgage | [0.9758784877952482, 1.1525640123640015, 1.548... |
1366 | never | Mortgage | [-0.009449827591906173, -1.360506257943843, -0... |
1367 | responds | Mortgage | [-1.3105360623344586, -0.3952000653886483, -1.... |
1368 | . | Mortgage | [1.732371824614684, -14.254692680656397, -4.51... |
1369 rows × 3 columns
# Unpack the 3 PCA components into separate x/y/z columns.
df["x"] = df["pca_features"].apply(lambda x: x[0])
df["y"] = df["pca_features"].apply(lambda x: x[1])
df["z"] = df["pca_features"].apply(lambda x: x[2])
df = df[["token", "label", "x", "y", "z"]]
df
token | label | x | y | z | |
---|---|---|---|---|---|
0 | I | Accounts | 9.850468 | 0.021820 | 1.712888 |
1 | called | Accounts | 0.570326 | 0.346658 | -2.867727 |
2 | Huntington | Accounts | 8.635451 | 0.880231 | -0.841711 |
3 | Bank | Accounts | 9.391062 | 0.450665 | -1.215744 |
4 | to | Accounts | -2.093784 | -1.126193 | 4.473375 |
... | ... | ... | ... | ... | ... |
1364 | the | Mortgage | 0.207832 | 1.212169 | 2.345686 |
1365 | company | Mortgage | 0.975878 | 1.152564 | 1.548878 |
1366 | never | Mortgage | -0.009450 | -1.360506 | -0.080957 |
1367 | responds | Mortgage | -1.310536 | -0.395200 | -1.634091 |
1368 | . | Mortgage | 1.732372 | -14.254693 | -4.517188 |
1369 rows × 5 columns
import plotly.express as px
# 3-D scatter of the word embeddings after PCA; hovering shows the token text.
fig = px.scatter_3d(df, x = 'x', y = 'y', z = 'z', color = "label", width=1000, height = 800, hover_data = ["token", "label"])
fig.show()