In this project, we aim to determine whether three potential hackers were involved in a series of attacks, or only two, based on the information recorded about each hack. The data and analysis are shown below.
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.evaluation import ClusteringEvaluator
spark = SparkSession.builder.appName('hack_find').getOrCreate()
dataset = spark.read.csv("dbfs:/FileStore/shared_uploads/dizhen@hsph.harvard.edu/hack_data.csv",header=True,inferSchema=True)
dataset.head()
Out[3]: Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37)
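As an optional sanity check (not shown in the original run), the schema inferred by inferSchema=True can be printed to confirm the column types before assembling features:
# Optional: confirm the inferred column types
dataset.printSchema()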
dataset.describe().show()
+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|summary|Session_Connection_Time| Bytes Transferred|   Kali_Trace_Used|Servers_Corrupted|   Pages_Corrupted|   Location|  WPM_Typing_Speed|
+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|  count|                    334|               334|               334|              334|               334|        334|               334|
|   mean|     30.008982035928145| 607.2452694610777|0.5119760479041916|5.258502994011977|10.838323353293413|       null|57.342395209580864|
| stddev|     14.088200614636158|286.33593163576757|0.5006065264451406| 2.30190693339697|  3.06352633036022|       null| 13.41106336843464|
|    min|                    1.0|              10.0|                 0|              1.0|               6.0|Afghanistan|              40.0|
|    max|                   60.0|            1330.5|                 1|             10.0|              15.0|   Zimbabwe|              75.0|
+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
dataset.columns
Out[6]: ['Session_Connection_Time', 'Bytes Transferred', 'Kali_Trace_Used', 'Servers_Corrupted', 'Pages_Corrupted', 'Location', 'WPM_Typing_Speed']
# Location is a categorical string column, so it is left out of the numeric feature vector
feat_cols = ['Session_Connection_Time', 'Bytes Transferred', 'Kali_Trace_Used',
             'Servers_Corrupted', 'Pages_Corrupted', 'WPM_Typing_Speed']
vec_assembler = VectorAssembler(inputCols = feat_cols, outputCol='features')
final_data = vec_assembler.transform(dataset)
# Scale each feature to unit standard deviation so that large-valued columns
# (e.g. Bytes Transferred) do not dominate the Euclidean distances used by K-means
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)
scalerModel = scaler.fit(final_data)
cluster_final_data = scalerModel.transform(final_data)
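Because the columns live on very different scales (bytes transferred in the hundreds versus a 0/1 Kali trace flag), scaling keeps any single column from dominating the distance computation. As an optional check, not part of the original run, the raw and scaled vectors can be compared side by side:
# Optional: compare raw and scaled feature vectors for a few rows
cluster_final_data.select('features', 'scaledFeatures').show(3, truncate=False)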
# Fit K-means with k=3 and k=2 on the scaled features
kmeans3 = KMeans(featuresCol='scaledFeatures', k=3)
kmeans2 = KMeans(featuresCol='scaledFeatures', k=2)
model_k3 = kmeans3.fit(cluster_final_data)
model_k2 = kmeans2.fit(cluster_final_data)
# Attach cluster assignments (a 'prediction' column) to the data
preds_k3 = model_k3.transform(cluster_final_data)
preds_k2 = model_k2.transform(cluster_final_data)
# ClusteringEvaluator computes the silhouette score; by default it reads the unscaled
# 'features' column (pass featuresCol='scaledFeatures' to score on the scaled vectors)
evaluator = ClusteringEvaluator()
silhouette3 = evaluator.evaluate(preds_k3)
silhouette2 = evaluator.evaluate(preds_k2)
print("With K=3")
print("Silhouette with squared euclidean distance = " + str(silhouette3))
print("With K=2")
print("Silhouette with squared euclidean distance = " + str(silhouette2))
With K=3
Silhouette with squared euclidean distance = 0.3068084951287429
With K=2
Silhouette with squared euclidean distance = 0.6683623593283755
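Rather than comparing only k=2 and k=3, the silhouette score can be swept over a range of k values. This is a minimal sketch that reuses the same cluster_final_data and evaluator; the fixed seed is an assumption added here only for repeatability:
# Sketch: silhouette score for k = 2..8 on the scaled features
for k in range(2, 9):
    model = KMeans(featuresCol='scaledFeatures', k=k, seed=1).fit(cluster_final_data)
    preds = model.transform(cluster_final_data)
    print(f"k={k}  silhouette={evaluator.evaluate(preds):.4f}")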
preds_k3.groupBy('prediction').count().show()
preds_k2.groupBy('prediction').count().show()
+----------+-----+
|prediction|count|
+----------+-----+
|         1|   83|
|         2|   84|
|         0|  167|
+----------+-----+

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+
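If the attackers split the sessions roughly evenly, the clean 167/167 split at k=2 (versus the uneven 83/84/167 split at k=3) and the much higher silhouette score both point to two hackers rather than three. To see what separates the two groups, the centroids of the fitted k=2 model can be inspected; a minimal sketch, assuming model_k2 from above:
# Sketch: print each cluster centroid in the scaled feature space
for i, center in enumerate(model_k2.clusterCenters()):
    print(f"Cluster {i} center: {center}")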