from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import ClusteringEvaluator

spark = SparkSession.builder.appName('hack_find').getOrCreate()

# Load the hack data from DBFS and take a quick look at it
dataset = spark.read.csv(
    "dbfs:/FileStore/shared_uploads/dizhen@hsph.harvard.edu/hack_data.csv",
    header=True, inferSchema=True)
dataset.head()
dataset.describe().show()
dataset.columns

# Assemble the numeric feature columns into a single 'features' vector column
feat_cols = ['Session_Connection_Time', 'Bytes Transferred', 'Kali_Trace_Used',
             'Servers_Corrupted', 'Pages_Corrupted', 'WPM_Typing_Speed']
vec_assembler = VectorAssembler(inputCols=feat_cols, outputCol='features')
final_data = vec_assembler.transform(dataset)

# Scale each feature to unit standard deviation (no mean-centering)
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)
scalerModel = scaler.fit(final_data)
cluster_final_data = scalerModel.transform(final_data)

# Fit K-means with k=3 and k=2 on the scaled features
kmeans3 = KMeans(featuresCol='scaledFeatures', k=3)
kmeans2 = KMeans(featuresCol='scaledFeatures', k=2)
model_k3 = kmeans3.fit(cluster_final_data)
model_k2 = kmeans2.fit(cluster_final_data)

# Assign cluster predictions for each k
preds_k3 = model_k3.transform(cluster_final_data)
preds_k2 = model_k2.transform(cluster_final_data)

# Compare the two models with the silhouette score; evaluate on the same
# scaled features that were used for clustering
evaluator = ClusteringEvaluator(featuresCol='scaledFeatures')
silhouette3 = evaluator.evaluate(preds_k3)
silhouette2 = evaluator.evaluate(preds_k2)
print("With K=3")
print("Silhouette with squared euclidean distance = " + str(silhouette3))
print("With K=2")
print("Silhouette with squared euclidean distance = " + str(silhouette2))

# Cluster sizes for each k
preds_k3.groupBy('prediction').count().show()
preds_k2.groupBy('prediction').count().show()
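
# A possible extension, not in the original script: sweep k over a range and
# compare silhouette scores instead of only checking k=2 and k=3. This is a
# minimal sketch that reuses cluster_final_data and evaluator from above; the
# seed value and the silhouette_by_k name are illustrative assumptions.
silhouette_by_k = {}
for k in range(2, 9):
    km = KMeans(featuresCol='scaledFeatures', k=k, seed=1)
    preds = km.fit(cluster_final_data).transform(cluster_final_data)
    silhouette_by_k[k] = evaluator.evaluate(preds)
    print("k = {}: silhouette = {:.4f}".format(k, silhouette_by_k[k]))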