%useLatestDescriptors
%use dataframe
%use kotlin-dl

// NOTE(review): this notebook reached review with every `<...>` span stripped (generic
// arguments, and part of `trainTestSplit`). Places where a type argument or statement had
// to be reconstructed are marked with NOTE(review) — please confirm them against the
// original notebook.

// ---------------------------------------------------------------------------------------
// Data loading and exploration
// ---------------------------------------------------------------------------------------

val rawDf = DataFrame.readCsv(fileOrUrl = "winequality-red.csv", delimiter = ';')
rawDf.head()

// Correlation matrix rendered as HTML, each cell shaded on a red (-1.0) .. green (1.0) scale.
// NOTE(review): the type argument of colsOf was lost; Double assumed for correlation values.
rawDf.corr()
    .format { colsOf<Double>() }.with { linearBg(value = it, from = -1.0 to red, to = 1.0 to green) }
    .toHtml()

val df = rawDf.remove { `free sulfur dioxide` and `residual sugar` and pH }
df

// ---------------------------------------------------------------------------------------
// Experiment 1: DataFrame -> OnHeapDataset converter, split done by KotlinDL
// ---------------------------------------------------------------------------------------

/**
 * Simple converter function between DataFrame and KotlinDL data representations.
 *
 * @param labelColumnName name of the column used as the label; all other columns become features.
 */
fun DataFrame<*>.toOnHeapDataset(labelColumnName: String): OnHeapDataset =
    OnHeapDataset.create(dataframe = this, yColumn = labelColumnName)

/**
 * Builds an [OnHeapDataset] from [dataframe]: the [yColumn] column becomes the labels and
 * every remaining column of each row becomes one feature vector.
 *
 * NOTE(review): the casts below assume all cells are already Float (the caller in this
 * notebook runs `convert { ... }.toFloat()` first); element types were lost in the source.
 */
fun OnHeapDataset.Companion.create(
    dataframe: DataFrame<*>,
    yColumn: String
): OnHeapDataset {
    @Suppress("UNCHECKED_CAST")
    fun extractX(): Array<FloatArray> =
        dataframe.remove(yColumn).rows()
            .map { row -> (row.values() as List<Float>).toFloatArray() }
            .toTypedArray()

    fun extractY(): FloatArray =
        dataframe.get { yColumn<Float>() }.toList().toFloatArray()

    return create(
        ::extractX,
        ::extractY
    )
}

// NOTE(review): type argument of colsOf was lost; Number assumed so that every numeric
// column (including the label) is converted to Float before the unchecked casts above.
val (train, test) = df.convert { colsOf<Number>() }.toFloat()
    .toOnHeapDataset(labelColumnName = "quality")
    .split(0.8)

val inputNeurons = train.x[0].size.toLong()

val model = Sequential.of(
    Input(
        inputNeurons,
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal(),
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal(),
    ),
    Dense(
        outputSize = 1,
        activation = Activations.Linear,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal(),
    )
)

model.compile(optimizer = Adam(), loss = Losses.MSE, metric = Metrics.MAE)
model.printSummary()

val trainHist = model.fit(train, batchSize = 500, epochs = 2000)
trainHist.epochHistory.toDataFrame().tail()

// Spot check: prediction for one test sample next to its label.
model.predictSoftly(test.x[9])[0]
test.y[9]

model.close()

// ---------------------------------------------------------------------------------------
// Experiment 2: split done on the DataFrame, then converted to KotlinDL arrays
// ---------------------------------------------------------------------------------------

/** Feature (X) and label (Y) frames for the train and test partitions. */
data class TrainTestSplitResult<T>(
    val trainX: DataFrame<T>,
    val trainY: DataFrame<T>,
    val testX: DataFrame<T>,
    val testY: DataFrame<T>,
)

/**
 * Randomly splits [d] into train and test partitions and separates the [col] column
 * (labels) from the remaining columns (features).
 *
 * NOTE(review): everything after `val shuffledInd = (0..` was lost in the source; the rest
 * of this body is reconstructed from the visible prefix and from how the result fields are
 * consumed later (`toX` on the X frames, `toY` reading "quality" from the Y frames).
 *
 * @param d the frame to split.
 * @param col name of the label column.
 * @param trainPart fraction of rows assigned to the train partition, in `0.0..1.0`.
 * @throws IllegalArgumentException if [trainPart] is outside `0.0..1.0`.
 */
fun <T> trainTestSplit(
    d: DataFrame<T>,
    col: String,
    trainPart: Double,
): TrainTestSplitResult<T> {
    require(trainPart in 0.0..1.0) { "trainPart must be within 0.0..1.0, but was $trainPart" }

    val n = d.count()
    val trainN = ceil(n * trainPart).toInt()

    val shuffledInd = (0..<n).shuffled()
    val trainRows = d[shuffledInd.subList(0, trainN)]
    val testRows = d[shuffledInd.subList(trainN, n)]

    return TrainTestSplitResult(
        trainX = trainRows.remove(col),
        trainY = trainRows.select(col),
        testX = testRows.remove(col),
        testY = testRows.select(col),
    )
}

// NOTE(review): this call was lost in the source. The names are required by the code below;
// the 0.8 ratio is an assumption (it matches the ratio used for the first experiment).
val (trainX, trainY, testX, testY) = trainTestSplit(df, "quality", 0.8)

/**
 * Merges all numeric columns of each row into a single FloatArray feature vector.
 * NOTE(review): the colsOf / column type arguments were lost; Number and FloatArray assumed.
 */
fun <T> DataFrame<T>.toX(): Array<FloatArray> =
    merge { colsOf<Number>() }
        .by { values -> values.map { it.toFloat() }.toFloatArray() }
        .into("X")
        .get { "X"<FloatArray>() }
        .toList()
        .toTypedArray()

/**
 * Reads the "quality" column as a FloatArray of labels.
 * NOTE(review): the column type argument was lost; Number assumed.
 */
fun <T> DataFrame<T>.toY(): FloatArray =
    get { "quality"<Number>() }
        .asIterable()
        .map { it.toFloat() }
        .toFloatArray()

val trainXDL = trainX.toX()
val trainYDL = trainY.toY()
val testXDL = testX.toX()
val testYDL = testY.toY()

val trainKotlinDLDataset = OnHeapDataset.create({ trainXDL }, { trainYDL })
val testKotlinDLDataset = OnHeapDataset.create({ testXDL }, { testYDL })

// Fix: size the input layer from this experiment's own features instead of the first
// experiment's `train` dataset. (Redeclared here because this is a later notebook cell.)
val inputNeurons = trainXDL[0].size.toLong()

val model2 = Sequential.of(
    Input(
        inputNeurons
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    ),
    Dense(
        outputSize = 1,
        activation = Activations.Linear,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    )
)

model2.compile(optimizer = Adam(), loss = Losses.MSE, metric = Metrics.MAE)
model2.printSummary()

// Fix: train on the dataset produced by trainTestSplit. The original passed `train` from
// the first experiment, whose rows come from a different split than testXDL/testYDL, so
// rows used for evaluation below could also have been used for training.
val trainHist = model2.fit(trainKotlinDLDataset, batchSize = 500, epochs = 2000)
trainHist.epochHistory.toDataFrame().tail()

// Spot check: prediction for one test sample next to its label.
model2.predictSoftly(testXDL[9])[0]
testYDL[9]

// Rounded predictions and integer labels for every test sample, side by side.
val predicted = testXDL
    .map { features -> round(model2.predictSoftly(features)[0]).toInt() }
    .toColumn("predicted")
val ground_truth = testYDL
    .map { label -> label.toInt() }
    .toColumn("ground_truth")

val predDf = dataFrameOf(predicted, ground_truth)
predDf.head()

val inds = List(10) { it + 1 }

// Contingency table: one row per ground-truth value, one count column per predicted value.
val ctab = predDf
    .groupBy { ground_truth }.pivotCounts(inward = false) { predicted }
    .sortBy { ground_truth }

// Shade each count cell: the closer the predicted value (column name) is to the ground
// truth (row), the brighter the green channel. The first column (ground_truth) is skipped.
ctab.format { drop(1) }.perRowCol { row, col ->
    val y = col.name().toInt()
    val x = row.ground_truth
    val k = 1.0 - abs(x - y) / 10.0
    background(RGBColor(50, (50 + k * 200).toInt().toShort(), 50))
}.toHtml()

// Absolute deviation between prediction and ground truth, per test sample.
val predDf2 = predDf.add("avg_dev") { abs(predicted - ground_truth) }

// NOTE(review): the type argument of cast was lost; Double assumed.
predDf2.avg_dev.cast<Double>().describe()

// Row at the 70th percentile of deviations. Uses the actual row count instead of the
// original hard-coded 319, so it stays correct if the data or the split ratio changes.
predDf2.sortBy { avg_dev }[(0.7 * (predDf2.count() - 1)).toInt()]

model2.close()