Use the latest versions of the DataFrame and KotlinDL libraries from the descriptor repository.
%useLatestDescriptors
%use dataframe, kotlin-dl
Read the dataframe from a CSV file and print its first few lines.
// The red-wine-quality CSV uses ';' as the field separator
val raw_df = DataFrame.readCSV(fileOrUrl = "winequality-red.csv", delimiter = ';')
raw_df.head()
DataFrame: rowsCount = 5, columnsCount = 12
// Pairwise correlation matrix, with each Double cell colored on a gradient
// from red (correlation -1.0) to green (correlation +1.0)
raw_df.corr().format { colsOf<Double>() }.with { linearBg(it, -1.0 to red, 1.0 to green) }
DataFrame: rowsCount = 12, columnsCount = 13
Based on the correlation matrix, we can remove some columns that appear to be insignificant.
// Drop the weakly-correlated features (backticks allow column names containing spaces)
val df = raw_df.remove {`free sulfur dioxide` and `residual sugar` and pH }
// Bridge from a DataFrame to KotlinDL's on-heap dataset representation:
// every column except [labelColumnName] becomes a feature, the named
// column becomes the label vector.
fun <T> DataFrame<T>.toOnHeapDataset(labelColumnName: String): OnHeapDataset =
    OnHeapDataset.create(dataframe = this, yColumn = labelColumnName)
// Factory extension: builds an OnHeapDataset from [dataframe], using the
// [yColumn] column as labels and all remaining columns as features.
fun OnHeapDataset.Companion.create(
    dataframe: DataFrame<Any?>,
    yColumn: String
): OnHeapDataset {
    // Lazily produces one FloatArray of features per row
    // (assumes every remaining column holds Float values — see the
    // conversion at the call site)
    val features = {
        dataframe.remove(yColumn).rows()
            .map { row -> (row.values() as List<Float>).toFloatArray() }
            .toTypedArray()
    }
    // Lazily produces the label vector from the y column
    val labels = {
        dataframe.get { yColumn<Float>() }.toList().toFloatArray()
    }
    return create(features, labels)
}
// KotlinDL works with Float tensors, so convert every Double column to Float,
// wrap the frame as an OnHeapDataset, and split 80/20 into train/test
val (train, test) = df.convert { colsOf<Double>() }.toFloat()
.toOnHeapDataset(labelColumnName = "quality")
.split(0.8)
Define a simple neural network with two hidden dense layers and a linear output layer.
// One input neuron per feature column
val inputNeurons = train.x[0].size.toLong()
// Regression network: two hidden Tanh layers (10x the input width)
// followed by a single linear output neuron
val model = Sequential.of(
Input(
inputNeurons
),
Dense(
outputSize = (inputNeurons * 10).toInt(),
activation = Activations.Tanh,
kernelInitializer = HeNormal(),
biasInitializer = HeNormal()
),
Dense(
outputSize = (inputNeurons * 10).toInt(),
activation = Activations.Tanh,
kernelInitializer = HeNormal(),
biasInitializer = HeNormal()
),
Dense(
outputSize = 1,
activation = Activations.Linear,
kernelInitializer = HeNormal(),
biasInitializer = HeNormal()
)
)
// MSE loss with MAE reported as the human-readable metric
model.compile(optimizer = Adam(), loss = Losses.MSE, metric = Metrics.MAE)
model.printSummary()
============================================================================== Model type: Sequential ______________________________________________________________________________ Layer (type) Output Shape Param # ============================================================================== input_1(Input) [None, 8] 0 ______________________________________________________________________________ dense_2(Dense) [None, 80] 720 ______________________________________________________________________________ dense_3(Dense) [None, 80] 6480 ______________________________________________________________________________ dense_4(Dense) [None, 1] 81 ______________________________________________________________________________ ============================================================================== Total trainable params: 7281 Total frozen params: 0 Total params: 7281 ==============================================================================
Train it!
// Train the model, then show the last few epochs of the training history
val trainHist = model.fit(train, batchSize = 500, epochs=2000)
trainHist.epochHistory.toDataFrame().tail()
DataFrame: rowsCount = 5, columnsCount = 5
Let's check that our network predicts values more or less correctly:
// Raw (float) prediction for a single test sample
model.predictSoftly(test.x[9])[0]
5.24826
// Corresponding ground-truth label for the same sample
test.y[9]
5.0
Close the model:
// Release the model's resources
model.close()
// Randomly shuffles the rows of [d] and splits them into train/test parts,
// separating the [col] column (labels) from the remaining columns (features).
// Returns ((trainX, trainY), (testX, testY)); the train part holds
// ceil(n * trainPart) rows.
fun <T> trainTestSplit(d: DataFrame<T>, col: String, trainPart: Double): Pair<Pair<DataFrame<T>, DataFrame<T>>, Pair<DataFrame<T>, DataFrame<T>>> {
    val rowCount = d.count()
    val splitPoint = ceil(rowCount * trainPart).toInt()

    // Random permutation of row indices; first splitPoint go to train
    val shuffled = (0 until rowCount).shuffled()
    val trainFrame = d[shuffled.subList(0, splitPoint)]
    val testFrame = d[shuffled.subList(splitPoint, rowCount)]

    // Features = everything except the label column; labels = just that column
    fun featuresOf(frame: DataFrame<T>) = frame.select { all().except(cols(col)) }
    fun labelsOf(frame: DataFrame<T>) = frame.select(col)

    return (featuresOf(trainFrame) to labelsOf(trainFrame)) to
            (featuresOf(testFrame) to labelsOf(testFrame))
}
Let's create and then train the model as we did before
// Manual 80/20 split, then unpack the (features, labels) frames for each part
val (trainPair, testPair) = trainTestSplit(df, "quality", 0.8)
val (trainX, trainY) = trainPair
val (testX, testY) = testPair
// Merge all numeric columns of each row into one FloatArray feature vector
fun <T> DataFrame<T>.toX(): Array<FloatArray> =
merge { colsOf<Number>() }.by { it.map { it.toFloat() }.toFloatArray() }.into("X")
.get { "X"<FloatArray>() }.toList().toTypedArray()
// Extract the integer "quality" label column as a FloatArray
fun <T> DataFrame<T>.toY() = get { "quality"<Int>() }.asIterable().map { it.toFloat() }.toFloatArray()
// Materialize KotlinDL-friendly arrays from the split frames
val trainXDL = trainX.toX()
val trainYDL = trainY.toY()
val testXDL = testX.toX()
val testYDL = testY.toY()
// Wrap the arrays into KotlinDL on-heap datasets
val trainKotlinDLDataset = OnHeapDataset.create({trainXDL}, {trainYDL})
val testKotlinDLDataset = OnHeapDataset.create({testXDL}, {testYDL})
// Input width = number of feature columns, taken from the freshly prepared
// training data. (The original derived it from `train`, the dataset of the
// first pipeline whose model was already closed — the second pipeline should
// not depend on it.)
val inputNeurons = trainXDL[0].size.toLong()
// Same architecture as before: two hidden Tanh layers, one linear output neuron
val model2 = Sequential.of(
    Input(
        inputNeurons
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    ),
    Dense(
        outputSize = 1,
        activation = Activations.Linear,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    )
)
// MSE loss with MAE as the reported metric, as in the first model
model2.compile(optimizer = Adam(), loss = Losses.MSE, metric = Metrics.MAE)
model2.printSummary()
============================================================================== Model type: Sequential ______________________________________________________________________________ Layer (type) Output Shape Param # ============================================================================== input_1(Input) [None, 8] 0 ______________________________________________________________________________ dense_2(Dense) [None, 80] 720 ______________________________________________________________________________ dense_3(Dense) [None, 80] 6480 ______________________________________________________________________________ dense_4(Dense) [None, 1] 81 ______________________________________________________________________________ ============================================================================== Total trainable params: 7281 Total frozen params: 0 Total params: 7281 ==============================================================================
// Train on the dataset built from the manual split. The original code
// mistakenly reused `train` (the first pipeline's dataset), so model2 never
// saw the data prepared via trainTestSplit/toX/toY.
val trainHist = model2.fit(trainKotlinDLDataset, batchSize = 500, epochs = 2000)
trainHist.epochHistory.toDataFrame().tail()
DataFrame: rowsCount = 5, columnsCount = 5
// Raw (float) prediction for a single test sample
model2.predictSoftly(testXDL[9])[0]
5.9178224
// Corresponding ground-truth label for the same sample
testYDL[9]
6.0
We can also compare the predicted and ground-truth values to see how accurate the predictions are.
// Rounded integer prediction for every test sample
val predicted = testXDL
    .map { features -> round(model2.predictSoftly(features)[0]).toInt() }
    .toColumn("predicted")
// Ground-truth labels converted back to integers
val ground_truth = testYDL
    .map { label -> label.toInt() }
    .toColumn("ground_truth")
// Side-by-side frame of predictions vs. actual quality scores
val predDf = dataFrameOf(predicted, ground_truth)
predDf.head()
DataFrame: rowsCount = 5, columnsCount = 2
// (Removed the unused `val inds = List(10){it + 1}` local — nothing in the
// notebook referenced it.)
// Confusion-matrix-style cross-tabulation: rows = ground truth, columns = predicted
val ctab = predDf.groupBy { ground_truth }.pivotCounts(inward = false) { predicted }.sortBy { ground_truth }
// Color each count cell: the closer the predicted value is to the ground
// truth, the greener the cell (k decays by 0.1 per unit of deviation)
ctab.format { drop(1) }.perRowCol { row, col ->
val y = col.name().toInt()
val x = row.ground_truth
val k = 1.0 - abs(x - y)/10.0
background(RGBColor(50, (50 + k * 200).toInt().toShort(), 50))
}
DataFrame: rowsCount = 6, columnsCount = 5
// Per-row absolute deviation between prediction and ground truth
// (named "avg_dev"; it is a per-row value, averaged later via describe())
val predDf2 = predDf.add("avg_dev") { abs(predicted - ground_truth) }
predDf2.avg_dev.cast<Double>().describe()
DataFrame: rowsCount = 1, columnsCount = 12
// Row at the 70th percentile of absolute deviation; derive the index from the
// actual row count instead of the hard-coded test-set size (319)
predDf2.sortBy { avg_dev }[(0.7 * (predDf2.rowsCount() - 1)).toInt()]
DataRow: index = 222, columnsCount = 3
// Release the model's resources
model2.close()