Use the latest versions of DataFrame and KotlinDL libraries from the version repository.
To run this notebook in Kotlin Notebook, make sure the "Resolve multiplatform dependencies" option is turned OFF for this library.
%useLatestDescriptors
%use dataframe@kc25
%use kotlin-dl
Read the dataframe from CSV and print the first few lines of it
// Load the red-wine-quality dataset; note this CSV uses ';' as the delimiter.
val rawDf = DataFrame.readCsv(fileOrUrl = "winequality-red.csv", delimiter = ';')
rawDf.head()
fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality |
---|---|---|---|---|---|---|---|---|---|---|---|
7.400000 | 0.700000 | 0.000000 | 1.900000 | 0.076000 | 11.000000 | 34.000000 | 0.997800 | 3.510000 | 0.560000 | 9.400000 | 5 |
7.800000 | 0.880000 | 0.000000 | 2.600000 | 0.098000 | 25.000000 | 67.000000 | 0.996800 | 3.200000 | 0.680000 | 9.800000 | 5 |
7.800000 | 0.760000 | 0.040000 | 2.300000 | 0.092000 | 15.000000 | 54.000000 | 0.997000 | 3.260000 | 0.650000 | 9.800000 | 5 |
11.200000 | 0.280000 | 0.560000 | 1.900000 | 0.075000 | 17.000000 | 60.000000 | 0.998000 | 3.160000 | 0.580000 | 9.800000 | 6 |
7.400000 | 0.700000 | 0.000000 | 1.900000 | 0.076000 | 11.000000 | 34.000000 | 0.997800 | 3.510000 | 0.560000 | 9.400000 | 5 |
Note: For formatting, the DataFrame needs to be rendered as HTML. This means that when running in Kotlin Notebook, "Render DataFrame tables natively" needs to be turned off, or we need to explicitly turn the dataframe into HTML.
// Pairwise correlation matrix; cells are colored from red (-1.0) to green (+1.0).
rawDf.corr()
    .format { colsOf<Double>() }.with { linearBg(value = it, from = -1.0 to red, to = 1.0 to green) }
    .toHtml()
Based on the correlation, we can remove some columns; they seem to be insignificant.
// Drop the weakly correlated feature columns (backticked identifiers allow spaces in column names).
val df = rawDf.remove { `free sulfur dioxide` and `residual sugar` and pH }
df
fixed acidity | volatile acidity | citric acid | chlorides | total sulfur dioxide | density | sulphates | alcohol | quality |
---|---|---|---|---|---|---|---|---|
7.400000 | 0.700000 | 0.000000 | 0.076000 | 34.000000 | 0.997800 | 0.560000 | 9.400000 | 5 |
7.800000 | 0.880000 | 0.000000 | 0.098000 | 67.000000 | 0.996800 | 0.680000 | 9.800000 | 5 |
7.800000 | 0.760000 | 0.040000 | 0.092000 | 54.000000 | 0.997000 | 0.650000 | 9.800000 | 5 |
11.200000 | 0.280000 | 0.560000 | 0.075000 | 60.000000 | 0.998000 | 0.580000 | 9.800000 | 6 |
7.400000 | 0.700000 | 0.000000 | 0.076000 | 34.000000 | 0.997800 | 0.560000 | 9.400000 | 5 |
7.400000 | 0.660000 | 0.000000 | 0.075000 | 40.000000 | 0.997800 | 0.560000 | 9.400000 | 5 |
7.900000 | 0.600000 | 0.060000 | 0.069000 | 59.000000 | 0.996400 | 0.460000 | 9.400000 | 5 |
7.300000 | 0.650000 | 0.000000 | 0.065000 | 21.000000 | 0.994600 | 0.470000 | 10.000000 | 7 |
7.800000 | 0.580000 | 0.020000 | 0.073000 | 18.000000 | 0.996800 | 0.570000 | 9.500000 | 7 |
7.500000 | 0.500000 | 0.360000 | 0.071000 | 102.000000 | 0.997800 | 0.800000 | 10.500000 | 5 |
6.700000 | 0.580000 | 0.080000 | 0.097000 | 65.000000 | 0.995900 | 0.540000 | 9.200000 | 5 |
7.500000 | 0.500000 | 0.360000 | 0.071000 | 102.000000 | 0.997800 | 0.800000 | 10.500000 | 5 |
5.600000 | 0.615000 | 0.000000 | 0.089000 | 59.000000 | 0.994300 | 0.520000 | 9.900000 | 5 |
7.800000 | 0.610000 | 0.290000 | 0.114000 | 29.000000 | 0.997400 | 1.560000 | 9.100000 | 5 |
8.900000 | 0.620000 | 0.180000 | 0.176000 | 145.000000 | 0.998600 | 0.880000 | 9.200000 | 5 |
8.900000 | 0.620000 | 0.190000 | 0.170000 | 148.000000 | 0.998600 | 0.930000 | 9.200000 | 5 |
8.500000 | 0.280000 | 0.560000 | 0.092000 | 103.000000 | 0.996900 | 0.750000 | 10.500000 | 7 |
8.100000 | 0.560000 | 0.280000 | 0.368000 | 56.000000 | 0.996800 | 1.280000 | 9.300000 | 5 |
7.400000 | 0.590000 | 0.080000 | 0.086000 | 29.000000 | 0.997400 | 0.500000 | 9.000000 | 4 |
7.900000 | 0.320000 | 0.510000 | 0.341000 | 56.000000 | 0.996900 | 1.080000 | 9.200000 | 6 |
// Bridges a DataFrame into KotlinDL's on-heap dataset representation,
// using [labelColumnName] as the label (Y) column.
fun <T> DataFrame<T>.toOnHeapDataset(labelColumnName: String): OnHeapDataset =
    OnHeapDataset.create(dataframe = this, yColumn = labelColumnName)
/**
 * Builds an [OnHeapDataset] from a [DataFrame]: every column except [yColumn]
 * becomes part of the feature vector, and [yColumn] becomes the label array.
 *
 * Feature values are converted through [Number], so both Float and Double
 * columns are accepted. (The original used an unchecked `as List<Float>` cast
 * that would fail at runtime unless every column had already been converted
 * to Float.)
 */
fun OnHeapDataset.Companion.create(
    dataframe: DataFrame<Any?>,
    yColumn: String
): OnHeapDataset {
    // Feature matrix: one FloatArray per row, with the label column excluded.
    fun extractX(): Array<FloatArray> =
        dataframe.remove(yColumn).rows()
            .map { row -> row.values().map { (it as Number).toFloat() }.toFloatArray() }
            .toTypedArray()

    // Labels flattened into a single FloatArray.
    fun extractY(): FloatArray =
        dataframe.get { yColumn<Float>() }.toList().toFloatArray()

    return create(
        ::extractX,
        ::extractY
    )
}
// Convert all Double columns to Float (KotlinDL consumes Float features),
// build the dataset with "quality" as the label, and split 80/20 train/test.
val (train, test) = df.convert { colsOf<Double>() }.toFloat()
    .toOnHeapDataset(labelColumnName = "quality")
    .split(0.8)
Define simple neural network with only 2 dense layers
// Input width = number of features in one training row.
val inputNeurons = train.x[0].size.toLong()
// Regression MLP: two hidden Tanh layers (10x the input width each) and a
// single linear output neuron that predicts the quality score.
val model = Sequential.of(
    Input(
        inputNeurons,
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal(),
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal(),
    ),
    Dense(
        outputSize = 1,
        activation = Activations.Linear,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal(),
    )
)
// MSE loss for the regression objective; MAE reported as the readable metric.
model.compile(optimizer = Adam(), loss = Losses.MSE, metric = Metrics.MAE)
model.printSummary()
============================================================================== Model type: Sequential ______________________________________________________________________________ Layer (type) Output Shape Param # ============================================================================== input_1(Input) [None, 8] 0 ______________________________________________________________________________ dense_2(Dense) [None, 80] 720 ______________________________________________________________________________ dense_3(Dense) [None, 80] 6480 ______________________________________________________________________________ dense_4(Dense) [None, 1] 81 ______________________________________________________________________________ ============================================================================== Total trainable params: 7281 Total frozen params: 0 Total params: 7281 ______________________________________________________________________________
Train it!
// Train the model; fit() returns the per-epoch loss/metric history,
// of which we show the last few epochs.
val trainHist = model.fit(train, batchSize = 500, epochs=2000)
trainHist.epochHistory.toDataFrame().tail()
epochIndex | lossValue | metricValues | valLossValue | valMetricValues |
---|---|---|---|---|
1996 | 0.334851 | [0.45112717151641846] | NaN | [NaN] |
1997 | 0.334814 | [0.45109668374061584] | NaN | [NaN] |
1998 | 0.334778 | [0.45106613636016846] | NaN | [NaN] |
1999 | 0.334741 | [0.45103588700294495] | NaN | [NaN] |
2000 | 0.334705 | [0.45100536942481995] | NaN | [NaN] |
Let's check that our network predicts values more or less correctly:
model.predictSoftly(test.x[9])[0]
5.24972
test.y[9]
5.0
Close the model:
model.close()
// Holds the four frames produced by trainTestSplit: the feature frames (X)
// and the label frames (Y) for both the train and the test partitions.
data class TrainTestSplitResult<T>(
    val trainX: DataFrame<T>,
    val trainY: DataFrame<T>,
    val testX: DataFrame<T>,
    val testY: DataFrame<T>,
)
/**
 * Randomly shuffles the rows of [d] and splits them into train/test parts.
 *
 * @param d the full dataset
 * @param col the label column name; it is separated out into the Y frames
 * @param trainPart fraction of rows, in [0, 1], assigned to the train split
 */
fun <T> trainTestSplit(
    d: DataFrame<T>,
    col: String,
    trainPart: Double,
): TrainTestSplitResult<T> {
    require(trainPart in 0.0..1.0) { "trainPart must be in [0, 1], got $trainPart" }

    val n = d.count()
    // ceil so that any non-zero trainPart puts at least one row into train
    val trainN = ceil(n * trainPart).toInt()

    // Shuffle indices, not rows, then slice off the two index ranges.
    val shuffledInd = (0..<n).shuffled()
    val trainInd = shuffledInd.subList(0, trainN)
    val testInd = shuffledInd.subList(trainN, n)

    val train = d[trainInd]
    val test = d[testInd]

    val trainX = train.select { allExcept(col) }
    val trainY = train.select(col)
    val testX = test.select { allExcept(col) }
    val testY = test.select(col)

    return TrainTestSplitResult(trainX, trainY, testX, testY)
}
Let's create and then train the model as we did before
// 80/20 split on "quality"; destructuring yields the four frames directly.
val (trainX, trainY, testX, testY) =
    trainTestSplit(df, "quality", 0.8)
// Merges all numeric columns into one FloatArray per row and returns the
// rows as an array (KotlinDL's feature-matrix shape). Named lambda
// parameters replace the original's shadowed implicit `it` in the nested
// lambdas, which made the inner conversion ambiguous to read.
fun <T> DataFrame<T>.toX(): Array<FloatArray> =
    merge { colsOf<Number>() }
        .by { row -> row.map { value -> value.toFloat() }.toFloatArray() }
        .into("X")
        .get { "X"<FloatArray>() }
        .toList()
        .toTypedArray()
// Extracts the label column as a FloatArray. The column name is now a
// parameter (defaulting to "quality" for backward compatibility) instead of
// being hard-coded, so the helper works for any Int label column.
fun <T> DataFrame<T>.toY(labelColumnName: String = "quality"): FloatArray =
    get { labelColumnName<Int>() }
        .asIterable()
        .map { it.toFloat() }
        .toFloatArray()
// Materialize the four frames as KotlinDL array representations.
val trainXDL = trainX.toX()
val trainYDL = trainY.toY()
val testXDL = testX.toX()
val testYDL = testY.toY()
// Wrap the arrays into OnHeapDatasets for training and evaluation.
val trainKotlinDLDataset = OnHeapDataset.create({ trainXDL }, { trainYDL })
val testKotlinDLDataset = OnHeapDataset.create({ testXDL }, { testYDL })
// Input width taken from the NEW split (trainXDL) rather than from the first
// notebook section's `train` dataset — the original read `train.x[0].size`,
// silently depending on stale state from the earlier cells.
val inputNeurons = trainXDL[0].size.toLong()
// Same architecture as before: two hidden Tanh layers (10x input width) and
// one linear output neuron for the quality regression.
val model2 = Sequential.of(
    Input(
        inputNeurons
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    ),
    Dense(
        outputSize = 1,
        activation = Activations.Linear,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    )
)
// MSE loss for regression; MAE as the reported metric (matches model #1).
model2.compile(optimizer = Adam(), loss = Losses.MSE, metric = Metrics.MAE)
model2.printSummary()
============================================================================== Model type: Sequential ______________________________________________________________________________ Layer (type) Output Shape Param # ============================================================================== input_1(Input) [None, 8] 0 ______________________________________________________________________________ dense_2(Dense) [None, 80] 720 ______________________________________________________________________________ dense_3(Dense) [None, 80] 6480 ______________________________________________________________________________ dense_4(Dense) [None, 1] 81 ______________________________________________________________________________ ============================================================================== Total trainable params: 7281 Total frozen params: 0 Total params: 7281 ______________________________________________________________________________
// Bug fix: train on the dataset built from the NEW split
// (trainKotlinDLDataset) — the original passed `train`, the dataset from the
// first section, so model2 never saw the data prepared above.
val trainHist = model2.fit(trainKotlinDLDataset, batchSize = 500, epochs = 2000)
trainHist.epochHistory.toDataFrame().tail()
epochIndex | lossValue | metricValues | valLossValue | valMetricValues |
---|---|---|---|---|
1996 | 0.334532 | [0.4508610963821411] | NaN | [NaN] |
1997 | 0.334495 | [0.45082950592041016] | NaN | [NaN] |
1998 | 0.334458 | [0.45079800486564636] | NaN | [NaN] |
1999 | 0.334421 | [0.4507667124271393] | NaN | [NaN] |
2000 | 0.334384 | [0.45073509216308594] | NaN | [NaN] |
model2.predictSoftly(testXDL[9])[0]
5.8768764
testYDL[9]
5.0
We can also compare the predicted and ground-truth values to check that the predictions are reasonable.
// Build a comparison frame of rounded predictions vs. ground truth.
// Mapping over the elements directly replaces the original
// `mapIndexed { i, _ -> ...[i] }` pattern, which ignored the element and
// re-indexed the array by hand.
val predicted = testXDL.map { row ->
    round(model2.predictSoftly(row)[0]).toInt()
}.toColumn("predicted")
val ground_truth = testYDL.map { label ->
    label.toInt()
}.toColumn("ground_truth")
val predDf = dataFrameOf(predicted, ground_truth)
predDf.head()
predicted | ground_truth |
---|---|
6 | 6 |
5 | 4 |
6 | 6 |
5 | 5 |
5 | 5 |
// NOTE(review): `inds` appears unused in the rest of this notebook — confirm before removing.
val inds = List(10) { it + 1 }
// Confusion-matrix-style pivot: one row per ground-truth value, one counting
// column per predicted value.
val ctab = predDf
    .groupBy { ground_truth }.pivotCounts(inward = false) { predicted }
    .sortBy { ground_truth }
// Color each count cell: the closer the predicted value is to the ground
// truth, the greener the background (k = 1 on the diagonal).
ctab.format { drop(1) }.perRowCol { row, col ->
    val y = col.name().toInt()
    val x = row.ground_truth
    val k = 1.0 - abs(x - y) / 10.0
    background(RGBColor(50, (50 + k * 200).toInt().toShort(), 50))
}.toHtml()
// Per-row absolute deviation between prediction and ground truth,
// then summary statistics of that deviation column.
val predDf2 = predDf.add("avg_dev") { abs(predicted - ground_truth) }
predDf2.avg_dev.cast<Double>().describe()
name | type | count | unique | nulls | top | freq | mean | std | min | p25 | median | p75 | max |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
avg_dev | Int | 319 | 3 | 0 | 0 | 196 | 0.407524 | 0.535007 | 0 | 0.000000 | 0.000000 | 1.000000 | 2 |
predDf2.sortBy { avg_dev }[(0.7 * (319 - 1)).toInt()]
predicted | ground_truth | avg_dev |
---|---|---|
6 | 5 | 1 |
model2.close()