Use the latest versions of the DataFrame and KotlinDL libraries from the version repository.
%useLatestDescriptors
%use dataframe, kotlin-dl
Read the dataframe from a CSV file and print its first few lines.
// Load the red-wine quality dataset (semicolon-delimited CSV) into a DataFrame.
val raw_df = DataFrame.readCSV(fileOrUrl = "winequality-red.csv", delimiter = ';')
// Preview the first rows to sanity-check parsing and column names.
raw_df.head()
fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality |
---|---|---|---|---|---|---|---|---|---|---|---|
7.400000 | 0.700000 | 0.000000 | 1.900000 | 0.076000 | 11.000000 | 34.000000 | 0.997800 | 3.510000 | 0.560000 | 9.400000 | 5 |
7.800000 | 0.880000 | 0.000000 | 2.600000 | 0.098000 | 25.000000 | 67.000000 | 0.996800 | 3.200000 | 0.680000 | 9.800000 | 5 |
7.800000 | 0.760000 | 0.040000 | 2.300000 | 0.092000 | 15.000000 | 54.000000 | 0.997000 | 3.260000 | 0.650000 | 9.800000 | 5 |
11.200000 | 0.280000 | 0.560000 | 1.900000 | 0.075000 | 17.000000 | 60.000000 | 0.998000 | 3.160000 | 0.580000 | 9.800000 | 6 |
7.400000 | 0.700000 | 0.000000 | 1.900000 | 0.076000 | 11.000000 | 34.000000 | 0.997800 | 3.510000 | 0.560000 | 9.400000 | 5 |
Note: For formatting, the DataFrame needs to be rendered as HTML. This means that when running in Kotlin Notebook, "Render DataFrame tables natively" needs to be turned off.
// Render the pairwise correlation matrix with a linear background gradient
// (-1.0 = red, +1.0 = green) to spot features that correlate with `quality`.
raw_df.corr().format { colsOf<Double>() }.with {
linearBg(value = it, from = -1.0 to red, to = 1.0 to green)
}
column | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality |
---|---|---|---|---|---|---|---|---|---|---|---|---|
fixed acidity | 1.000000 | -0.256131 | 0.671703 | 0.114777 | 0.093705 | -0.153794 | -0.113181 | 0.668047 | -0.682978 | 0.183006 | -0.061668 | 0.124052 |
volatile acidity | -0.256131 | 1.000000 | -0.552496 | 0.001918 | 0.061298 | -0.010504 | 0.076470 | 0.022026 | 0.234937 | -0.260987 | -0.202288 | -0.390558 |
citric acid | 0.671703 | -0.552496 | 1.000000 | 0.143577 | 0.203823 | -0.060978 | 0.035533 | 0.364947 | -0.541904 | 0.312770 | 0.109903 | 0.226373 |
residual sugar | 0.114777 | 0.001918 | 0.143577 | 1.000000 | 0.055610 | 0.187049 | 0.203028 | 0.355283 | -0.085652 | 0.005527 | 0.042075 | 0.013732 |
chlorides | 0.093705 | 0.061298 | 0.203823 | 0.055610 | 1.000000 | 0.005562 | 0.047400 | 0.200632 | -0.265026 | 0.371260 | -0.221141 | -0.128907 |
free sulfur dioxide | -0.153794 | -0.010504 | -0.060978 | 0.187049 | 0.005562 | 1.000000 | 0.667666 | -0.021946 | 0.070377 | 0.051658 | -0.069408 | -0.050656 |
total sulfur dioxide | -0.113181 | 0.076470 | 0.035533 | 0.203028 | 0.047400 | 0.667666 | 1.000000 | 0.071269 | -0.066495 | 0.042947 | -0.205654 | -0.185100 |
density | 0.668047 | 0.022026 | 0.364947 | 0.355283 | 0.200632 | -0.021946 | 0.071269 | 1.000000 | -0.341699 | 0.148506 | -0.496180 | -0.174919 |
pH | -0.682978 | 0.234937 | -0.541904 | -0.085652 | -0.265026 | 0.070377 | -0.066495 | -0.341699 | 1.000000 | -0.196648 | 0.205633 | -0.057731 |
sulphates | 0.183006 | -0.260987 | 0.312770 | 0.005527 | 0.371260 | 0.051658 | 0.042947 | 0.148506 | -0.196648 | 1.000000 | 0.093595 | 0.251397 |
alcohol | -0.061668 | -0.202288 | 0.109903 | 0.042075 | -0.221141 | -0.069408 | -0.205654 | -0.496180 | 0.205633 | 0.093595 | 1.000000 | 0.476166 |
quality | 0.124052 | -0.390558 | 0.226373 | 0.013732 | -0.128907 | -0.050656 | -0.185100 | -0.174919 | -0.057731 | 0.251397 | 0.476166 | 1.000000 |
Based on the correlation matrix, we can remove some columns that seem to be insignificant.
// Drop the features with the weakest correlation to `quality`
// (see the correlation matrix above) and keep the rest.
val df = raw_df.remove { `free sulfur dioxide` and `residual sugar` and pH }
df
fixed acidity | volatile acidity | citric acid | chlorides | total sulfur dioxide | density | sulphates | alcohol | quality |
---|---|---|---|---|---|---|---|---|
7.400000 | 0.700000 | 0.000000 | 0.076000 | 34.000000 | 0.997800 | 0.560000 | 9.400000 | 5 |
7.800000 | 0.880000 | 0.000000 | 0.098000 | 67.000000 | 0.996800 | 0.680000 | 9.800000 | 5 |
7.800000 | 0.760000 | 0.040000 | 0.092000 | 54.000000 | 0.997000 | 0.650000 | 9.800000 | 5 |
11.200000 | 0.280000 | 0.560000 | 0.075000 | 60.000000 | 0.998000 | 0.580000 | 9.800000 | 6 |
7.400000 | 0.700000 | 0.000000 | 0.076000 | 34.000000 | 0.997800 | 0.560000 | 9.400000 | 5 |
7.400000 | 0.660000 | 0.000000 | 0.075000 | 40.000000 | 0.997800 | 0.560000 | 9.400000 | 5 |
7.900000 | 0.600000 | 0.060000 | 0.069000 | 59.000000 | 0.996400 | 0.460000 | 9.400000 | 5 |
7.300000 | 0.650000 | 0.000000 | 0.065000 | 21.000000 | 0.994600 | 0.470000 | 10.000000 | 7 |
7.800000 | 0.580000 | 0.020000 | 0.073000 | 18.000000 | 0.996800 | 0.570000 | 9.500000 | 7 |
7.500000 | 0.500000 | 0.360000 | 0.071000 | 102.000000 | 0.997800 | 0.800000 | 10.500000 | 5 |
6.700000 | 0.580000 | 0.080000 | 0.097000 | 65.000000 | 0.995900 | 0.540000 | 9.200000 | 5 |
7.500000 | 0.500000 | 0.360000 | 0.071000 | 102.000000 | 0.997800 | 0.800000 | 10.500000 | 5 |
5.600000 | 0.615000 | 0.000000 | 0.089000 | 59.000000 | 0.994300 | 0.520000 | 9.900000 | 5 |
7.800000 | 0.610000 | 0.290000 | 0.114000 | 29.000000 | 0.997400 | 1.560000 | 9.100000 | 5 |
8.900000 | 0.620000 | 0.180000 | 0.176000 | 145.000000 | 0.998600 | 0.880000 | 9.200000 | 5 |
8.900000 | 0.620000 | 0.190000 | 0.170000 | 148.000000 | 0.998600 | 0.930000 | 9.200000 | 5 |
8.500000 | 0.280000 | 0.560000 | 0.092000 | 103.000000 | 0.996900 | 0.750000 | 10.500000 | 7 |
8.100000 | 0.560000 | 0.280000 | 0.368000 | 56.000000 | 0.996800 | 1.280000 | 9.300000 | 5 |
7.400000 | 0.590000 | 0.080000 | 0.086000 | 29.000000 | 0.997400 | 0.500000 | 9.000000 | 4 |
7.900000 | 0.320000 | 0.510000 | 0.341000 | 56.000000 | 0.996900 | 1.080000 | 9.200000 | 6 |
// Bridges DataFrame and KotlinDL: converts this frame into an OnHeapDataset,
// treating [labelColumnName] as the target (y) column and every other column
// as a feature.
fun <T> DataFrame<T>.toOnHeapDataset(labelColumnName: String): OnHeapDataset =
    OnHeapDataset.create(
        dataframe = this,
        yColumn = labelColumnName,
    )
// Factory building an OnHeapDataset from a DataFrame: all columns except
// [yColumn] become per-row feature vectors; [yColumn] becomes the label array.
fun OnHeapDataset.Companion.create(
    dataframe: DataFrame<Any?>,
    yColumn: String
): OnHeapDataset {
    // Feature matrix: one FloatArray per row, with the label column removed.
    // NOTE(review): the unchecked cast assumes every remaining cell is a Float
    // (callers convert Double columns to Float beforehand) — confirm at call sites.
    val features = {
        dataframe.remove(yColumn).rows()
            .map { row -> (row.values() as List<Float>).toFloatArray() }
            .toTypedArray()
    }
    // Label vector: the y column materialized as a FloatArray.
    val labels = {
        dataframe.get { yColumn<Float>() }.toList().toFloatArray()
    }
    return create(features, labels)
}
// Convert all Double columns to Float (KotlinDL expects Float features),
// wrap as an OnHeapDataset labeled by `quality`, and split 80/20 train/test.
val (train, test) = df.convert { colsOf<Double>() }.toFloat()
.toOnHeapDataset(labelColumnName = "quality")
.split(0.8)
Define a simple neural network with two hidden dense layers and a linear output layer.
// Input dimensionality = number of feature columns in the training split.
val inputNeurons = train.x[0].size.toLong()
// Regression MLP: input -> (10x input) -> (10x input) -> 1,
// tanh hidden activations and a linear output for the quality score.
val model = Sequential.of(
Input(
inputNeurons,
),
Dense(
outputSize = (inputNeurons * 10).toInt(),
activation = Activations.Tanh,
kernelInitializer = HeNormal(),
biasInitializer = HeNormal(),
),
Dense(
outputSize = (inputNeurons * 10).toInt(),
activation = Activations.Tanh,
kernelInitializer = HeNormal(),
biasInitializer = HeNormal(),
),
// Single linear output neuron: this is a regression, not classification.
Dense(
outputSize = 1,
activation = Activations.Linear,
kernelInitializer = HeNormal(),
biasInitializer = HeNormal(),
)
)
// MSE loss with MAE as the human-readable tracking metric.
model.compile(optimizer = Adam(), loss = Losses.MSE, metric = Metrics.MAE)
model.printSummary()
============================================================================== Model type: Sequential ______________________________________________________________________________ Layer (type) Output Shape Param # ============================================================================== input_1(Input) [None, 8] 0 ______________________________________________________________________________ dense_2(Dense) [None, 80] 720 ______________________________________________________________________________ dense_3(Dense) [None, 80] 6480 ______________________________________________________________________________ dense_4(Dense) [None, 1] 81 ______________________________________________________________________________ ============================================================================== Total trainable params: 7281 Total frozen params: 0 Total params: 7281 ______________________________________________________________________________
Train it!
// Train for 2000 epochs; no validation data is supplied, which is why the
// val* columns in the history below are NaN.
val trainHist = model.fit(train, batchSize = 500, epochs=2000)
trainHist.epochHistory.toDataFrame().tail()
epochIndex | lossValue | metricValues | valLossValue | valMetricValues |
---|---|---|---|---|
1996 | 0.334877 | [0.45114806294441223] | NaN | [NaN] |
1997 | 0.334841 | [0.45111772418022156] | NaN | [NaN] |
1998 | 0.334805 | [0.45108696818351746] | NaN | [NaN] |
1999 | 0.334768 | [0.45105621218681335] | NaN | [NaN] |
2000 | 0.334732 | [0.4510253965854645] | NaN | [NaN] |
Let's check that our network predicts values more or less correctly:
// Predict the quality for one held-out sample (single regression output).
model.predictSoftly(test.x[9])[0]
5.2477317
// Ground-truth label for the same held-out sample.
test.y[9]
5.0
Close the model:
// Release the native TensorFlow resources held by the model.
model.close()
// Holds the four pieces of a train/test split: feature (X) and label (Y)
// frames for both partitions.
data class TrainTestSplitResult<T>(
val trainX: DataFrame<T>,
val trainY: DataFrame<T>,
val testX: DataFrame<T>,
val testY: DataFrame<T>,
)
// Randomly splits [d] into train/test partitions and separates the label
// column [col] from the features. [trainPart] is the fraction (0..1) of rows
// assigned to the training partition.
fun <T> trainTestSplit(
    d: DataFrame<T>,
    col: String,
    trainPart: Double,
): TrainTestSplitResult<T> {
    val rowCount = d.count()
    val trainSize = ceil(rowCount * trainPart).toInt()
    // Shuffle row indices, then carve off the first trainSize for training.
    val shuffled = (0 until rowCount).shuffled()
    val trainRows = d[shuffled.take(trainSize)]
    val testRows = d[shuffled.drop(trainSize)]
    return TrainTestSplitResult(
        trainX = trainRows.select { all().except(cols(col)) },
        trainY = trainRows.select(col),
        testX = testRows.select { all().except(cols(col)) },
        testY = testRows.select(col),
    )
}
Let's create and then train the model as we did before
// Split the reduced frame into train/test feature and label frames (80% train).
val (trainX, trainY, testX, testY) =
trainTestSplit(df, "quality", 0.8)
// Packs all numeric columns of each row into a single FloatArray feature
// vector by merging them into a temporary "X" column and extracting it.
fun <T> DataFrame<T>.toX(): Array<FloatArray> =
merge { colsOf<Number>() }.by { it.map { it.toFloat() }.toFloatArray() }.into("X")
.get { "X"<FloatArray>() }
.toList()
.toTypedArray()
// Extracts the label column as a FloatArray.
// NOTE(review): the column name "quality" is hard-coded here, so this only
// works on frames that still contain an Int `quality` column.
fun <T> DataFrame<T>.toY(): FloatArray =
get { "quality"<Int>() }
.asIterable()
.map { it.toFloat() }
.toFloatArray()
// Materialize the DataFrame splits as plain arrays, then wrap them in
// KotlinDL's in-memory dataset representation.
val trainXDL = trainX.toX()
val trainYDL = trainY.toY()
val testXDL = testX.toX()
val testYDL = testY.toY()
val trainKotlinDLDataset = OnHeapDataset.create({ trainXDL }, { trainYDL })
val testKotlinDLDataset = OnHeapDataset.create({ testXDL }, { testYDL })
// Input dimensionality taken from the NEW feature arrays.
// FIX: the original read `train.x[0].size` — a stale reference to the first
// pipeline's dataset (whose model was already closed) — instead of the data
// this model is actually built for. The value is the same (8 features), but
// the derivation now matches the data in use.
val inputNeurons = trainXDL[0].size.toLong()
// Same architecture as before: input -> (10x) -> (10x) -> 1 regression MLP.
val model2 = Sequential.of(
    Input(
        inputNeurons
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    ),
    // Single linear output neuron for the regression target.
    Dense(
        outputSize = 1,
        activation = Activations.Linear,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    )
)
model2.compile(optimizer = Adam(), loss = Losses.MSE, metric = Metrics.MAE)
model2.printSummary()
============================================================================== Model type: Sequential ______________________________________________________________________________ Layer (type) Output Shape Param # ============================================================================== input_1(Input) [None, 8] 0 ______________________________________________________________________________ dense_2(Dense) [None, 80] 720 ______________________________________________________________________________ dense_3(Dense) [None, 80] 6480 ______________________________________________________________________________ dense_4(Dense) [None, 1] 81 ______________________________________________________________________________ ============================================================================== Total trainable params: 7281 Total frozen params: 0 Total params: 7281 ______________________________________________________________________________
// FIX: the original trained on `train` — the OnHeapDataset from the FIRST
// pipeline — so the freshly built trainKotlinDLDataset was never used.
// Train on the dataset this section actually constructed.
val trainHist = model2.fit(trainKotlinDLDataset, batchSize = 500, epochs = 2000)
trainHist.epochHistory.toDataFrame().tail()
epochIndex | lossValue | metricValues | valLossValue | valMetricValues |
---|---|---|---|---|
1996 | 0.334773 | [0.45107388496398926] | NaN | [NaN] |
1997 | 0.334737 | [0.45104312896728516] | NaN | [NaN] |
1998 | 0.334700 | [0.45101237297058105] | NaN | [NaN] |
1999 | 0.334663 | [0.4509815275669098] | NaN | [NaN] |
2000 | 0.334626 | [0.45095062255859375] | NaN | [NaN] |
model2.predictSoftly(testXDL[9])[0]
6.6911993
testYDL[9]
7.0
We can also compare the predicted and ground-truth values to verify that the predictions are reasonable.
// Round each regression output to the nearest integer quality score.
// Idiom fix: map directly over the arrays instead of the original
// `mapIndexed { i, _ -> arr[i] }` self-indexing pattern.
val predicted = testXDL.map { row ->
    round(model2.predictSoftly(row)[0]).toInt()
}.toColumn("predicted")
val ground_truth = testYDL.map { it.toInt() }.toColumn("ground_truth")
// Side-by-side frame of predictions vs. true labels.
val predDf = dataFrameOf(predicted, ground_truth)
predDf.head()
predicted | ground_truth |
---|---|
5 | 5 |
5 | 5 |
6 | 5 |
5 | 5 |
6 | 6 |
// NOTE(review): `inds` is never used anywhere below — candidate for removal.
val inds = List(10) { it + 1 }
// Confusion-matrix-style cross-tab: rows = true quality, columns = predicted.
val ctab = predDf
.groupBy { ground_truth }.pivotCounts(inward = false) { predicted }
.sortBy { ground_truth }
// Color each count cell: the closer the predicted column to the true row
// value, the greener the background.
ctab.format { drop(1) }.perRowCol { row, col ->
val y = col.name().toInt()
val x = row.ground_truth
// k in [0, 1]: 1.0 on the diagonal, decreasing with the absolute error.
val k = 1.0 - abs(x - y) / 10.0
background(RGBColor(50, (50 + k * 200).toInt().toShort(), 50))
}
ground_truth | 5 | 6 | 4 | 7 |
---|---|---|---|---|
3 | 2 | 0 | 1 | 0 |
4 | 8 | 2 | 1 | 0 |
5 | 105 | 42 | 1 | 0 |
6 | 34 | 78 | 0 | 5 |
7 | 0 | 20 | 0 | 16 |
8 | 0 | 1 | 0 | 3 |
// Absolute per-row prediction error, then its summary statistics.
// NOTE(review): values are Int (see output); the cast<Double>() appears to
// only re-type the column for describe() — confirm it is needed.
val predDf2 = predDf.add("avg_dev") { abs(predicted - ground_truth) }
predDf2.avg_dev.cast<Double>().describe()
name | type | count | unique | nulls | top | freq | mean | std | min | median | max |
---|---|---|---|---|---|---|---|---|---|---|---|
avg_dev | Int | 319 | 3 | 0 | 0 | 200 | 0.388715 | 0.519432 | 0 | 0 | 2 |
// 70th-percentile row of the sorted absolute deviations.
// Generalization: use the actual row count instead of the hard-coded 319,
// so the cell stays correct if the test split size changes.
predDf2.sortBy { avg_dev }[(0.7 * (predDf2.count() - 1)).toInt()]
predicted | ground_truth | avg_dev |
---|---|---|
6 | 5 | 1 |
// Release the second model's native resources.
model2.close()