Use the latest versions of the DataFrame and KotlinDL libraries from the version repository.
%useLatestDescriptors
%use dataframe, kotlin-dl
Read the dataframe from a CSV file and print its first few lines.
// The wine-quality CSV uses ';' as the field separator, not the default ','
val rawDf = DataFrame.readCsv(fileOrUrl = "winequality-red.csv", delimiter = ';')
// Preview the first rows to sanity-check the parse
rawDf.head()
fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality |
---|---|---|---|---|---|---|---|---|---|---|---|
7,400000 | 0,700000 | 0,000000 | 1,900000 | 0,076000 | 11,000000 | 34,000000 | 0,997800 | 3,510000 | 0,560000 | 9,400000 | 5 |
7,800000 | 0,880000 | 0,000000 | 2,600000 | 0,098000 | 25,000000 | 67,000000 | 0,996800 | 3,200000 | 0,680000 | 9,800000 | 5 |
7,800000 | 0,760000 | 0,040000 | 2,300000 | 0,092000 | 15,000000 | 54,000000 | 0,997000 | 3,260000 | 0,650000 | 9,800000 | 5 |
11,200000 | 0,280000 | 0,560000 | 1,900000 | 0,075000 | 17,000000 | 60,000000 | 0,998000 | 3,160000 | 0,580000 | 9,800000 | 6 |
7,400000 | 0,700000 | 0,000000 | 1,900000 | 0,076000 | 11,000000 | 34,000000 | 0,997800 | 3,510000 | 0,560000 | 9,400000 | 5 |
Note: For formatting, the DataFrame needs to be rendered as HTML. This means that when running in Kotlin Notebook, "Render DataFrame tables natively" needs to be turned off.
// Correlation matrix of all numeric columns, with each cell's background
// interpolated from red (r = -1) through to green (r = +1)
rawDf.corr().format { colsOf<Double>() }.with {
linearBg(value = it, from = -1.0 to red, to = 1.0 to green)
}
column | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality |
---|---|---|---|---|---|---|---|---|---|---|---|---|
fixed acidity | 1,000000 | -0,256131 | 0,671703 | 0,114777 | 0,093705 | -0,153794 | -0,113181 | 0,668047 | -0,682978 | 0,183006 | -0,061668 | 0,124052 |
volatile acidity | -0,256131 | 1,000000 | -0,552496 | 0,001918 | 0,061298 | -0,010504 | 0,076470 | 0,022026 | 0,234937 | -0,260987 | -0,202288 | -0,390558 |
citric acid | 0,671703 | -0,552496 | 1,000000 | 0,143577 | 0,203823 | -0,060978 | 0,035533 | 0,364947 | -0,541904 | 0,312770 | 0,109903 | 0,226373 |
residual sugar | 0,114777 | 0,001918 | 0,143577 | 1,000000 | 0,055610 | 0,187049 | 0,203028 | 0,355283 | -0,085652 | 0,005527 | 0,042075 | 0,013732 |
chlorides | 0,093705 | 0,061298 | 0,203823 | 0,055610 | 1,000000 | 0,005562 | 0,047400 | 0,200632 | -0,265026 | 0,371260 | -0,221141 | -0,128907 |
free sulfur dioxide | -0,153794 | -0,010504 | -0,060978 | 0,187049 | 0,005562 | 1,000000 | 0,667666 | -0,021946 | 0,070377 | 0,051658 | -0,069408 | -0,050656 |
total sulfur dioxide | -0,113181 | 0,076470 | 0,035533 | 0,203028 | 0,047400 | 0,667666 | 1,000000 | 0,071269 | -0,066495 | 0,042947 | -0,205654 | -0,185100 |
density | 0,668047 | 0,022026 | 0,364947 | 0,355283 | 0,200632 | -0,021946 | 0,071269 | 1,000000 | -0,341699 | 0,148506 | -0,496180 | -0,174919 |
pH | -0,682978 | 0,234937 | -0,541904 | -0,085652 | -0,265026 | 0,070377 | -0,066495 | -0,341699 | 1,000000 | -0,196648 | 0,205633 | -0,057731 |
sulphates | 0,183006 | -0,260987 | 0,312770 | 0,005527 | 0,371260 | 0,051658 | 0,042947 | 0,148506 | -0,196648 | 1,000000 | 0,093595 | 0,251397 |
alcohol | -0,061668 | -0,202288 | 0,109903 | 0,042075 | -0,221141 | -0,069408 | -0,205654 | -0,496180 | 0,205633 | 0,093595 | 1,000000 | 0,476166 |
quality | 0,124052 | -0,390558 | 0,226373 | 0,013732 | -0,128907 | -0,050656 | -0,185100 | -0,174919 | -0,057731 | 0,251397 | 0,476166 | 1,000000 |
Based on the correlation matrix, we can remove some columns that appear insignificant for predicting quality.
// These three features have the weakest correlation with quality
// (|r| between roughly 0.01 and 0.06 in the matrix above), so drop them
val df = rawDf.remove { `free sulfur dioxide` and `residual sugar` and pH }
df
fixed acidity | volatile acidity | citric acid | chlorides | total sulfur dioxide | density | sulphates | alcohol | quality |
---|---|---|---|---|---|---|---|---|
7,400000 | 0,700000 | 0,000000 | 0,076000 | 34,000000 | 0,997800 | 0,560000 | 9,400000 | 5 |
7,800000 | 0,880000 | 0,000000 | 0,098000 | 67,000000 | 0,996800 | 0,680000 | 9,800000 | 5 |
7,800000 | 0,760000 | 0,040000 | 0,092000 | 54,000000 | 0,997000 | 0,650000 | 9,800000 | 5 |
11,200000 | 0,280000 | 0,560000 | 0,075000 | 60,000000 | 0,998000 | 0,580000 | 9,800000 | 6 |
7,400000 | 0,700000 | 0,000000 | 0,076000 | 34,000000 | 0,997800 | 0,560000 | 9,400000 | 5 |
7,400000 | 0,660000 | 0,000000 | 0,075000 | 40,000000 | 0,997800 | 0,560000 | 9,400000 | 5 |
7,900000 | 0,600000 | 0,060000 | 0,069000 | 59,000000 | 0,996400 | 0,460000 | 9,400000 | 5 |
7,300000 | 0,650000 | 0,000000 | 0,065000 | 21,000000 | 0,994600 | 0,470000 | 10,000000 | 7 |
7,800000 | 0,580000 | 0,020000 | 0,073000 | 18,000000 | 0,996800 | 0,570000 | 9,500000 | 7 |
7,500000 | 0,500000 | 0,360000 | 0,071000 | 102,000000 | 0,997800 | 0,800000 | 10,500000 | 5 |
6,700000 | 0,580000 | 0,080000 | 0,097000 | 65,000000 | 0,995900 | 0,540000 | 9,200000 | 5 |
7,500000 | 0,500000 | 0,360000 | 0,071000 | 102,000000 | 0,997800 | 0,800000 | 10,500000 | 5 |
5,600000 | 0,615000 | 0,000000 | 0,089000 | 59,000000 | 0,994300 | 0,520000 | 9,900000 | 5 |
7,800000 | 0,610000 | 0,290000 | 0,114000 | 29,000000 | 0,997400 | 1,560000 | 9,100000 | 5 |
8,900000 | 0,620000 | 0,180000 | 0,176000 | 145,000000 | 0,998600 | 0,880000 | 9,200000 | 5 |
8,900000 | 0,620000 | 0,190000 | 0,170000 | 148,000000 | 0,998600 | 0,930000 | 9,200000 | 5 |
8,500000 | 0,280000 | 0,560000 | 0,092000 | 103,000000 | 0,996900 | 0,750000 | 10,500000 | 7 |
8,100000 | 0,560000 | 0,280000 | 0,368000 | 56,000000 | 0,996800 | 1,280000 | 9,300000 | 5 |
7,400000 | 0,590000 | 0,080000 | 0,086000 | 29,000000 | 0,997400 | 0,500000 | 9,000000 | 4 |
7,900000 | 0,320000 | 0,510000 | 0,341000 | 56,000000 | 0,996900 | 1,080000 | 9,200000 | 6 |
// Bridges a DataFrame into KotlinDL's OnHeapDataset, treating the column
// named [labelColumnName] as the label (Y) and all remaining columns as features (X).
fun <T> DataFrame<T>.toOnHeapDataset(labelColumnName: String): OnHeapDataset =
    OnHeapDataset.create(dataframe = this, yColumn = labelColumnName)
// Factory extension: builds an OnHeapDataset from a DataFrame by splitting
// it into a feature matrix (every column except [yColumn]) and a label vector.
fun OnHeapDataset.Companion.create(
dataframe: DataFrame<Any?>,
yColumn: String
): OnHeapDataset {
// One FloatArray per row, label column excluded.
// NOTE(review): the unchecked cast assumes every remaining cell is already a
// Float — callers must convert numeric columns first (see `.toFloat()` below).
fun extractX(): Array<FloatArray> =
dataframe.remove(yColumn).rows()
.map { (it.values() as List<Float>).toFloatArray() }.toTypedArray()
// Label column as a flat FloatArray
fun extractY(): FloatArray =
dataframe.get { yColumn<Float>() }.toList().toFloatArray()
return create(
::extractX,
::extractY
)
}
// Convert Double columns to Float (the converter above expects Float cells),
// wrap everything as an OnHeapDataset, and hold out 20% of rows for testing
val (train, test) = df.convert { colsOf<Double>() }.toFloat()
.toOnHeapDataset(labelColumnName = "quality")
.split(0.8)
Define a simple neural network with two hidden dense layers and a single-neuron linear output layer.
// Input width = number of feature columns in the training matrix
val inputNeurons = train.x[0].size.toLong()
val model = Sequential.of(
Input(
inputNeurons,
),
// First hidden layer: 10 neurons per input feature
Dense(
outputSize = (inputNeurons * 10).toInt(),
activation = Activations.Tanh,
kernelInitializer = HeNormal(),
biasInitializer = HeNormal(),
),
// Second hidden layer, same width
Dense(
outputSize = (inputNeurons * 10).toInt(),
activation = Activations.Tanh,
kernelInitializer = HeNormal(),
biasInitializer = HeNormal(),
),
// Single linear output neuron — this is a regression on the quality score
Dense(
outputSize = 1,
activation = Activations.Linear,
kernelInitializer = HeNormal(),
biasInitializer = HeNormal(),
)
)
// Regression setup: minimize mean squared error, monitor mean absolute error
model.compile(optimizer = Adam(), loss = Losses.MSE, metric = Metrics.MAE)
model.printSummary()
============================================================================== Model type: Sequential ______________________________________________________________________________ Layer (type) Output Shape Param # ============================================================================== input_1(Input) [None, 8] 0 ______________________________________________________________________________ dense_2(Dense) [None, 80] 720 ______________________________________________________________________________ dense_3(Dense) [None, 80] 6480 ______________________________________________________________________________ dense_4(Dense) [None, 1] 81 ______________________________________________________________________________ ============================================================================== Total trainable params: 7281 Total frozen params: 0 Total params: 7281 ______________________________________________________________________________
Train it!
// 2000 epochs over mini-batches of 500 rows
val trainHist = model.fit(train, batchSize = 500, epochs=2000)
// Show loss/metric values for the final epochs
trainHist.epochHistory.toDataFrame().tail()
epochIndex | lossValue | metricValues | valLossValue | valMetricValues |
---|---|---|---|---|
1996 | 0,334449 | [0.4508064091205597] | NaN | [NaN] |
1997 | 0,334411 | [0.4507730007171631] | NaN | [NaN] |
1998 | 0,334372 | [0.45073962211608887] | NaN | [NaN] |
1999 | 0,334334 | [0.4507063925266266] | NaN | [NaN] |
2000 | 0,334295 | [0.4506729543209076] | NaN | [NaN] |
Let's check that our network predicts values more or less correctly:
// Predicted quality for one held-out sample
model.predictSoftly(test.x[9])[0]
5.2483034
// Ground-truth quality for the same sample
test.y[9]
5.0
Close the model:
// Release the native resources held by the model
model.close()
// The four frames produced by trainTestSplit: feature (X) and label (Y)
// frames for both the train and the test partition.
data class TrainTestSplitResult<T>(
val trainX: DataFrame<T>,
val trainY: DataFrame<T>,
val testX: DataFrame<T>,
val testY: DataFrame<T>,
)
/**
 * Randomly shuffles the rows of [d] and splits them into train and test parts,
 * separating column [col] out as the label (Y) frame on each side.
 * [trainPart] is the fraction of rows (0..1) that goes into the training split,
 * rounded up to a whole row count.
 */
fun <T> trainTestSplit(
    d: DataFrame<T>,
    col: String,
    trainPart: Double,
): TrainTestSplitResult<T> {
    val rowCount = d.count()
    val trainSize = ceil(rowCount * trainPart).toInt()
    // Shuffle row indices so the split is random rather than positional
    val shuffledIndices = (0 until rowCount).shuffled()
    val trainFrame = d[shuffledIndices.take(trainSize)]
    val testFrame = d[shuffledIndices.drop(trainSize)]
    return TrainTestSplitResult(
        trainX = trainFrame.select { all().except(cols(col)) },
        trainY = trainFrame.select(col),
        testX = testFrame.select { all().except(cols(col)) },
        testY = testFrame.select(col),
    )
}
Let's create and then train the model as we did before
// 80/20 random split with "quality" as the label column
val (trainX, trainY, testX, testY) =
trainTestSplit(df, "quality", 0.8)
// Packs every numeric column of each row into a single FloatArray (merged
// under a temporary "X" column), then collects the rows into the
// Array<FloatArray> shape KotlinDL expects for features.
fun <T> DataFrame<T>.toX(): Array<FloatArray> =
merge { colsOf<Number>() }.by { it.map { it.toFloat() }.toFloatArray() }.into("X")
.get { "X"<FloatArray>() }
.toList()
.toTypedArray()
/**
 * Extracts the label column as a FloatArray for KotlinDL.
 *
 * The column name was previously hard-coded to "quality"; it is now a
 * parameter defaulting to "quality", so existing call sites are unaffected
 * while any Int-typed label column can be used.
 */
fun <T> DataFrame<T>.toY(labelColumnName: String = "quality"): FloatArray =
get { labelColumnName<Int>() }
.asIterable()
.map { it.toFloat() }
.toFloatArray()
// Convert each split to KotlinDL's array representations
val trainXDL = trainX.toX()
val trainYDL = trainY.toY()
val testXDL = testX.toX()
val testYDL = testY.toY()
// Wrap the arrays in OnHeapDataset instances for fitting/evaluation
val trainKotlinDLDataset = OnHeapDataset.create({ trainXDL }, { trainYDL })
val testKotlinDLDataset = OnHeapDataset.create({ testXDL }, { testYDL })
// Input width = number of feature columns. Derive it from trainXDL — the
// matrix this model is actually trained on — rather than from the first
// pipeline's `train` dataset, which this cell previously (and confusingly)
// reached back to.
val inputNeurons = trainXDL[0].size.toLong()
val model2 = Sequential.of(
Input(
inputNeurons
),
// First hidden layer: 10 neurons per input feature
Dense(
outputSize = (inputNeurons * 10).toInt(),
activation = Activations.Tanh,
kernelInitializer = HeNormal(),
biasInitializer = HeNormal()
),
// Second hidden layer, same width
Dense(
outputSize = (inputNeurons * 10).toInt(),
activation = Activations.Tanh,
kernelInitializer = HeNormal(),
biasInitializer = HeNormal()
),
// Single linear output neuron — regression on the quality score
Dense(
outputSize = 1,
activation = Activations.Linear,
kernelInitializer = HeNormal(),
biasInitializer = HeNormal()
)
)
model2.compile(optimizer = Adam(), loss = Losses.MSE, metric = Metrics.MAE)
model2.printSummary()
============================================================================== Model type: Sequential ______________________________________________________________________________ Layer (type) Output Shape Param # ============================================================================== input_1(Input) [None, 8] 0 ______________________________________________________________________________ dense_2(Dense) [None, 80] 720 ______________________________________________________________________________ dense_3(Dense) [None, 80] 6480 ______________________________________________________________________________ dense_4(Dense) [None, 1] 81 ______________________________________________________________________________ ============================================================================== Total trainable params: 7281 Total frozen params: 0 Total params: 7281 ______________________________________________________________________________
// BUG FIX: the original passed `train` (the dataset from the FIRST pipeline)
// to model2.fit, so trainKotlinDLDataset — built from trainTestSplit above —
// was created but never used. Train on the correct dataset.
val trainHist = model2.fit(trainKotlinDLDataset, batchSize = 500, epochs = 2000)
trainHist.epochHistory.toDataFrame().tail()
epochIndex | lossValue | metricValues | valLossValue | valMetricValues |
---|---|---|---|---|
1996 | 0,334634 | [0.4510577917098999] | NaN | [NaN] |
1997 | 0,334596 | [0.4510258734226227] | NaN | [NaN] |
1998 | 0,334558 | [0.4509936571121216] | NaN | [NaN] |
1999 | 0,334520 | [0.45096150040626526] | NaN | [NaN] |
2000 | 0,334482 | [0.4509289562702179] | NaN | [NaN] |
// Predicted quality for one held-out sample
model2.predictSoftly(testXDL[9])[0]
5.874521
// Ground-truth quality for the same sample
testYDL[9]
6.0
We can also compare predicted and ground truth values to ensure predictions are correct
// Round each soft prediction to the nearest integer quality score
val predicted = testXDL.indices.map { idx ->
    round(model2.predictSoftly(testXDL[idx])[0]).toInt()
}.toColumn("predicted")
// True labels, converted back from Float to Int
val ground_truth = testYDL.map { label ->
    label.toInt()
}.toColumn("ground_truth")
// Side-by-side frame of predictions vs. truth
val predDf = dataFrameOf(predicted, ground_truth)
predDf.head()
predicted | ground_truth |
---|---|
5 | 6 |
5 | 5 |
6 | 7 |
6 | 6 |
6 | 5 |
// Confusion-matrix-style crosstab: one row per true quality value, one
// column per predicted value, cells counting co-occurrences.
// (Removed the unused `val inds = List(10) { it + 1 }` — dead code, never
// referenced anywhere in the rest of the notebook.)
val ctab = predDf
.groupBy { ground_truth }.pivotCounts(inward = false) { predicted }
.sortBy { ground_truth }
// Color each count cell: the closer the predicted value (column name) is to
// the true value (row), the greener the background.
ctab.format { drop(1) }.perRowCol { row, col ->
val y = col.name().toInt()
val x = row.ground_truth
val k = 1.0 - abs(x - y) / 10.0
background(RGBColor(50, (50 + k * 200).toInt().toShort(), 50))
}
ground_truth | 5 | 6 | 7 | 4 |
---|---|---|---|---|
3 | 1 | 0 | 0 | 2 |
4 | 6 | 1 | 0 | 2 |
5 | 99 | 30 | 0 | 0 |
6 | 29 | 90 | 10 | 0 |
7 | 0 | 29 | 15 | 0 |
8 | 0 | 1 | 4 | 0 |
// Per-row absolute deviation between prediction and ground truth
val predDf2 = predDf.add("avg_dev") { abs(predicted - ground_truth) }
// Summary statistics of the deviation column
predDf2.avg_dev.cast<Double>().describe()
name | type | count | unique | nulls | top | freq | mean | std | min | median | max |
---|---|---|---|---|---|---|---|---|---|---|---|
avg_dev | Int | 319 | 3 | 0 | 0 | 206 | 0,363636 | 0,501000 | 0 | 0 | 2 |
// 70th-percentile deviation: sort by avg_dev and index 70% of the way in.
// Derive the row count from the frame instead of the hard-coded 319, so the
// cell stays correct if the split produces a different test size.
predDf2.sortBy { avg_dev }[(0.7 * (predDf2.count() - 1)).toInt()]
predicted | ground_truth | avg_dev |
---|---|---|
6 | 7 | 1 |
// Release the native resources held by the second model
model2.close()