Use the latest versions of the DataFrame and KotlinDL libraries from the version repository.
%use dataframe
%use kotlin-dl
Read the dataframe from a CSV file and print its first few lines.
// Load the red-wine quality dataset; the file is semicolon-delimited.
val rawDf = DataFrame.readCsv(
    fileOrUrl = "winequality-red.csv",
    delimiter = ';',
)
// Preview the first rows to sanity-check the import.
rawDf.head()
fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality |
---|---|---|---|---|---|---|---|---|---|---|---|
7,400000 | 0,700000 | 0,000000 | 1,900000 | 0,076000 | 11,000000 | 34,000000 | 0,997800 | 3,510000 | 0,560000 | 9,400000 | 5 |
7,800000 | 0,880000 | 0,000000 | 2,600000 | 0,098000 | 25,000000 | 67,000000 | 0,996800 | 3,200000 | 0,680000 | 9,800000 | 5 |
7,800000 | 0,760000 | 0,040000 | 2,300000 | 0,092000 | 15,000000 | 54,000000 | 0,997000 | 3,260000 | 0,650000 | 9,800000 | 5 |
11,200000 | 0,280000 | 0,560000 | 1,900000 | 0,075000 | 17,000000 | 60,000000 | 0,998000 | 3,160000 | 0,580000 | 9,800000 | 6 |
7,400000 | 0,700000 | 0,000000 | 1,900000 | 0,076000 | 11,000000 | 34,000000 | 0,997800 | 3,510000 | 0,560000 | 9,400000 | 5 |
Note: For formatting, the DataFrame needs to be rendered as HTML. This means that when running in Kotlin Notebook, "Render DataFrame tables natively" needs to be turned off.
// Render the correlation matrix, colouring each Double cell on a gradient
// from red (correlation -1) to green (correlation +1).
rawDf.corr()
    .format { colsOf<Double>() }
    .with { linearBg(value = it, from = -1.0 to red, to = 1.0 to green) }
column | fixed acidity | volatile acidity | residual sugar | chlorides | density | pH | sulphates | alcohol | quality |
---|---|---|---|---|---|---|---|---|---|
fixed acidity | 1,000000 | -0,256131 | 0,114777 | 0,093705 | 0,668047 | -0,682978 | 0,183006 | -0,061668 | 0,124052 |
volatile acidity | -0,256131 | 1,000000 | 0,001918 | 0,061298 | 0,022026 | 0,234937 | -0,260987 | -0,202288 | -0,390558 |
residual sugar | 0,114777 | 0,001918 | 1,000000 | 0,055610 | 0,355283 | -0,085652 | 0,005527 | 0,042075 | 0,013732 |
chlorides | 0,093705 | 0,061298 | 0,055610 | 1,000000 | 0,200632 | -0,265026 | 0,371260 | -0,221141 | -0,128907 |
density | 0,668047 | 0,022026 | 0,355283 | 0,200632 | 1,000000 | -0,341699 | 0,148506 | -0,496180 | -0,174919 |
pH | -0,682978 | 0,234937 | -0,085652 | -0,265026 | -0,341699 | 1,000000 | -0,196648 | 0,205633 | -0,057731 |
sulphates | 0,183006 | -0,260987 | 0,005527 | 0,371260 | 0,148506 | -0,196648 | 1,000000 | 0,093595 | 0,251397 |
alcohol | -0,061668 | -0,202288 | 0,042075 | -0,221141 | -0,496180 | 0,205633 | 0,093595 | 1,000000 | 0,476166 |
quality | 0,124052 | -0,390558 | 0,013732 | -0,128907 | -0,174919 | -0,057731 | 0,251397 | 0,476166 | 1,000000 |
Based on the correlation matrix, we can remove some columns that appear to be insignificant.
// Drop the weakly correlated features identified above.
val df = rawDf.remove {
    `free sulfur dioxide` and `residual sugar` and pH
}
df
fixed acidity | volatile acidity | citric acid | chlorides | total sulfur dioxide | density | sulphates | alcohol | quality |
---|---|---|---|---|---|---|---|---|
7,400000 | 0,700000 | 0,000000 | 0,076000 | 34,000000 | 0,997800 | 0,560000 | 9,400000 | 5 |
7,800000 | 0,880000 | 0,000000 | 0,098000 | 67,000000 | 0,996800 | 0,680000 | 9,800000 | 5 |
7,800000 | 0,760000 | 0,040000 | 0,092000 | 54,000000 | 0,997000 | 0,650000 | 9,800000 | 5 |
11,200000 | 0,280000 | 0,560000 | 0,075000 | 60,000000 | 0,998000 | 0,580000 | 9,800000 | 6 |
7,400000 | 0,700000 | 0,000000 | 0,076000 | 34,000000 | 0,997800 | 0,560000 | 9,400000 | 5 |
7,400000 | 0,660000 | 0,000000 | 0,075000 | 40,000000 | 0,997800 | 0,560000 | 9,400000 | 5 |
7,900000 | 0,600000 | 0,060000 | 0,069000 | 59,000000 | 0,996400 | 0,460000 | 9,400000 | 5 |
7,300000 | 0,650000 | 0,000000 | 0,065000 | 21,000000 | 0,994600 | 0,470000 | 10,000000 | 7 |
7,800000 | 0,580000 | 0,020000 | 0,073000 | 18,000000 | 0,996800 | 0,570000 | 9,500000 | 7 |
7,500000 | 0,500000 | 0,360000 | 0,071000 | 102,000000 | 0,997800 | 0,800000 | 10,500000 | 5 |
6,700000 | 0,580000 | 0,080000 | 0,097000 | 65,000000 | 0,995900 | 0,540000 | 9,200000 | 5 |
7,500000 | 0,500000 | 0,360000 | 0,071000 | 102,000000 | 0,997800 | 0,800000 | 10,500000 | 5 |
5,600000 | 0,615000 | 0,000000 | 0,089000 | 59,000000 | 0,994300 | 0,520000 | 9,900000 | 5 |
7,800000 | 0,610000 | 0,290000 | 0,114000 | 29,000000 | 0,997400 | 1,560000 | 9,100000 | 5 |
8,900000 | 0,620000 | 0,180000 | 0,176000 | 145,000000 | 0,998600 | 0,880000 | 9,200000 | 5 |
8,900000 | 0,620000 | 0,190000 | 0,170000 | 148,000000 | 0,998600 | 0,930000 | 9,200000 | 5 |
8,500000 | 0,280000 | 0,560000 | 0,092000 | 103,000000 | 0,996900 | 0,750000 | 10,500000 | 7 |
8,100000 | 0,560000 | 0,280000 | 0,368000 | 56,000000 | 0,996800 | 1,280000 | 9,300000 | 5 |
7,400000 | 0,590000 | 0,080000 | 0,086000 | 29,000000 | 0,997400 | 0,500000 | 9,000000 | 4 |
7,900000 | 0,320000 | 0,510000 | 0,341000 | 56,000000 | 0,996900 | 1,080000 | 9,200000 | 6 |
/**
 * Simple converter between the DataFrame and KotlinDL data representations.
 * Uses [labelColumnName] as the label; all remaining columns become features.
 */
fun <T> DataFrame<T>.toOnHeapDataset(labelColumnName: String): OnHeapDataset =
    OnHeapDataset.create(dataframe = this, yColumn = labelColumnName)
/**
 * Builds an [OnHeapDataset] from [dataframe]: every column except [yColumn]
 * becomes a feature, and [yColumn] becomes the label vector.
 */
fun OnHeapDataset.Companion.create(
    dataframe: DataFrame<Any?>,
    yColumn: String
): OnHeapDataset {
    // Feature matrix: one FloatArray per row, label column excluded.
    // Converting each value via Number replaces the original unchecked cast to
    // List<Float>, so Double/Int columns also work without a prior conversion step.
    fun extractX(): Array<FloatArray> =
        dataframe.remove(yColumn).rows()
            .map { row -> row.values().map { (it as Number).toFloat() }.toFloatArray() }
            .toTypedArray()

    // Label vector as a FloatArray.
    fun extractY(): FloatArray =
        dataframe.get { yColumn<Float>() }.toList().toFloatArray()

    return create(
        ::extractX,
        ::extractY
    )
}
// Convert every Double column to Float, build the KotlinDL dataset with
// "quality" as the label, and hold out 20% of the rows for testing.
val (train, test) = df
    .convert { colsOf<Double>() }.toFloat()
    .toOnHeapDataset(labelColumnName = "quality")
    .split(0.8)
Define a simple neural network with two hidden dense layers and a single linear output neuron.
// Network input width = number of feature columns in the training data.
val inputNeurons = train.x[0].size.toLong()

// Two hidden Tanh layers (10x the input width) feeding one linear output neuron;
// He-normal initialization for all weights and biases.
val model = Sequential.of(
    Input(inputNeurons),
    Dense(outputSize = (inputNeurons * 10).toInt(), activation = Activations.Tanh, kernelInitializer = HeNormal(), biasInitializer = HeNormal()),
    Dense(outputSize = (inputNeurons * 10).toInt(), activation = Activations.Tanh, kernelInitializer = HeNormal(), biasInitializer = HeNormal()),
    Dense(outputSize = 1, activation = Activations.Linear, kernelInitializer = HeNormal(), biasInitializer = HeNormal()),
)
java.lang.UnsatisfiedLinkError: Cannot find TensorFlow native library for OS: darwin, architecture: aarch64. See https://github.com/tensorflow/tensorflow/tree/master/tensorflow/java/README.md for possible solutions (such as building the library from source). Additional information on attempts to find the native library can be obtained by adding org.tensorflow.NativeLibrary.DEBUG=1 to the system properties of the JVM. at org.tensorflow.NativeLibrary.load(NativeLibrary.java:77) at org.tensorflow.TensorFlow.init(TensorFlow.java:67) at org.tensorflow.TensorFlow.<clinit>(TensorFlow.java:82) at org.tensorflow.Graph.<clinit>(Graph.java:479) at org.jetbrains.kotlinx.dl.api.core.GraphTrainableModel.<init>(GraphTrainableModel.kt:113) at org.jetbrains.kotlinx.dl.api.core.Sequential.<init>(Sequential.kt:26) at org.jetbrains.kotlinx.dl.api.core.Sequential$Companion.of(Sequential.kt:45) at org.jetbrains.kotlinx.dl.api.core.Sequential$Companion.of$default(Sequential.kt:39) at Line_25_jupyter.<init>(Line_25.jupyter.kts:3) at Cell In[8], line 3 at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:490) at kotlin.script.experimental.jvm.BasicJvmScriptEvaluator.evalWithConfigAndOtherScriptsResults(BasicJvmScriptEvaluator.kt:122) at kotlin.script.experimental.jvm.BasicJvmScriptEvaluator.invoke$suspendImpl(BasicJvmScriptEvaluator.kt:48) at kotlin.script.experimental.jvm.BasicJvmScriptEvaluator.invoke(BasicJvmScriptEvaluator.kt) at kotlin.script.experimental.jvm.BasicJvmReplEvaluator.eval(BasicJvmReplEvaluator.kt:49) at 
org.jetbrains.kotlinx.jupyter.repl.impl.InternalEvaluatorImpl$eval$resultWithDiagnostics$1.invokeSuspend(InternalEvaluatorImpl.kt:137) at kotlin.coroutines.jvm.internal.BaseContinuationImpl.resumeWith(ContinuationImpl.kt:33) at kotlinx.coroutines.DispatchedTask.run(DispatchedTask.kt:104) at kotlinx.coroutines.EventLoopImplBase.processNextEvent(EventLoop.common.kt:277) at kotlinx.coroutines.BlockingCoroutine.joinBlocking(Builders.kt:95) at kotlinx.coroutines.BuildersKt__BuildersKt.runBlocking(Builders.kt:69) at kotlinx.coroutines.BuildersKt.runBlocking(Unknown Source) at kotlinx.coroutines.BuildersKt__BuildersKt.runBlocking$default(Builders.kt:48) at kotlinx.coroutines.BuildersKt.runBlocking$default(Unknown Source) at org.jetbrains.kotlinx.jupyter.repl.impl.InternalEvaluatorImpl.eval(InternalEvaluatorImpl.kt:137) at org.jetbrains.kotlinx.jupyter.repl.impl.CellExecutorImpl$execute$1$result$1.invoke(CellExecutorImpl.kt:80) at org.jetbrains.kotlinx.jupyter.repl.impl.CellExecutorImpl$execute$1$result$1.invoke(CellExecutorImpl.kt:78) at org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl.withHost(ReplForJupyterImpl.kt:774) at org.jetbrains.kotlinx.jupyter.repl.impl.CellExecutorImpl.execute-L4Nmkdk(CellExecutorImpl.kt:78) at org.jetbrains.kotlinx.jupyter.repl.execution.CellExecutor$DefaultImpls.execute-L4Nmkdk$default(CellExecutor.kt:13) at org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl.evaluateUserCode-wNURfNM(ReplForJupyterImpl.kt:596) at org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl.evalExImpl(ReplForJupyterImpl.kt:454) at org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl.access$evalExImpl(ReplForJupyterImpl.kt:141) at org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl$evalEx$1.invoke(ReplForJupyterImpl.kt:447) at org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl$evalEx$1.invoke(ReplForJupyterImpl.kt:446) at org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl.withEvalContext(ReplForJupyterImpl.kt:427) at 
org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl.evalEx(ReplForJupyterImpl.kt:446) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor$processExecuteRequest$1$response$1$1.invoke(IdeCompatibleMessageRequestProcessor.kt:171) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor$processExecuteRequest$1$response$1$1.invoke(IdeCompatibleMessageRequestProcessor.kt:170) at org.jetbrains.kotlinx.jupyter.streams.BlockingSubstitutionEngine.withDataSubstitution(SubstitutionEngine.kt:70) at org.jetbrains.kotlinx.jupyter.streams.StreamSubstitutionManager.withSubstitutedStreams(StreamSubstitutionManager.kt:118) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor.withForkedIn(IdeCompatibleMessageRequestProcessor.kt:347) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor.access$withForkedIn(IdeCompatibleMessageRequestProcessor.kt:67) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor$evalWithIO$1$1.invoke(IdeCompatibleMessageRequestProcessor.kt:361) at org.jetbrains.kotlinx.jupyter.streams.BlockingSubstitutionEngine.withDataSubstitution(SubstitutionEngine.kt:70) at org.jetbrains.kotlinx.jupyter.streams.StreamSubstitutionManager.withSubstitutedStreams(StreamSubstitutionManager.kt:118) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor.withForkedErr(IdeCompatibleMessageRequestProcessor.kt:336) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor.access$withForkedErr(IdeCompatibleMessageRequestProcessor.kt:67) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor$evalWithIO$1.invoke(IdeCompatibleMessageRequestProcessor.kt:360) at org.jetbrains.kotlinx.jupyter.streams.BlockingSubstitutionEngine.withDataSubstitution(SubstitutionEngine.kt:70) at 
org.jetbrains.kotlinx.jupyter.streams.StreamSubstitutionManager.withSubstitutedStreams(StreamSubstitutionManager.kt:118) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor.withForkedOut(IdeCompatibleMessageRequestProcessor.kt:328) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor.evalWithIO(IdeCompatibleMessageRequestProcessor.kt:359) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor$processExecuteRequest$1$response$1.invoke(IdeCompatibleMessageRequestProcessor.kt:170) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor$processExecuteRequest$1$response$1.invoke(IdeCompatibleMessageRequestProcessor.kt:169) at org.jetbrains.kotlinx.jupyter.execution.JupyterExecutorImpl$Task.execute(JupyterExecutorImpl.kt:41) at org.jetbrains.kotlinx.jupyter.execution.JupyterExecutorImpl$executorThread$1.invoke(JupyterExecutorImpl.kt:81) at org.jetbrains.kotlinx.jupyter.execution.JupyterExecutorImpl$executorThread$1.invoke(JupyterExecutorImpl.kt:79) at kotlin.concurrent.ThreadsKt$thread$thread$1.run(Thread.kt:30) java.lang.UnsatisfiedLinkError: Cannot find TensorFlow native library for OS: darwin, architecture: aarch64. See https://github.com/tensorflow/tensorflow/tree/master/tensorflow/java/README.md for possible solutions (such as building the library from source). Additional information on attempts to find the native library can be obtained by adding org.tensorflow.NativeLibrary.DEBUG=1 to the system properties of the JVM. at Cell In[8], line 3
// Regression setup: Adam optimizer, MSE loss, MAE as the reported metric.
model.compile(
    optimizer = Adam(),
    loss = Losses.MSE,
    metric = Metrics.MAE,
)
model.printSummary()
Train it!
// Fit on the training split; the dataset is small, so large batches and many epochs are cheap.
val trainHist = model.fit(train, batchSize = 500, epochs=2000)
// Show the metrics of the last few epochs.
trainHist.epochHistory.toDataFrame().tail()
Let's check that our network predicts values more or less correctly:
// Predict the quality of one held-out sample...
model.predictSoftly(test.x[9])[0]
// ...and compare it with the ground-truth label.
test.y[9]
Close the model:
// Release the native TensorFlow resources held by the model.
model.close()
/**
 * Holds the four pieces of a train/test split: feature and label frames
 * for both the training and the test partitions.
 */
data class TrainTestSplitResult<T>(
val trainX: DataFrame<T>,
val trainY: DataFrame<T>,
val testX: DataFrame<T>,
val testY: DataFrame<T>,
)
/**
 * Randomly shuffles [d] and splits it into train/test partitions,
 * separating the label column from the features in each partition.
 *
 * @param d the frame to split
 * @param col name of the label column
 * @param trainPart fraction of rows (in [0, 1]) that go to the training partition
 * @throws IllegalArgumentException if [trainPart] is outside [0, 1]
 */
fun <T> trainTestSplit(
    d: DataFrame<T>,
    col: String,
    trainPart: Double,
): TrainTestSplitResult<T> {
    require(trainPart in 0.0..1.0) { "trainPart must be in [0, 1], was $trainPart" }
    val n = d.count()
    val trainN = ceil(n * trainPart).toInt()

    // Shuffle row indices so the split is random rather than positional.
    val shuffledInd = (0 until n).shuffled()
    val trainInd = shuffledInd.subList(0, trainN)
    val testInd = shuffledInd.subList(trainN, n)

    val train = d[trainInd]
    val test = d[testInd]

    val trainX = train.select { all().except(cols(col)) }
    val trainY = train.select(col)
    val testX = test.select { all().except(cols(col)) }
    val testY = test.select(col)
    return TrainTestSplitResult(trainX, trainY, testX, testY)
}
Let's create and then train the model as we did before
// Split 80% train / 20% test, separating the "quality" label from the features.
val (trainX, trainY, testX, testY) =
trainTestSplit(df, "quality", 0.8)
/**
 * Flattens all numeric columns of this frame into one FloatArray per row.
 * Named lambda parameters replace the original shadowed nested `it`s.
 */
fun <T> DataFrame<T>.toX(): Array<FloatArray> =
    // Merge every numeric column into a single FloatArray column "X",
    // then materialize that column as Array<FloatArray>.
    merge { colsOf<Number>() }.by { rowValues -> rowValues.map { v -> v.toFloat() }.toFloatArray() }.into("X")
        .get { "X"<FloatArray>() }
        .toList()
        .toTypedArray()
/**
 * Extracts the label column of this frame as a FloatArray.
 *
 * @param labelColumn name of the Int label column; defaults to "quality"
 *   so existing no-argument call sites keep working.
 */
fun <T> DataFrame<T>.toY(labelColumn: String = "quality"): FloatArray =
    get { labelColumn<Int>() }
        .asIterable()
        .map { it.toFloat() }
        .toFloatArray()
// Convert the four frames to KotlinDL-friendly arrays...
val trainXDL = trainX.toX()
val trainYDL = trainY.toY()
val testXDL = testX.toX()
val testYDL = testY.toY()
// ...and wrap them in on-heap datasets.
val trainKotlinDLDataset = OnHeapDataset.create({ trainXDL }, { trainYDL })
val testKotlinDLDataset = OnHeapDataset.create({ testXDL }, { testYDL })
// Input width taken from the manually split training data. The original read
// `train.x[0]` — the dataset of the PREVIOUS section — which silently coupled
// this cell to earlier state; trainXDL is the data this model actually uses.
val inputNeurons = trainXDL[0].size.toLong()

// Same architecture as before: two hidden Tanh layers (10x input width)
// and one linear output neuron, He-normal initialization throughout.
val model2 = Sequential.of(
    Input(
        inputNeurons
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    ),
    Dense(
        outputSize = 1,
        activation = Activations.Linear,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    )
)
model2.compile(optimizer = Adam(), loss = Losses.MSE, metric = Metrics.MAE)
model2.printSummary()
// Train on the dataset built from the manual split. The original called
// `model2.fit(train, ...)`, reusing the previous section's dataset and leaving
// trainKotlinDLDataset entirely unused — clearly not the intent of this section.
val trainHist = model2.fit(trainKotlinDLDataset, batchSize = 500, epochs = 2000)
trainHist.epochHistory.toDataFrame().tail()
// Spot-check one held-out sample: prediction vs. ground truth.
model2.predictSoftly(testXDL[9])[0]
testYDL[9]
We can also compare predicted and ground truth values to ensure predictions are correct
// Round every prediction on the test set to the nearest integer quality score.
val predicted = testXDL.indices.map { i ->
    round(model2.predictSoftly(testXDL[i])[0]).toInt()
}.toColumn("predicted")
// Ground-truth labels as integers, for side-by-side comparison.
val ground_truth = testYDL.map { label ->
    label.toInt()
}.toColumn("ground_truth")
val predDf = dataFrameOf(predicted, ground_truth)
predDf.head()
// NOTE(review): `inds` appears unused in the visible notebook — candidate for removal.
val inds = List(10) { it + 1 }
// Cross-tabulate ground truth vs. predicted quality (a confusion matrix).
val ctab = predDf
.groupBy { ground_truth }.pivotCounts(inward = false) { predicted }
.sortBy { ground_truth }
// Colour each count cell: the closer the predicted score (column name) is to the
// true score (row), the greener the background; k scales the green channel.
ctab.format { drop(1) }.perRowCol { row, col ->
val y = col.name().toInt()
val x = row.ground_truth
val k = 1.0 - abs(x - y) / 10.0
background(RGBColor(50, (50 + k * 200).toInt().toShort(), 50))
}
// Per-row absolute deviation between prediction and ground truth.
val predDf2 = predDf.add("avg_dev") { abs(predicted - ground_truth) }
predDf2.avg_dev.cast<Double>().describe()
// 70th-percentile deviation. Derive the row count from the frame itself rather
// than the hard-coded test-set size (319), so this survives a different split
// fraction or dataset.
predDf2.sortBy { avg_dev }[(0.7 * (predDf2.count() - 1)).toInt()]
model2.close()