Use the latest versions of the DataFrame and KotlinDL libraries from the version repository.
%use dataframe
%use kotlin-dl
Read the dataframe from a CSV file and print its first few lines.
// Load the red-wine quality dataset; the file is semicolon-delimited.
val rawDf = DataFrame.readCsv(
    fileOrUrl = "winequality-red.csv",
    delimiter = ';',
)
// Preview the first rows to sanity-check the import.
rawDf.head()
fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality |
---|---|---|---|---|---|---|---|---|---|---|---|
7,400000 | 0,700000 | 0,000000 | 1,900000 | 0,076000 | 11,000000 | 34,000000 | 0,997800 | 3,510000 | 0,560000 | 9,400000 | 5 |
7,800000 | 0,880000 | 0,000000 | 2,600000 | 0,098000 | 25,000000 | 67,000000 | 0,996800 | 3,200000 | 0,680000 | 9,800000 | 5 |
7,800000 | 0,760000 | 0,040000 | 2,300000 | 0,092000 | 15,000000 | 54,000000 | 0,997000 | 3,260000 | 0,650000 | 9,800000 | 5 |
11,200000 | 0,280000 | 0,560000 | 1,900000 | 0,075000 | 17,000000 | 60,000000 | 0,998000 | 3,160000 | 0,580000 | 9,800000 | 6 |
7,400000 | 0,700000 | 0,000000 | 1,900000 | 0,076000 | 11,000000 | 34,000000 | 0,997800 | 3,510000 | 0,560000 | 9,400000 | 5 |
Note: For formatting, the DataFrame needs to be rendered as HTML. This means that when running in Kotlin Notebook, "Render DataFrame tables natively" needs to be turned off.
// Render the correlation matrix, colouring each Double cell on a gradient
// from red (correlation -1) to green (correlation +1).
rawDf.corr()
    .format { colsOf<Double>() }
    .with { linearBg(value = it, from = -1.0 to red, to = 1.0 to green) }
column | fixed acidity | volatile acidity | residual sugar | chlorides | density | pH | sulphates | alcohol | quality |
---|---|---|---|---|---|---|---|---|---|
fixed acidity | 1,000000 | -0,256131 | 0,114777 | 0,093705 | 0,668047 | -0,682978 | 0,183006 | -0,061668 | 0,124052 |
volatile acidity | -0,256131 | 1,000000 | 0,001918 | 0,061298 | 0,022026 | 0,234937 | -0,260987 | -0,202288 | -0,390558 |
residual sugar | 0,114777 | 0,001918 | 1,000000 | 0,055610 | 0,355283 | -0,085652 | 0,005527 | 0,042075 | 0,013732 |
chlorides | 0,093705 | 0,061298 | 0,055610 | 1,000000 | 0,200632 | -0,265026 | 0,371260 | -0,221141 | -0,128907 |
density | 0,668047 | 0,022026 | 0,355283 | 0,200632 | 1,000000 | -0,341699 | 0,148506 | -0,496180 | -0,174919 |
pH | -0,682978 | 0,234937 | -0,085652 | -0,265026 | -0,341699 | 1,000000 | -0,196648 | 0,205633 | -0,057731 |
sulphates | 0,183006 | -0,260987 | 0,005527 | 0,371260 | 0,148506 | -0,196648 | 1,000000 | 0,093595 | 0,251397 |
alcohol | -0,061668 | -0,202288 | 0,042075 | -0,221141 | -0,496180 | 0,205633 | 0,093595 | 1,000000 | 0,476166 |
quality | 0,124052 | -0,390558 | 0,013732 | -0,128907 | -0,174919 | -0,057731 | 0,251397 | 0,476166 | 1,000000 |
Based on the correlation matrix, we can remove some columns that appear to be insignificant.
// Drop the weakly correlated features identified above.
val df = rawDf.remove {
    `free sulfur dioxide` and `residual sugar` and pH
}
df
fixed acidity | volatile acidity | citric acid | chlorides | total sulfur dioxide | density | sulphates | alcohol | quality |
---|---|---|---|---|---|---|---|---|
7,400000 | 0,700000 | 0,000000 | 0,076000 | 34,000000 | 0,997800 | 0,560000 | 9,400000 | 5 |
7,800000 | 0,880000 | 0,000000 | 0,098000 | 67,000000 | 0,996800 | 0,680000 | 9,800000 | 5 |
7,800000 | 0,760000 | 0,040000 | 0,092000 | 54,000000 | 0,997000 | 0,650000 | 9,800000 | 5 |
11,200000 | 0,280000 | 0,560000 | 0,075000 | 60,000000 | 0,998000 | 0,580000 | 9,800000 | 6 |
7,400000 | 0,700000 | 0,000000 | 0,076000 | 34,000000 | 0,997800 | 0,560000 | 9,400000 | 5 |
7,400000 | 0,660000 | 0,000000 | 0,075000 | 40,000000 | 0,997800 | 0,560000 | 9,400000 | 5 |
7,900000 | 0,600000 | 0,060000 | 0,069000 | 59,000000 | 0,996400 | 0,460000 | 9,400000 | 5 |
7,300000 | 0,650000 | 0,000000 | 0,065000 | 21,000000 | 0,994600 | 0,470000 | 10,000000 | 7 |
7,800000 | 0,580000 | 0,020000 | 0,073000 | 18,000000 | 0,996800 | 0,570000 | 9,500000 | 7 |
7,500000 | 0,500000 | 0,360000 | 0,071000 | 102,000000 | 0,997800 | 0,800000 | 10,500000 | 5 |
6,700000 | 0,580000 | 0,080000 | 0,097000 | 65,000000 | 0,995900 | 0,540000 | 9,200000 | 5 |
7,500000 | 0,500000 | 0,360000 | 0,071000 | 102,000000 | 0,997800 | 0,800000 | 10,500000 | 5 |
5,600000 | 0,615000 | 0,000000 | 0,089000 | 59,000000 | 0,994300 | 0,520000 | 9,900000 | 5 |
7,800000 | 0,610000 | 0,290000 | 0,114000 | 29,000000 | 0,997400 | 1,560000 | 9,100000 | 5 |
8,900000 | 0,620000 | 0,180000 | 0,176000 | 145,000000 | 0,998600 | 0,880000 | 9,200000 | 5 |
8,900000 | 0,620000 | 0,190000 | 0,170000 | 148,000000 | 0,998600 | 0,930000 | 9,200000 | 5 |
8,500000 | 0,280000 | 0,560000 | 0,092000 | 103,000000 | 0,996900 | 0,750000 | 10,500000 | 7 |
8,100000 | 0,560000 | 0,280000 | 0,368000 | 56,000000 | 0,996800 | 1,280000 | 9,300000 | 5 |
7,400000 | 0,590000 | 0,080000 | 0,086000 | 29,000000 | 0,997400 | 0,500000 | 9,000000 | 4 |
7,900000 | 0,320000 | 0,510000 | 0,341000 | 56,000000 | 0,996900 | 1,080000 | 9,200000 | 6 |
/**
 * Simple converter between the DataFrame and KotlinDL data representations.
 * Uses [labelColumnName] as the label; all remaining columns become features.
 */
fun <T> DataFrame<T>.toOnHeapDataset(labelColumnName: String): OnHeapDataset =
    OnHeapDataset.create(dataframe = this, yColumn = labelColumnName)
/**
 * Builds an [OnHeapDataset] from [dataframe]: every column except [yColumn]
 * becomes a feature, and [yColumn] becomes the label vector.
 */
fun OnHeapDataset.Companion.create(
    dataframe: DataFrame<Any?>,
    yColumn: String
): OnHeapDataset {
    // Feature matrix: one FloatArray per row, label column excluded.
    // Converting each value via Number replaces the original unchecked cast to
    // List<Float>, so Double/Int columns also work without a prior conversion step.
    fun extractX(): Array<FloatArray> =
        dataframe.remove(yColumn).rows()
            .map { row -> row.values().map { (it as Number).toFloat() }.toFloatArray() }
            .toTypedArray()

    // Label vector as a FloatArray.
    fun extractY(): FloatArray =
        dataframe.get { yColumn<Float>() }.toList().toFloatArray()

    return create(
        ::extractX,
        ::extractY
    )
}
// Convert every Double column to Float, build the KotlinDL dataset with
// "quality" as the label, and hold out 20% of the rows for testing.
val (train, test) = df
    .convert { colsOf<Double>() }.toFloat()
    .toOnHeapDataset(labelColumnName = "quality")
    .split(0.8)
Define a simple neural network with two hidden dense layers and a single linear output neuron.
// Network input width = number of feature columns in the training data.
val inputNeurons = train.x[0].size.toLong()

// Two hidden Tanh layers (10x the input width) feeding one linear output neuron;
// He-normal initialization for all weights and biases.
val model = Sequential.of(
    Input(inputNeurons),
    Dense(outputSize = (inputNeurons * 10).toInt(), activation = Activations.Tanh, kernelInitializer = HeNormal(), biasInitializer = HeNormal()),
    Dense(outputSize = (inputNeurons * 10).toInt(), activation = Activations.Tanh, kernelInitializer = HeNormal(), biasInitializer = HeNormal()),
    Dense(outputSize = 1, activation = Activations.Linear, kernelInitializer = HeNormal(), biasInitializer = HeNormal()),
)
java.lang.UnsatisfiedLinkError: Cannot find TensorFlow native library for OS: darwin, architecture: aarch64. See https://github.com/tensorflow/tensorflow/tree/master/tensorflow/java/README.md for possible solutions (such as building the library from source). Additional information on attempts to find the native library can be obtained by adding org.tensorflow.NativeLibrary.DEBUG=1 to the system properties of the JVM. at org.tensorflow.NativeLibrary.load(NativeLibrary.java:77) at org.tensorflow.TensorFlow.init(TensorFlow.java:67) at org.tensorflow.TensorFlow.<clinit>(TensorFlow.java:82) at org.tensorflow.Graph.<clinit>(Graph.java:479) at org.jetbrains.kotlinx.dl.api.core.GraphTrainableModel.<init>(GraphTrainableModel.kt:113) at org.jetbrains.kotlinx.dl.api.core.Sequential.<init>(Sequential.kt:26) at org.jetbrains.kotlinx.dl.api.core.Sequential$Companion.of(Sequential.kt:45) at org.jetbrains.kotlinx.dl.api.core.Sequential$Companion.of$default(Sequential.kt:39) at Line_25_jupyter.<init>(Line_25.jupyter.kts:3) at Cell In[8], line 3 at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:490) at kotlin.script.experimental.jvm.BasicJvmScriptEvaluator.evalWithConfigAndOtherScriptsResults(BasicJvmScriptEvaluator.kt:122) at kotlin.script.experimental.jvm.BasicJvmScriptEvaluator.invoke$suspendImpl(BasicJvmScriptEvaluator.kt:48) at kotlin.script.experimental.jvm.BasicJvmScriptEvaluator.invoke(BasicJvmScriptEvaluator.kt) at kotlin.script.experimental.jvm.BasicJvmReplEvaluator.eval(BasicJvmReplEvaluator.kt:49) at 
org.jetbrains.kotlinx.jupyter.repl.impl.InternalEvaluatorImpl$eval$resultWithDiagnostics$1.invokeSuspend(InternalEvaluatorImpl.kt:137) at kotlin.coroutines.jvm.internal.BaseContinuationImpl.resumeWith(ContinuationImpl.kt:33) at kotlinx.coroutines.DispatchedTask.run(DispatchedTask.kt:104) at kotlinx.coroutines.EventLoopImplBase.processNextEvent(EventLoop.common.kt:277) at kotlinx.coroutines.BlockingCoroutine.joinBlocking(Builders.kt:95) at kotlinx.coroutines.BuildersKt__BuildersKt.runBlocking(Builders.kt:69) at kotlinx.coroutines.BuildersKt.runBlocking(Unknown Source) at kotlinx.coroutines.BuildersKt__BuildersKt.runBlocking$default(Builders.kt:48) at kotlinx.coroutines.BuildersKt.runBlocking$default(Unknown Source) at org.jetbrains.kotlinx.jupyter.repl.impl.InternalEvaluatorImpl.eval(InternalEvaluatorImpl.kt:137) at org.jetbrains.kotlinx.jupyter.repl.impl.CellExecutorImpl$execute$1$result$1.invoke(CellExecutorImpl.kt:80) at org.jetbrains.kotlinx.jupyter.repl.impl.CellExecutorImpl$execute$1$result$1.invoke(CellExecutorImpl.kt:78) at org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl.withHost(ReplForJupyterImpl.kt:774) at org.jetbrains.kotlinx.jupyter.repl.impl.CellExecutorImpl.execute-L4Nmkdk(CellExecutorImpl.kt:78) at org.jetbrains.kotlinx.jupyter.repl.execution.CellExecutor$DefaultImpls.execute-L4Nmkdk$default(CellExecutor.kt:13) at org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl.evaluateUserCode-wNURfNM(ReplForJupyterImpl.kt:596) at org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl.evalExImpl(ReplForJupyterImpl.kt:454) at org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl.access$evalExImpl(ReplForJupyterImpl.kt:141) at org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl$evalEx$1.invoke(ReplForJupyterImpl.kt:447) at org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl$evalEx$1.invoke(ReplForJupyterImpl.kt:446) at org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl.withEvalContext(ReplForJupyterImpl.kt:427) at 
org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl.evalEx(ReplForJupyterImpl.kt:446) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor$processExecuteRequest$1$response$1$1.invoke(IdeCompatibleMessageRequestProcessor.kt:171) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor$processExecuteRequest$1$response$1$1.invoke(IdeCompatibleMessageRequestProcessor.kt:170) at org.jetbrains.kotlinx.jupyter.streams.BlockingSubstitutionEngine.withDataSubstitution(SubstitutionEngine.kt:70) at org.jetbrains.kotlinx.jupyter.streams.StreamSubstitutionManager.withSubstitutedStreams(StreamSubstitutionManager.kt:118) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor.withForkedIn(IdeCompatibleMessageRequestProcessor.kt:347) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor.access$withForkedIn(IdeCompatibleMessageRequestProcessor.kt:67) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor$evalWithIO$1$1.invoke(IdeCompatibleMessageRequestProcessor.kt:361) at org.jetbrains.kotlinx.jupyter.streams.BlockingSubstitutionEngine.withDataSubstitution(SubstitutionEngine.kt:70) at org.jetbrains.kotlinx.jupyter.streams.StreamSubstitutionManager.withSubstitutedStreams(StreamSubstitutionManager.kt:118) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor.withForkedErr(IdeCompatibleMessageRequestProcessor.kt:336) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor.access$withForkedErr(IdeCompatibleMessageRequestProcessor.kt:67) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor$evalWithIO$1.invoke(IdeCompatibleMessageRequestProcessor.kt:360) at org.jetbrains.kotlinx.jupyter.streams.BlockingSubstitutionEngine.withDataSubstitution(SubstitutionEngine.kt:70) at 
org.jetbrains.kotlinx.jupyter.streams.StreamSubstitutionManager.withSubstitutedStreams(StreamSubstitutionManager.kt:118) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor.withForkedOut(IdeCompatibleMessageRequestProcessor.kt:328) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor.evalWithIO(IdeCompatibleMessageRequestProcessor.kt:359) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor$processExecuteRequest$1$response$1.invoke(IdeCompatibleMessageRequestProcessor.kt:170) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor$processExecuteRequest$1$response$1.invoke(IdeCompatibleMessageRequestProcessor.kt:169) at org.jetbrains.kotlinx.jupyter.execution.JupyterExecutorImpl$Task.execute(JupyterExecutorImpl.kt:41) at org.jetbrains.kotlinx.jupyter.execution.JupyterExecutorImpl$executorThread$1.invoke(JupyterExecutorImpl.kt:81) at org.jetbrains.kotlinx.jupyter.execution.JupyterExecutorImpl$executorThread$1.invoke(JupyterExecutorImpl.kt:79) at kotlin.concurrent.ThreadsKt$thread$thread$1.run(Thread.kt:30) java.lang.UnsatisfiedLinkError: Cannot find TensorFlow native library for OS: darwin, architecture: aarch64. See https://github.com/tensorflow/tensorflow/tree/master/tensorflow/java/README.md for possible solutions (such as building the library from source). Additional information on attempts to find the native library can be obtained by adding org.tensorflow.NativeLibrary.DEBUG=1 to the system properties of the JVM. at Cell In[8], line 3
// Regression setup: Adam optimizer, MSE loss, MAE as the reported metric.
model.compile(
    optimizer = Adam(),
    loss = Losses.MSE,
    metric = Metrics.MAE,
)
model.printSummary()
Train it!
// Fit on the training split; the dataset is small, so large batches and many epochs are cheap.
val trainHist = model.fit(train, batchSize = 500, epochs=2000)
// Show the metrics of the last few epochs.
trainHist.epochHistory.toDataFrame().tail()
Let's check that our network predicts values more or less correctly:
// Predict the quality of one held-out sample...
model.predictSoftly(test.x[9])[0]
// ...and compare it with the ground-truth label.
test.y[9]
Close the model:
// Release the native TensorFlow resources held by the model.
model.close()
/**
 * Holds the four pieces of a train/test split: feature and label frames
 * for both the training and the test partitions.
 */
data class TrainTestSplitResult<T>(
val trainX: DataFrame<T>,
val trainY: DataFrame<T>,
val testX: DataFrame<T>,
val testY: DataFrame<T>,
)
/**
 * Randomly shuffles [d] and splits it into train/test partitions,
 * separating the label column from the features in each partition.
 *
 * @param d the frame to split
 * @param col name of the label column
 * @param trainPart fraction of rows (in [0, 1]) that go to the training partition
 * @throws IllegalArgumentException if [trainPart] is outside [0, 1]
 */
fun <T> trainTestSplit(
    d: DataFrame<T>,
    col: String,
    trainPart: Double,
): TrainTestSplitResult<T> {
    require(trainPart in 0.0..1.0) { "trainPart must be in [0, 1], was $trainPart" }
    val n = d.count()
    val trainN = ceil(n * trainPart).toInt()

    // Shuffle row indices so the split is random rather than positional.
    val shuffledInd = (0 until n).shuffled()
    val trainInd = shuffledInd.subList(0, trainN)
    val testInd = shuffledInd.subList(trainN, n)

    val train = d[trainInd]
    val test = d[testInd]

    val trainX = train.select { all().except(cols(col)) }
    val trainY = train.select(col)
    val testX = test.select { all().except(cols(col)) }
    val testY = test.select(col)
    return TrainTestSplitResult(trainX, trainY, testX, testY)
}
Let's create and then train the model as we did before
// Split 80% train / 20% test, separating the "quality" label from the features.
val (trainX, trainY, testX, testY) =
trainTestSplit(df, "quality", 0.8)
/**
 * Flattens all numeric columns of this frame into one FloatArray per row.
 * Named lambda parameters replace the original shadowed nested `it`s.
 */
fun <T> DataFrame<T>.toX(): Array<FloatArray> =
    // Merge every numeric column into a single FloatArray column "X",
    // then materialize that column as Array<FloatArray>.
    merge { colsOf<Number>() }.by { rowValues -> rowValues.map { v -> v.toFloat() }.toFloatArray() }.into("X")
        .get { "X"<FloatArray>() }
        .toList()
        .toTypedArray()
/**
 * Extracts the label column of this frame as a FloatArray.
 *
 * @param labelColumn name of the Int label column; defaults to "quality"
 *   so existing no-argument call sites keep working.
 */
fun <T> DataFrame<T>.toY(labelColumn: String = "quality"): FloatArray =
    get { labelColumn<Int>() }
        .asIterable()
        .map { it.toFloat() }
        .toFloatArray()
// Convert the four frames to KotlinDL-friendly arrays...
val trainXDL = trainX.toX()
val trainYDL = trainY.toY()
val testXDL = testX.toX()
val testYDL = testY.toY()
// ...and wrap them in on-heap datasets.
val trainKotlinDLDataset = OnHeapDataset.create({ trainXDL }, { trainYDL })
val testKotlinDLDataset = OnHeapDataset.create({ testXDL }, { testYDL })
// Input width taken from the manually split training data. The original read
// `train.x[0]` — the dataset of the PREVIOUS section — which silently coupled
// this cell to earlier state; trainXDL is the data this model actually uses.
val inputNeurons = trainXDL[0].size.toLong()

// Same architecture as before: two hidden Tanh layers (10x input width)
// and one linear output neuron, He-normal initialization throughout.
val model2 = Sequential.of(
    Input(
        inputNeurons
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    ),
    Dense(
        outputSize = 1,
        activation = Activations.Linear,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    )
)
model2.compile(optimizer = Adam(), loss = Losses.MSE, metric = Metrics.MAE)
model2.printSummary()
// Train on the dataset built from the manual split. The original called
// `model2.fit(train, ...)`, reusing the previous section's dataset and leaving
// trainKotlinDLDataset entirely unused — clearly not the intent of this section.
val trainHist = model2.fit(trainKotlinDLDataset, batchSize = 500, epochs = 2000)
trainHist.epochHistory.toDataFrame().tail()
// Spot-check one held-out sample: prediction vs. ground truth.
model2.predictSoftly(testXDL[9])[0]
testYDL[9]
We can also compare predicted and ground truth values to ensure predictions are correct
// Round every prediction on the test set to the nearest integer quality score.
val predicted = testXDL.indices.map { i ->
    round(model2.predictSoftly(testXDL[i])[0]).toInt()
}.toColumn("predicted")
// Ground-truth labels as integers, for side-by-side comparison.
val ground_truth = testYDL.map { label ->
    label.toInt()
}.toColumn("ground_truth")
val predDf = dataFrameOf(predicted, ground_truth)
predDf.head()
// NOTE(review): `inds` appears unused in the visible notebook — candidate for removal.
val inds = List(10) { it + 1 }
// Cross-tabulate ground truth vs. predicted quality (a confusion matrix).
val ctab = predDf
.groupBy { ground_truth }.pivotCounts(inward = false) { predicted }
.sortBy { ground_truth }
// Colour each count cell: the closer the predicted score (column name) is to the
// true score (row), the greener the background; k scales the green channel.
ctab.format { drop(1) }.perRowCol { row, col ->
val y = col.name().toInt()
val x = row.ground_truth
val k = 1.0 - abs(x - y) / 10.0
background(RGBColor(50, (50 + k * 200).toInt().toShort(), 50))
}
// Per-row absolute deviation between prediction and ground truth.
val predDf2 = predDf.add("avg_dev") { abs(predicted - ground_truth) }
predDf2.avg_dev.cast<Double>().describe()
// 70th-percentile deviation. Derive the row count from the frame itself rather
// than the hard-coded test-set size (319), so this survives a different split
// fraction or dataset.
predDf2.sortBy { avg_dev }[(0.7 * (predDf2.count() - 1)).toInt()]
model2.close()