Use the latest versions of the DataFrame and KotlinDL libraries from the descriptor repository.
%useLatestDescriptors
%use dataframe, kotlin-dl
Read the dataframe from a CSV file and print its first few lines.
// The red-wine-quality CSV uses ';' as the field separator
val raw_df = DataFrame.readCSV(fileOrUrl = "winequality-red.csv", delimiter = ';')
raw_df.head()
DataFrame: rowsCount = 5, columnsCount = 12
// Pairwise correlation matrix, with each Double cell colored on a gradient
// from red (correlation -1.0) to green (correlation +1.0)
raw_df.corr().format { colsOf<Double>() }.with { linearBg(it, -1.0 to red, 1.0 to green) }
DataFrame: rowsCount = 12, columnsCount = 13
Based on the correlation matrix, we can remove some columns that appear to be insignificant.
// Drop the weakly-correlated features (backticks allow column names containing spaces)
val df = raw_df.remove {`free sulfur dioxide` and `residual sugar` and pH }
// Bridge from a DataFrame to KotlinDL's on-heap dataset representation:
// every column except [labelColumnName] becomes a feature, the named
// column becomes the label vector.
fun <T> DataFrame<T>.toOnHeapDataset(labelColumnName: String): OnHeapDataset =
    OnHeapDataset.create(dataframe = this, yColumn = labelColumnName)
// Factory extension: builds an OnHeapDataset from [dataframe], using the
// [yColumn] column as labels and all remaining columns as features.
fun OnHeapDataset.Companion.create(
    dataframe: DataFrame<Any?>,
    yColumn: String
): OnHeapDataset {
    // Lazily produces one FloatArray of features per row
    // (assumes every remaining column holds Float values — see the
    // conversion at the call site)
    val features = {
        dataframe.remove(yColumn).rows()
            .map { row -> (row.values() as List<Float>).toFloatArray() }
            .toTypedArray()
    }
    // Lazily produces the label vector from the y column
    val labels = {
        dataframe.get { yColumn<Float>() }.toList().toFloatArray()
    }
    return create(features, labels)
}
// KotlinDL works with Float tensors, so convert every Double column to Float,
// wrap the frame as an OnHeapDataset, and split 80/20 into train/test
val (train, test) = df.convert { colsOf<Double>() }.toFloat()
.toOnHeapDataset(labelColumnName = "quality")
.split(0.8)
Define a simple neural network with two hidden dense layers and a linear output layer.
// One input neuron per feature column
val inputNeurons = train.x[0].size.toLong()
// Regression network: two hidden Tanh layers (10x the input width)
// followed by a single linear output neuron
val model = Sequential.of(
Input(
inputNeurons
),
Dense(
outputSize = (inputNeurons * 10).toInt(),
activation = Activations.Tanh,
kernelInitializer = HeNormal(),
biasInitializer = HeNormal()
),
Dense(
outputSize = (inputNeurons * 10).toInt(),
activation = Activations.Tanh,
kernelInitializer = HeNormal(),
biasInitializer = HeNormal()
),
Dense(
outputSize = 1,
activation = Activations.Linear,
kernelInitializer = HeNormal(),
biasInitializer = HeNormal()
)
)
// MSE loss with MAE reported as the human-readable metric
model.compile(optimizer = Adam(), loss = Losses.MSE, metric = Metrics.MAE)
model.printSummary()
============================================================================== Model type: Sequential ______________________________________________________________________________ Layer (type) Output Shape Param # ============================================================================== input_1(Input) [None, 8] 0 ______________________________________________________________________________ dense_2(Dense) [None, 80] 720 ______________________________________________________________________________ dense_3(Dense) [None, 80] 6480 ______________________________________________________________________________ dense_4(Dense) [None, 1] 81 ______________________________________________________________________________ ============================================================================== Total trainable params: 7281 Total frozen params: 0 Total params: 7281 ==============================================================================
Train it!
// Train the model, then show the last few epochs of the training history
val trainHist = model.fit(train, batchSize = 500, epochs=2000)
trainHist.epochHistory.toDataFrame().tail()
DataFrame: rowsCount = 5, columnsCount = 5
Let's check that our network predicts values more or less correctly:
// Raw (float) prediction for a single test sample
model.predictSoftly(test.x[9])[0]
5.24826
// Corresponding ground-truth label for the same sample
test.y[9]
5.0
Close the model:
// Release the model's resources
model.close()
// Randomly shuffles the rows of [d] and splits them into train/test parts,
// separating the [col] column (labels) from the remaining columns (features).
// Returns ((trainX, trainY), (testX, testY)); the train part holds
// ceil(n * trainPart) rows.
fun <T> trainTestSplit(d: DataFrame<T>, col: String, trainPart: Double): Pair<Pair<DataFrame<T>, DataFrame<T>>, Pair<DataFrame<T>, DataFrame<T>>> {
    val rowCount = d.count()
    val splitPoint = ceil(rowCount * trainPart).toInt()

    // Random permutation of row indices; first splitPoint go to train
    val shuffled = (0 until rowCount).shuffled()
    val trainFrame = d[shuffled.subList(0, splitPoint)]
    val testFrame = d[shuffled.subList(splitPoint, rowCount)]

    // Features = everything except the label column; labels = just that column
    fun featuresOf(frame: DataFrame<T>) = frame.select { all().except(cols(col)) }
    fun labelsOf(frame: DataFrame<T>) = frame.select(col)

    return (featuresOf(trainFrame) to labelsOf(trainFrame)) to
            (featuresOf(testFrame) to labelsOf(testFrame))
}
Let's create and then train the model as we did before
// Manual 80/20 split, then unpack the (features, labels) frames for each part
val (trainPair, testPair) = trainTestSplit(df, "quality", 0.8)
val (trainX, trainY) = trainPair
val (testX, testY) = testPair
// Merge all numeric columns of each row into one FloatArray feature vector
fun <T> DataFrame<T>.toX(): Array<FloatArray> =
merge { colsOf<Number>() }.by { it.map { it.toFloat() }.toFloatArray() }.into("X")
.get { "X"<FloatArray>() }.toList().toTypedArray()
// Extract the integer "quality" label column as a FloatArray
fun <T> DataFrame<T>.toY() = get { "quality"<Int>() }.asIterable().map { it.toFloat() }.toFloatArray()
// Materialize KotlinDL-friendly arrays from the split frames
val trainXDL = trainX.toX()
val trainYDL = trainY.toY()
val testXDL = testX.toX()
val testYDL = testY.toY()
// Wrap the arrays into KotlinDL on-heap datasets
val trainKotlinDLDataset = OnHeapDataset.create({trainXDL}, {trainYDL})
val testKotlinDLDataset = OnHeapDataset.create({testXDL}, {testYDL})
// Input width = number of feature columns, taken from the freshly prepared
// training data. (The original derived it from `train`, the dataset of the
// first pipeline whose model was already closed — the second pipeline should
// not depend on it.)
val inputNeurons = trainXDL[0].size.toLong()
// Same architecture as before: two hidden Tanh layers, one linear output neuron
val model2 = Sequential.of(
    Input(
        inputNeurons
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    ),
    Dense(
        outputSize = 1,
        activation = Activations.Linear,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    )
)
// MSE loss with MAE as the reported metric, as in the first model
model2.compile(optimizer = Adam(), loss = Losses.MSE, metric = Metrics.MAE)
model2.printSummary()
============================================================================== Model type: Sequential ______________________________________________________________________________ Layer (type) Output Shape Param # ============================================================================== input_1(Input) [None, 8] 0 ______________________________________________________________________________ dense_2(Dense) [None, 80] 720 ______________________________________________________________________________ dense_3(Dense) [None, 80] 6480 ______________________________________________________________________________ dense_4(Dense) [None, 1] 81 ______________________________________________________________________________ ============================================================================== Total trainable params: 7281 Total frozen params: 0 Total params: 7281 ==============================================================================
// Train on the dataset built from the manual split. The original code
// mistakenly reused `train` (the first pipeline's dataset), so model2 never
// saw the data prepared via trainTestSplit/toX/toY.
val trainHist = model2.fit(trainKotlinDLDataset, batchSize = 500, epochs = 2000)
trainHist.epochHistory.toDataFrame().tail()
DataFrame: rowsCount = 5, columnsCount = 5
// Raw (float) prediction for a single test sample
model2.predictSoftly(testXDL[9])[0]
5.9178224
// Corresponding ground-truth label for the same sample
testYDL[9]
6.0
We can also compare the predicted and ground-truth values to see how accurate the predictions are.
// Rounded integer prediction for every test sample
val predicted = testXDL
    .map { features -> round(model2.predictSoftly(features)[0]).toInt() }
    .toColumn("predicted")
// Ground-truth labels converted back to integers
val ground_truth = testYDL
    .map { label -> label.toInt() }
    .toColumn("ground_truth")
// Side-by-side frame of predictions vs. actual quality scores
val predDf = dataFrameOf(predicted, ground_truth)
predDf.head()
DataFrame: rowsCount = 5, columnsCount = 2
// (Removed the unused `val inds = List(10){it + 1}` local — nothing in the
// notebook referenced it.)
// Confusion-matrix-style cross-tabulation: rows = ground truth, columns = predicted
val ctab = predDf.groupBy { ground_truth }.pivotCounts(inward = false) { predicted }.sortBy { ground_truth }
// Color each count cell: the closer the predicted value is to the ground
// truth, the greener the cell (k decays by 0.1 per unit of deviation)
ctab.format { drop(1) }.perRowCol { row, col ->
val y = col.name().toInt()
val x = row.ground_truth
val k = 1.0 - abs(x - y)/10.0
background(RGBColor(50, (50 + k * 200).toInt().toShort(), 50))
}
DataFrame: rowsCount = 6, columnsCount = 5
// Per-row absolute deviation between prediction and ground truth
// (named "avg_dev"; it is a per-row value, averaged later via describe())
val predDf2 = predDf.add("avg_dev") { abs(predicted - ground_truth) }
predDf2.avg_dev.cast<Double>().describe()
DataFrame: rowsCount = 1, columnsCount = 12
// Row at the 70th percentile of absolute deviation; derive the index from the
// actual row count instead of the hard-coded test-set size (319)
predDf2.sortBy { avg_dev }[(0.7 * (predDf2.rowsCount() - 1)).toInt()]
DataRow: index = 222, columnsCount = 3
// Release the model's resources
model2.close()