%use dataframe

%useLatestDescriptors
%use kandy

// Load the Titanic dataset from "titanic.csv", a ';'-delimited file.
// Values are parsed with French-locale parser options.
// NOTE(review): presumably the file uses decimal commas — confirm against the data.
val df = DataFrame.readCsv(
    fileOrUrl = "titanic.csv",
    delimiter = ';',
    parserOptions = ParserOptions(locale = java.util.Locale.FRENCH),
)

// Preview the first rows.
df.head()

// Per-column summary of the frame.
df.describe()

// Display the whole frame.
df

// Cleaned copy of `df`: nulls are filled first, then the four numeric columns become Double.
val df1 = df
    // imputing
    // Nulls in the numeric columns are filled per column using mean().
    .fillNulls { sibsp and parch and age and fare }.perCol { mean() }
    // Null `sex` values get the placeholder category "other".
    .fillNulls { sex }.with { "other" }
    // Null `embarked` values are filled with "S".
    .fillNulls { embarked }.with { "S" }
    .convert { sibsp and parch and age and fare }.toDouble()

df1.head()

// Column names and types after cleaning.
df1.schema()

// Correlation table for `df1`.
df1.corr()

// Correlation of every column with `survived`, sorted by the resulting `survived` column.
val correlations = df1
    .corr { all() }.with { survived }
    .sortBy { survived }
correlations

// Mean of `survived` per `pclass` value, ordered by `pclass`.
df1.groupBy { pclass }.mean { survived }.sortBy { pclass }

// Mean of `survived` per `sex` value, ordered by that mean.
df1.groupBy { sex }.mean { survived }.sortBy { survived }

// Mean of `survived` per `sibsp` value, ordered by `sibsp`.
df1.groupBy { sibsp }.mean { survived }.sortBy { sibsp }

// Mean of `survived` per `parch` value, ordered by `parch`.
df1.groupBy { parch }.mean { survived }.sortBy { parch }

// Number of rows for each distinct `age`, ordered by `age`.
val byAge = df1.valueCounts { age }.sortBy { age }
byAge

/**
 * JetBrains color palette: the hex colors reused by the plots in this notebook.
 */
object JetBrainsColors {
    // Grey shades
    val darkGrey = Color.hex("#4c4c4c")
    val lightGrey = Color.hex("#a6a6a6")

    // Orange shades
    val orange = Color.hex("#ff6632")
    val lightOrange = Color.hex("#ffb59e")
}

// Point plot of the per-age row counts held in `byAge` (x = age, y = count).
byAge.plot { 
    points {
        x(age)
        y(count)
        size = 5.0
        color = JetBrainsColors.lightGrey
    }
    layout { 
        size = 850 to 500
    }
}

// `age` column taken from the raw `df` (not the imputed `df1`), with null rows dropped, sorted by age.
// NOTE(review): presumably the raw frame is used so mean-filled ages do not distort the histogram — confirm.
val age = df.select { age }.dropNulls().sortBy { age }

// Histogram of `age` with bins of width 5.0.
age.plot {
    histogram(x = age, binsOption = BinsOption.byWidth(5.0)) {
        fillColor = JetBrainsColors.orange
    }
    layout { 
        size = 850 to 500
    }
}

// Counts of each `survived` value per `age`, ordered by `age`.
df1.groupBy { age }.pivotCounts { survived }.sortBy { age }

// Keep only `survived` and `age`, sorted by age, and relabel `survived`:
// the value 1 becomes "Survived", every other value becomes "Died".
val survivedByAge = df1
    .select { survived and age }
    .sortBy { age }
    .convert { survived }.with { if (it == 1) "Survived" else "Died" }

survivedByAge

survivedByAge.groupBy { survived }

// Age histogram (bin width 5.0) per `survived` group; bars are dodged and
// semi-transparent, with fill color chosen by the group key.
survivedByAge.groupBy { survived }.plot {
    histogram(x = age, binsOption = BinsOption.byWidth(5.0)) {
        fillColor(key.survived) {
            scale = categorical(
                "Survived" to JetBrainsColors.orange,
                "Died" to JetBrainsColors.darkGrey,
            )
        }
        alpha = 0.7
        position = Position.dodge()
    }
    layout {
        size = 850 to 500
    }
}

// Density plot
// Age density per `survived` group: grey fill for both groups, border line colored by the group key.
survivedByAge.groupBy { survived }.plot {
    densityPlot(x = age) {
        fillColor = Color.GREY
        alpha = 0.3
        borderLine {
            color(key.survived) {
                scale = categorical(
                    "Survived" to JetBrainsColors.orange,
                    "Died" to JetBrainsColors.darkGrey,
                )
            }
        }
    }
    layout {
        size = 850 to 250
    }
}

survivedByAge.groupBy { survived }

// A basic box plot
// Age distribution per `survived` label; box fill follows the x category (Stat.x).
survivedByAge.plot {
    boxplot(x = survived, y = age) {
        boxes {
            fillColor(Stat.x) {
                scale = categorical(
                    "Survived" to JetBrainsColors.orange,
                    "Died" to JetBrainsColors.darkGrey,
                )
            }
        }
    }
    layout {
        size = 500 to 400
    }
}

// Expand `pclass`, `sex` and `embarked` with pivotMatches.
// NOTE(review): presumably this yields one boolean column per distinct value (one-hot style) — confirm against the DataFrame docs.
val pivoted = df1.pivotMatches { pclass and sex and embarked }
pivoted.head()

val df2 = pivoted
    // feature extraction
    .select { cols(survived, pclass, sibsp, parch, age, fare, sex, embarked) }
    // Convert every value column to Double.
    .convert { valueCols() }.toDouble()

df2.head()

// Correlation of the five selected columns with each other.
df2.corr { survived and sibsp and parch and age and fare }.withItself()

// The same correlations in long form: every column after "column" is gathered
// into a "row" key column and a "value" column.
val correlationTable = df2
    .corr { survived and sibsp and parch and age and fare }.withItself()
    .gather { allAfter("column") }.into("row", "value")
correlationTable

/**
 * Builds a continuous color scale from a three-stop gradient
 * (orange, light grey, dark grey) over the fixed domain -1.0 to 1.0.
 */
fun scaleContinuousColorGradientN() = with(JetBrainsColors) {
    continuousColorGradientN(
        gradientColors = listOf(orange, lightGrey, darkGrey),
        domainMin = -1.0,
        domainMax = 1.0,
    )
}

// Tile plot of the long-form correlations: one tile per (row, column) pair,
// filled according to `value` using the gradient scale; axis titles are blanked.
correlationTable.plot {
    tiles {
        x(row) { axis.name = "" }
        y(column) { axis.name = "" }
        fillColor(value) { scale = scaleContinuousColorGradientN() }
    }
}

// Same data as square points: both size and color follow `value`.
// The size legend gets no breaks, major grid lines are blanked, and axis titles are empty.
correlationTable.plot {
    points {
        size(value) {
            legend {
                breaks(emptyList())
            }
        }
        symbol = Symbol.SQUARE
        x(row) {
            axis.name = ""
        }
        y(column) {
            axis.name = ""
        }
        color(value) { scale = scaleContinuousColorGradientN() }
    }
    layout {
        style {
            panel.grid {
                majorLine {
                    blank = true
                }
            }
        }
        size = 500 to 350
    }
}

// Add a derived column `familyNumber` = sibsp + parch.
val familyDF = df1
    .add("familyNumber") { sibsp + parch }

familyDF.head()

// Correlation of `familyNumber` with `survived`.
familyDF.corr { familyNumber }.with { survived }

// Correlation of `familyNumber` with `age`.
familyDF.corr { familyNumber }.with { age }

// Derive a `title` column from `name` (taken from the raw `df`): take the text before the
// first ".", then the second comma-separated piece of it, trimmed.
// NOTE(review): assumes names look like "Surname, Title. Given names" — a name without a
// comma before the first "." would make the `[1]` index fail; confirm against the data.
val titledDF = df
    .select { survived and name }
    .add("title") {
        name.split(".")[0].split(",")[1].trim()
    }
titledDF.head(100)

// Number of rows per distinct title.
titledDF.valueCounts { title }

// Titles that are later collapsed into a single "Rare Title" category.
val rareTitles = listOf(
    "Dona", "Lady", "the Countess", "Capt", "Col", "Don",
    "Dr", "Major", "Rev", "Sir", "Jonkheer",
)

// Normalize the `title` column: "Mlle" and "Ms" become "Miss", "Mme" becomes "Mrs",
// every entry of `rareTitles` becomes "Rare Title", and all other titles are kept as-is.
val cleanedTitledDF = titledDF.update { title }.with { rawTitle ->
    when (rawTitle) {
        "Mlle", "Ms" -> "Miss"
        "Mme" -> "Mrs"
        in rareTitles -> "Rare Title"
        else -> rawTitle
    }
}

// Number of rows per normalized title.
cleanedTitledDF.valueCounts { title }

// Correlation of the pivotMatches-expanded `title` columns with `survived`.
// NOTE(review): `correlations` is also declared earlier in this file; this relies on the
// notebook allowing a top-level name to be redeclared — confirm before running as a plain script.
val correlations = cleanedTitledDF
    .pivotMatches { title }
    .corr { title }.with { survived }
correlations

// Keep only the text after the first '_' in each `title` value and drop the "survived" row.
correlations
    .update { title }.with { it.substringAfter('_') }
    .filter { title != "survived" }

// Number of rows per (title, survived) combination, ordered by title and then survived.
val groupedCleanedTitledDF = cleanedTitledDF
    .valueCounts { title and survived }
    .sortBy { title and survived }
groupedCleanedTitledDF

// Derive a `surname` column from `name`: take the text before the first ".",
// then the first comma-separated piece of it, trimmed.
val surnameDF = df1
    .select { survived and name }
    .add("surname") {
        name.split(".")[0].split(",")[0].trim()
    }
surnameDF.head()

// Number of rows per distinct surname.
surnameDF.valueCounts { surname }

// Number of distinct surnames.
surnameDF.surname.countDistinct()

// Column accessor for the first character of the surname, stored as a String.
val firstSymbol by column<String>()

// Add the first surname character as a column, expand it with pivotMatches,
// and correlate the result with `survived`.
// NOTE(review): `.first()` throws on an empty string — assumes every name yields a
// non-empty surname; confirm against the data.
df1
    .add(firstSymbol) {
        name.split(".")[0].split(",")[0].trim().first().toString()
    }
    .pivotMatches(firstSymbol)
    .corr { firstSymbol }.with { survived }