%useLatestDescriptors
%use dataframe, lets-plot

// Load the Titanic passenger data set.
// The file is ';'-delimited and its values are parsed with the French locale.
// Declared as `val`: the frame is never reassigned, every later step derives a new frame from it.
val df = DataFrame.readCSV(
    fileOrUrl = "../../idea-examples/titanic/src/main/resources/titanic.csv",
    delimiter = ';',
    parserOptions = ParserOptions(locale = java.util.Locale.FRENCH),
)

// Preview the first rows.
df.head()

// Per-column summary produced by describe().
df.describe()

// Render the whole frame.
df

// Imputation, followed by a numeric conversion:
//  - sibsp, parch, age, fare: nulls are filled per column with that column's mean()
//  - sex: nulls are filled with "female"
//  - embarked: nulls are filled with "S"
//  - sibsp, parch, age, fare are then converted to Double
val df1 = df
    .fillNulls { sibsp and parch and age and fare }
    .perCol { mean() }
    .fillNulls { sex }
    .with { "female" }
    .fillNulls { embarked }
    .with { "S" }
    .convert { sibsp and parch and age and fare }
    .toDouble()

// Preview the imputed frame.
df1.head()

// Column names and types after imputation.
df1.schema()

// Correlation matrix of the imputed frame.
df1.corr()

// Correlation of every column with `survived`, ordered by that coefficient.
val correlations = df1
    .corr { all() }
    .with { survived }
    .sortBy { survived }
correlations

// Mean of `survived` per passenger class.
df1
    .groupBy { pclass }
    .mean { survived }
    .sortBy { pclass }

// Mean of `survived` per sex, ordered by the mean itself.
df1
    .groupBy { sex }
    .mean { survived }
    .sortBy { survived }

// Mean of `survived` per `sibsp` value.
df1
    .groupBy { sibsp }
    .mean { survived }
    .sortBy { sibsp }

// Mean of `survived` per `parch` value.
df1
    .groupBy { parch }
    .mean { survived }
    .sortBy { parch }

// Number of rows per distinct age, ordered by age.
val byAge = df1
    .valueCounts { age }
    .sortBy { age }
byAge

// JetBrains color palette: palette name -> hex colour code used by the plots.
val colors: Map<String, String> = mapOf(
    Pair("light_orange", "#ffb59e"),
    Pair("orange", "#ff6632"),
    Pair("light_grey", "#a6a6a6"),
    Pair("dark_grey", "#4c4c4c"),
)

// Scatter plot of the `byAge` frame: x is the "age" column, y is the "count" column.
letsPlot(byAge.toMap()) {
    x = "age"
    y = "count"
} +
    geomPoint(size = 5, color = colors["dark_grey"]) +
    ggsize(width = 850, height = 500)

// Ages taken from the original frame `df` (not `df1`), with null rows dropped, ordered by age.
val age = df
    .select { age }
    .dropNulls()
    .sortBy { age }

// Histogram of those ages with a bin width of 5.
letsPlot(age.toMap()) { x = "age" } +
    geomHistogram(binWidth = 5, fill = colors["orange"]) +
    ggsize(width = 850, height = 500)

// Row counts per age, pivoted by the value of `survived`.
df1.groupBy { age }.pivotCounts { survived }.sortBy { age }

// Only the two columns needed for the survival-by-age plots, ordered by age.
val survivedByAge = df1.select { survived and age }.sortBy { age }
survivedByAge

// Colours shared by the three survival plots below, declared once instead of per plot.
// getValue reports the missing key by name, where `!!` would only raise a bare NullPointerException.
val survivalPalette = listOf(colors.getValue("dark_grey"), colors.getValue("orange"))

// Base plot: `survived` is recoded to the label "Survived" when it equals 1 and "Died" otherwise.
val plot = letsPlot(
    survivedByAge
        .convert { survived }.with { if (it == 1) "Survived" else "Died" }
        .toMap()
)

// Dodged histogram of age, filled by survival label
plot +
        geomHistogram(
            binWidth = 5,
            alpha = 0.7,
            position = positionDodge(),
        ) { x = "age"; fill = "survived" } +
        scaleFillManual(values = survivalPalette) +
        ggsize(width = 850, height = 500)

// Density plot
plot +
        geomDensity { x = "age"; color = "survived" } +
        scaleColorManual(values = survivalPalette) +
        ggsize(width = 850, height = 250)

// A basic box plot
plot +
        geomBoxplot { x = "survived"; y = "age"; fill = "survived" } +
        scaleFillManual(values = survivalPalette) +
        ggsize(width = 500, height = 400)

// pivotMatches applied to pclass, sex and embarked.
val pivoted = df1
    .pivotMatches { pclass and sex and embarked }
pivoted.head()

// Feature extraction: keep the listed columns and convert every value column to Double.
// TODO: change `cols { it.isValueColumn() }` to valueCols() after Selection DSL overhaul
val df2 = pivoted
    .select { cols(survived, pclass, sibsp, parch, age, fare, sex, embarked) }
    .convert { cols { it.isValueColumn() } }
    .toDouble()

df2.head()

// Flattened frame as a map, the input format passed to CorrPlot.
val titanicData = df2
    .flatten()
    .toMap()

// Four correlation plots over `titanicData` that differ only in title and enabled layers.
// The layer calls stay in their original per-plot order; the shared gradient and build() step
// is applied once in `map` instead of being repeated for each plot.
// getValue reports a missing palette key by name, where `!!` would only raise a bare NullPointerException.
val plots = listOf(
    CorrPlot(titanicData, "Tiles").tiles(),
    CorrPlot(titanicData, "Points").points(),
    CorrPlot(titanicData, "Tiles and labels").tiles().labels(),
    CorrPlot(titanicData, "Tiles, points and labels").points().labels().tiles(),
).map { corrPlot ->
    corrPlot
        .paletteGradient(
            colors.getValue("orange"),
            colors.getValue("light_grey"),
            colors.getValue("dark_grey"),
        )
        .build()
}

// Grid sizing: a single column, and one height entry per plot.
val widths = listOf(700)
val heights = plots.map { 600 }

gggrid(plots = plots, ncol = 1, widths = widths, heights = heights)

// Add a `familyNumber` column holding sibsp + parch for each row.
val familyDF = df1.add("familyNumber") { sibsp + parch }

// Preview the frame with the new column.
familyDF.head()

// Correlation between familyNumber and survived.
familyDF.corr { familyNumber }.with { survived }

// Correlation between familyNumber and age.
familyDF.corr { familyNumber }.with { age }

// Derive a `title` column from `name`: take the text before the first ".",
// then the second comma-separated piece of it, trimmed.
val titledDF = df
    .select { survived and name }
    .add("title") { name.split(".")[0].split(",")[1].trim() }
titledDF.head(100)

// How often each raw title occurs.
titledDF.valueCounts { title }

// Titles that get folded into the single "Rare Title" bucket.
val rareTitles = listOf(
    "Dona",
    "Lady",
    "the Countess",
    "Capt",
    "Col",
    "Don",
    "Dr",
    "Major",
    "Rev",
    "Sir",
    "Jonkheer",
)

// Normalise titles: "Mlle" and "Ms" become "Miss", "Mme" becomes "Mrs",
// rare titles become "Rare Title", everything else is left as is.
val cleanedTitledDF = titledDF.update { title }.with { rawTitle ->
    when (rawTitle) {
        "Mlle", "Ms" -> "Miss"
        "Mme" -> "Mrs"
        in rareTitles -> "Rare Title"
        else -> rawTitle
    }
}

// Title frequencies after normalisation.
cleanedTitledDF.valueCounts { title }

// Correlation of the pivoted title columns with `survived`.
val correlations = cleanedTitledDF
    .pivotMatches { title }
    .corr { title }
    .with { survived }
correlations

// Keep only the part after '_' in each label and drop the "survived" row.
// NOTE(review): this assumes the corr result exposes its row labels under `title` - confirm.
correlations
    .update { title }
    .with { it.substringAfter('_') }
    .filter { title != "survived" }

// Row counts per (title, survived) pair, ordered by title and then survived.
val groupedCleanedTitledDF = cleanedTitledDF
    .valueCounts { title and survived }
    .sortBy { title and survived }
groupedCleanedTitledDF

// Surname part of a passenger name: the text before the first ".",
// then the piece before its first ",", trimmed.
fun surnameOf(fullName: String): String =
    fullName.split(".")[0].split(",")[0].trim()

// `survived` and `name`, plus the derived `surname` column.
val surnameDF = df1
    .select { survived and name }
    .add("surname") { surnameOf(name) }
surnameDF.head()

// How often each surname occurs.
surnameDF.valueCounts { surname }

// Number of distinct surnames.
surnameDF.surname.countDistinct()

// Column accessor for the first character of the surname.
val firstSymbol by column<String>()

// Correlation between the pivoted first-letter columns and `survived`.
df1
    .add(firstSymbol) { surnameOf(name).first().toString() }
    .pivotMatches(firstSymbol)
    .corr { firstSymbol }
    .with { survived }