%useLatestDescriptors
%use dataframe, lets-plot

// Titanic survival exploration with Kotlin DataFrame and Lets-Plot.
// NOTE(review): the `%use` magics mean this runs as a Kotlin notebook. Statements that use
// column accessors of a frame declared just above (e.g. `title`, `familyNumber`, `surname`)
// presumably lived in separate cells — confirm the cell boundaries when splitting this up.

// Load the semicolon-delimited data set, parsing values with the French locale.
val df = DataFrame.readCSV(
    fileOrUrl = "../../idea-examples/titanic/src/main/resources/titanic.csv",
    delimiter = ';',
    parserOptions = ParserOptions(locale = java.util.Locale.FRENCH),
)

df.head()

df.describe()

df

val df1 = df
    // imputing
    .fillNulls { sibsp and parch and age and fare }.perCol { mean() }
    .fillNulls { sex }.with { "female" }
    .fillNulls { embarked }.with { "S" }
    .convert { sibsp and parch and age and fare }.toDouble()

df1.head()

df1.schema()

df1.corr()

// Correlation of every column with `survived`, sorted by that correlation.
val correlations = df1
    .corr { all() }.with { survived }
    .sortBy { survived }

correlations

// Mean survival per category for several candidate features.
df1.groupBy { pclass }.mean { survived }.sortBy { pclass }

df1.groupBy { sex }.mean { survived }.sortBy { survived }

df1.groupBy { sibsp }.mean { survived }.sortBy { sibsp }

df1.groupBy { parch }.mean { survived }.sortBy { parch }

val byAge = df1.valueCounts { age }.sortBy { age }

byAge

// JetBrains color palette
val colors = mapOf(
    "light_orange" to "#ffb59e",
    "orange" to "#ff6632",
    "light_grey" to "#a6a6a6",
    "dark_grey" to "#4c4c4c",
)

// Resolve each palette entry once. `getValue` reports the missing key by name,
// which replaces the repeated `!!` assertions on nullable map lookups.
val darkGrey = colors.getValue("dark_grey")
val lightGrey = colors.getValue("light_grey")
val orange = colors.getValue("orange")

// Manual scale values shared by the histogram, density and box plots of `survived`.
val survivalPalette = listOf(darkGrey, orange)

letsPlot(byAge.toMap()) { x = "age"; y = "count" } +
    geomPoint(size = 5, color = darkGrey) +
    ggsize(width = 850, height = 500)

// Uses the raw `df` (not the imputed `df1`), so rows with an unknown age are dropped
// rather than replaced by the mean.
val age = df.select { age }.dropNulls().sortBy { age }

letsPlot(age.toMap()) { x = "age" } +
    geomHistogram(binWidth = 5, fill = orange) +
    ggsize(width = 850, height = 500)

df1.groupBy { age }.pivotCounts { survived }.sortBy { age }

val survivedByAge = df1.select { survived and age }.sortBy { age }

survivedByAge

// Base plot with `survived` turned into a readable label; layers are added per chart below.
val plot = letsPlot(
    survivedByAge
        .convert { survived }.with { if (it == 1) "Survived" else "Died" }
        .toMap()
)

plot +
    geomHistogram(
        binWidth = 5,
        alpha = 0.7,
        position = positionDodge(),
    ) { x = "age"; fill = "survived" } +
    scaleFillManual(values = survivalPalette) +
    ggsize(width = 850, height = 500)

// Density plot
plot +
    geomDensity { x = "age"; color = "survived" } +
    scaleColorManual(values = survivalPalette) +
    ggsize(width = 850, height = 250)

// A basic box plot
plot +
    geomBoxplot { x = "survived"; y = "age"; fill = "survived" } +
    scaleFillManual(values = survivalPalette) +
    ggsize(width = 500, height = 400)

val pivoted = df1.pivotMatches { pclass and sex and embarked }

pivoted.head()

val df2 = pivoted
    // feature extraction
    .select { cols(survived, pclass, sibsp, parch, age, fare, sex, embarked) }
    .convert { cols { it.isValueColumn() } /* TODO: change to valueCols() after Selection DSL overhaul */ }.toDouble()

df2.head()

val titanicData = df2.flatten().toMap()

// Each entry configures only the layers that differ; the shared gradient and the final
// build() are applied once instead of being repeated per plot.
val plots = listOf(
    CorrPlot(titanicData, "Tiles").tiles(),
    CorrPlot(titanicData, "Points").points(),
    CorrPlot(titanicData, "Tiles and labels").tiles().labels(),
    CorrPlot(titanicData, "Tiles, points and labels").points().labels().tiles(),
).map { corrPlot -> corrPlot.paletteGradient(orange, lightGrey, darkGrey).build() }

// One width for the single grid column, one height per plot.
val widths = listOf(700)
val heights = plots.map { 600 }

gggrid(plots = plots, ncol = 1, widths = widths, heights = heights)

val familyDF = df1.add("familyNumber") { sibsp + parch }

familyDF.head()

familyDF.corr { familyNumber }.with { survived }

familyDF.corr { familyNumber }.with { age }

/**
 * Returns the trimmed text between the first ',' and the first '.' of [fullName].
 *
 * Assumes names look like `Surname, Title. Given names` — TODO confirm against the data;
 * a name without a ',' before its first '.' throws [IndexOutOfBoundsException].
 */
fun titleOf(fullName: String): String = fullName.split(".")[0].split(",")[1].trim()

/** Returns the trimmed text before the first ',' in the part of [fullName] preceding its first '.'. */
fun surnameOf(fullName: String): String = fullName.split(".")[0].split(",")[0].trim()

val titledDF = df
    .select { survived and name }
    .add("title") { titleOf(name) }

titledDF.head(100)

titledDF.valueCounts { title }

val rareTitles = listOf(
    "Dona", "Lady", "the Countess", "Capt", "Col", "Don",
    "Dr", "Major", "Rev", "Sir", "Jonkheer",
)

// Fold spelling variants into "Miss"/"Mrs" and bucket all `rareTitles` together.
val cleanedTitledDF = titledDF.update { title }.with { rawTitle ->
    when (rawTitle) {
        "Mlle", "Ms" -> "Miss"
        "Mme" -> "Mrs"
        in rareTitles -> "Rare Title"
        else -> rawTitle
    }
}

cleanedTitledDF.valueCounts { title }

// NOTE(review): this reuses the name `correlations` from the survival-correlation step
// above, which only works when the two declarations sit in different notebook cells.
val correlations = cleanedTitledDF
    .pivotMatches { title }
    .corr { title }.with { survived }

correlations

correlations
    .update { title }.with { it.substringAfter('_') }
    .filter { title != "survived" }

val groupedCleanedTitledDF = cleanedTitledDF
    .valueCounts { title and survived }
    .sortBy { title and survived }

groupedCleanedTitledDF

val surnameDF = df1
    .select { survived and name }
    .add("surname") { surnameOf(name) }

surnameDF.head()

surnameDF.valueCounts { surname }

surnameDF.surname.countDistinct()

// The delegate needs an explicit value type; the column holds the surname's first
// character as a String.
val firstSymbol by column<String>()

df1
    .add(firstSymbol) { surnameOf(name).first().toString() }
    .pivotMatches(firstSymbol)
    .corr { firstSymbol }.with { survived }