%use dataframe
%useLatestDescriptors
%use kandy

// Load the Titanic data set: a ';'-delimited CSV whose values are parsed with the French locale.
val df = DataFrame.readCsv(
    fileOrUrl = "titanic.csv",
    delimiter = ';',
    parserOptions = ParserOptions(locale = java.util.Locale.FRENCH),
)

df.head()

df.describe()

df

// Imputation step:
//  - nulls in sibsp / parch / age / fare are replaced by the mean of the respective column,
//  - nulls in `sex` become "other", nulls in `embarked` become "S",
//  - afterwards the four numeric columns are converted to Double.
val df1 = df
    .fillNulls { sibsp and parch and age and fare }.perCol { mean() }
    .fillNulls { sex }.with { "other" }
    .fillNulls { embarked }.with { "S" }
    .convert { sibsp and parch and age and fare }.toDouble()

df1.head()

df1.schema()

df1.corr()

// Correlation of every column with `survived`, sorted by that correlation value.
val correlations = df1
    .corr { all() }.with { survived }
    .sortBy { survived }

correlations

// Mean of `survived` per group, for several grouping columns.
df1.groupBy { pclass }.mean { survived }.sortBy { pclass }

df1.groupBy { sex }.mean { survived }.sortBy { survived }

df1.groupBy { sibsp }.mean { survived }.sortBy { sibsp }

df1.groupBy { parch }.mean { survived }.sortBy { parch }

// Number of rows per distinct age value, ordered by age.
val byAge = df1.valueCounts { age }.sortBy { age }

byAge

// Colours shared by all plots below.
object JetBrainsColors {
    val lightOrange = Color.hex("#ffb59e")
    val orange = Color.hex("#ff6632")
    val lightGrey = Color.hex("#a6a6a6")
    val darkGrey = Color.hex("#4c4c4c")
}

// Scatter plot: age on x, row count on y.
byAge.plot {
    points {
        x(age)
        y(count)
        size = 5.0
        color = JetBrainsColors.lightGrey
    }
    layout {
        size = 850 to 500
    }
}

// Ages taken from the un-imputed frame `df`, with nulls dropped, sorted ascending.
val age = df.select { age }.dropNulls().sortBy { age }

// Histogram of age with bins of width 5.0.
age.plot {
    histogram(x = age, binsOption = BinsOption.byWidth(5.0)) {
        fillColor = JetBrainsColors.orange
    }
    layout {
        size = 850 to 500
    }
}

df1.groupBy { age }.pivotCounts { survived }.sortBy { age }

// `survived` + `age` only, with the value 1 relabelled "Survived" and anything else "Died".
val survivedByAge = df1
    .select { survived and age }
    .sortBy { age }
    .convert { survived }.with { if (it == 1) "Survived" else "Died" }

survivedByAge

survivedByAge.groupBy { survived }

// Age histogram (bin width 5.0) per `survived` group, bars dodged side by side.
survivedByAge.groupBy { survived }.plot {
    histogram(x = age, binsOption = BinsOption.byWidth(5.0)) {
        fillColor(key.survived) {
            scale = categorical(
                "Survived" to JetBrainsColors.orange,
                "Died" to JetBrainsColors.darkGrey,
            )
        }
        alpha = 0.7
        position = Position.dodge()
    }
    layout {
        size = 850 to 500
    }
}

// Age density per `survived` group: grey fill, border line coloured by group.
survivedByAge.groupBy { survived }.plot {
    densityPlot(x = age) {
        fillColor = Color.GREY
        alpha = 0.3
        borderLine {
            color(key.survived) {
                scale = categorical(
                    "Survived" to JetBrainsColors.orange,
                    "Died" to JetBrainsColors.darkGrey,
                )
            }
        }
    }
    layout {
        size = 850 to 250
    }
}

survivedByAge.groupBy { survived }

// Box plot of age for each `survived` label; boxes are filled by the x statistic.
survivedByAge.plot {
    boxplot(x = survived, y = age) {
        boxes {
            fillColor(Stat.x) {
                scale = categorical(
                    "Survived" to JetBrainsColors.orange,
                    "Died" to JetBrainsColors.darkGrey,
                )
            }
        }
    }
    layout {
        size = 500 to 400
    }
}

// Expand pclass / sex / embarked via pivotMatches.
val pivoted = df1.pivotMatches { pclass and sex and embarked }

pivoted.head()

// Feature selection: keep the listed columns and convert every value column to Double.
val df2 = pivoted
    .select { cols(survived, pclass, sibsp, parch, age, fare, sex, embarked) }
    .convert { valueCols() }.toDouble()

df2.head()

df2.corr { survived and sibsp and parch and age and fare }.withItself()

// Same correlation matrix, gathered into long form: every column after "column"
// is folded into a ("row", "value") pair.
val correlationTable = df2
    .corr { survived and sibsp and parch and age and fare }.withItself()
    .gather { allAfter("column") }.into("row", "value")

correlationTable

// Three-colour continuous gradient (orange, light grey, dark grey) over the domain [-1.0, 1.0].
fun scaleContinuousColorGradientN() = continuousColorGradientN(
    gradientColors = listOf(
        JetBrainsColors.orange,
        JetBrainsColors.lightGrey,
        JetBrainsColors.darkGrey,
    ),
    domainMin = -1.0,
    domainMax = 1.0,
)

// Correlation table drawn as tiles; both axis names are blanked.
correlationTable.plot {
    tiles {
        x(row) {
            axis.name = ""
        }
        y(column) {
            axis.name = ""
        }
        fillColor(value) {
            scale = scaleContinuousColorGradientN()
        }
    }
}

// Correlation table drawn as square points: size and colour both follow `value`;
// the size legend is given an empty list of breaks and major grid lines are blanked.
correlationTable.plot {
    points {
        size(value) {
            legend {
                breaks(emptyList())
            }
        }
        symbol = Symbol.SQUARE
        x(row) {
            axis.name = ""
        }
        y(column) {
            axis.name = ""
        }
        color(value) {
            scale = scaleContinuousColorGradientN()
        }
    }
    layout {
        style {
            panel.grid {
                majorLine {
                    blank = true
                }
            }
        }
        size = 500 to 350
    }
}

// New column: sibsp + parch.
val familyDF = df1
    .add("familyNumber") { sibsp + parch }

familyDF.head()

familyDF.corr { familyNumber }.with { survived }

familyDF.corr { familyNumber }.with { age }

// Title = text between the first ',' separator and the first '.', trimmed
// (second ','-separated piece of everything before the first '.').
val titledDF = df
    .select { survived and name }
    .add("title") { name.substringBefore(".").split(",")[1].trim() }

titledDF.head(100)

titledDF.valueCounts { title }

// Titles that get merged into the single "Rare Title" bucket below.
val rareTitles = listOf(
    "Dona", "Lady", "the Countess", "Capt", "Col", "Don",
    "Dr", "Major", "Rev", "Sir", "Jonkheer",
)

// Normalise titles: Mlle/Ms -> Miss, Mme -> Mrs, anything in `rareTitles` -> "Rare Title".
val cleanedTitledDF = titledDF.update { title }.with {
    when (it) {
        "Mlle", "Ms" -> "Miss"
        "Mme" -> "Mrs"
        in rareTitles -> "Rare Title"
        else -> it
    }
}

cleanedTitledDF.valueCounts { title }

// NOTE(review): this re-declares `correlations` from earlier in the file; that is accepted
// when each declaration lives in its own notebook cell, but would clash in a plain script — confirm.
val correlations = cleanedTitledDF
    .pivotMatches { title }
    .corr { title }.with { survived }

correlations

// Keep only the part of each `title` value after the first '_' and drop the "survived" row.
correlations
    .update { title }.with { it.substringAfter('_') }
    .filter { title != "survived" }

val groupedCleanedTitledDF = cleanedTitledDF
    .valueCounts { title and survived }
    .sortBy { title and survived }

groupedCleanedTitledDF

// Surname = text before the first '.' and before the first ',', trimmed.
val surnameDF = df1
    .select { survived and name }
    .add("surname") { name.substringBefore(".").substringBefore(",").trim() }

surnameDF.head()

surnameDF.valueCounts { surname }

surnameDF.surname.countDistinct()

// First character of the surname as a column, pivoted and correlated with `survived`.
val firstSymbol by column()

df1
    .add(firstSymbol) {
        name.substringBefore(".").substringBefore(",").trim().first().toString()
    }
    .pivotMatches(firstSymbol)
    .corr { firstSymbol }.with { survived }