%use dataframe, ktor-client

// to see autogenerated code, uncomment the line below:
//%trackExecution generated

// Location of the Titanic passenger data set (CSV).
val url = URL("https://gist.githubusercontent.com/cmelchior/0a9f9d95bb3de7a2304f81f2861ad62b/raw/735123034ea54c4a8d2554fff719e70b34287923/titanic.csv")

// Convert data to dataframe and rename the column names to more idiomatic Kotlin names.
// This also generates a marker interface for the typed data frame wrapper and extension properties for it.
val df = DataFrame.readCSV(url).renameToCamelCase()
df

// get typed column as extension property
df.name

// select single column -> returns DataFrame
df.select { name }

// select several columns
df.select { name and age and embarked }

// another way to select columns without compile-time check
df.select(df.name, df.age, df.sex)

// select columns filtered by predicate
// NOTE(review): the type argument was missing in the source; <String> is assumed here
// because the same selector is later described as "all string columns" - confirm.
df.select { colsOf<String>() }

// Row indexing
df[1]

// Column indexing
df.name[0]

// Same result
df[0].name

// filter rows by predicate. Predicate receiver is of type TypedDataFrameRow<*> with generated extension properties
df.filter { sex == "female" }

// Intentionally left failing to demonstrate null-safety of the generated properties.
df.filter { age > 50 } // compilation error, because 'age' is a nullable property

// filter rows where 'age' is not null.
val withAges = df.dropNA { age }
withAges

// now filtration works
withAges.filter { age > 50 }

// find the oldest survived woman
withAges.filter { survived == 1 && sex == "female" }.maxBy { age }

// sort by single column
withAges.sortByDesc { age }

// sort by several columns
withAges.sortBy { age and name }

// another way
withAges.sortBy(withAges.age, withAges.name)

// add new column and store result in a new field
val withYear = withAges.add("year") { 1912 - age }
withYear

// check new column
withYear.year

// add several columns
withAges.add {
    "year" from { 1912 - age }
    "died" from { survived == 0 }
}

// another way to build new column via column arithmetics
val birthYear = withAges.age * (-1) + 1912

// new column can be added to dataframe with '+' operator
withAges + birthYear.rename("year")

// Iterable of columns can also be added with '+'
withAges + withAges.columns().map { it.rename(it.name + " duplicate") }

// remove single column
df.remove { ticket }

// remove several columns
df.remove { pclass and ticket and cabin and survived }

// remove several columns by column instances
df.remove(df.passengerId, df.pclass)

// '-' operator can also be used for removing columns
df - { passengerId } - { pclass } - { cabin }

// group by single column
df.groupBy { embarked }.count()

// group by several columns
df.groupBy { sex and survived }.count()

// another way
df.groupBy(df.sex, df.survived).count()

// Various summarization operations on grouped data frame
withAges.groupBy { embarked }.aggregate {
    // Methods are invoked on every group as if they were called on a single DataFrame.
    count() into "total count"

    // Computed into a local first so the parenthesised expression can never be
    // parsed as a call on the preceding string literal.
    val survivalRate = count { survived == 1 }.toDouble() / count() * 100.0
    survivalRate into "survival rate"

    age.mean() into "average age"

    // Column operations are also supported
    age.median() into "median age"

    val youngest = minBy { age }
    youngest.name into "youngest"
    youngest.age into "youngest age"

    val oldest = maxBy { age }
    oldest.name into "oldest"
    oldest.age into "oldest age"
}

df.size()

withAges.count { age > 50 }

withAges.sortBy { age }.take(5)

withAges.sortBy { age }.takeLast(5)

// 'rows()' is an Iterable<DataRow<T>>, so it can be used with any stdlib extensions for Iterable
df.rows().map { it.name }.take(5)

// Sample List
data class Item(val first: Int, val second: Double)

val itemsList = listOf(Item(1, 2.0), Item(2, 3.0), Item(3, 4.0))

// List -> DataFrame by reflection
itemsList.toDataFrame()

// List -> DataFrame by mappings
itemsList.toDataFrame {
    "a" from { it.first }
    "b" from { it.second }
    "c" from { it.first * it.second }
}

// Convert data frame to a list of data class items
@DataSchema
data class Person(val name: String, val age: Double?)

// The target element type has to be given explicitly; it cannot be inferred here.
val passengers = df.toListOf<Person>()

// Check type of the element
passengers[0].javaClass

// Do any list operations (passengers with unknown age are treated as age 0.0)
passengers.maxBy { it.age ?: 0.0 }

// Create marker interface to write column-specific extensions for dataframes
@DataSchema
interface SimplePerson {
    val name: String
    val age: Double
}

// Create extension for any data frame with fields 'name' and 'age'
fun DataFrame<SimplePerson>.getOlderThan(minAge: Double) = filter { age > minAge }

// The @DataSchema and extension function is automatically applied to any dataframe
// created after that match the column names and types
val updatedWithAges = withAges

// The dataframe is now considered a subtype of `SimplePerson` and can access extension functions.
updatedWithAges.getOlderThan(50.0)

import org.jetbrains.kotlinx.dataframe.codeGen.generateInterfaces

// code for marker interface can be auto-generated
withAges.select { name and age and sex and ticket }.generateInterfaces("Person")

// Copy-paste the interface and run it
@DataSchema
interface Person {
    val age: kotlin.Double
    val name: kotlin.String
    val sex: kotlin.String
    val ticket: kotlin.String
}

// Now interface 'Person' is available, so we can write an extension method,
// that will work for any data frame with these four columns
fun DataFrame<Person>.addSummary() = add("summary") { "$sex $name $age has ticket $ticket" }

// for example, it works for 'withAges' data frame, but only after the cell has been evaluated.
val dfWithSummary = withAges
dfWithSummary.addSummary()

// When data frame variable is mutable, a strongly typed wrapper for it
// is generated only once after the first execution of a cell where it is declared
var nameAndSex = df.select(df.name, df.sex)
nameAndSex

// let's declare immutable variable, that contains all string columns
val strings = df.select { colsOf<String>() }
strings

// 'nameAndSex' is assignable from 'strings',
// because 'strings' has all the columns that are required by type of 'nameAndSex'
nameAndSex = strings

// note, that the actual value of 'nameAndSex' is still a data frame of all string columns
nameAndSex

// but typed access to the fields works only for 'name' and 'sex'
nameAndSex.sex // this is OK

// this fails with compilation error (intentional demonstration)
nameAndSex.ticket

nameAndSex["ticket"] // the requested column is still available by column name string

// now let's create a variable with two other columns
val nameAndTicket = df.select(df.name, df.ticket)
nameAndTicket

// Intentionally left failing to demonstrate the compile-time column check.
nameAndSex = nameAndTicket // this assignment doesn't work because of columns mismatch

// unfortunately, there is a way to get a runtime error here,
// because typed wrappers are generated only after execution of a cell
// so the following assignment will pass fine, because return type of 'select' is the same as in 'df' variable,
// although the set of columns was reduced
nameAndSex = df.select(df.name, df.ticket)

// if we try to access the column, we get runtime error
nameAndSex.sex