%use dataframe, khttp

// Notebook walkthrough, part 1: download the Titanic passenger table, load it into a
// data frame, then demonstrate column selection, row/column indexing and row filtering.
// Each statement is kept on its own line so that '//' comments cannot swallow the code
// that follows them.

// to see autogenerated code, uncomment the line below:
//%trackExecution -generated

val response = khttp.get("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")

// Normalise the raw text before parsing: drop the embedded quotes around "Molly" and rename
// the dotted headers 'row.names' / 'home.dest' to 'row' / 'home', which is how the columns
// are referenced further down (df.row, home).
val cleanedText = response.text
    .replace("\"Molly\"", "Molly")
    .replace("row.names", "row")
    .replace("home.dest", "home")

// convert data to dataframe, generate marker interface for typed data frame wrapper
// and extension properties for it
val df = DataFrame.readDelim(cleanedText.byteInputStream())
df

// get typed column as extension property
df.name

// select single column -> returns DataFrame
df.select { name }

// select several columns
df.select { columns(name, age, embarked) }

// another way to select columns without compile-time check
df.select(df.name, df.age, df.sex)

// select columns filtered by predicate
df.selectIf { valueClass == String::class }

// Row indexing
df[1]

// Column indexing
df.name[0]

// Same result
df[0].name

// filter rows by predicate. Predicate receiver is of type TypedDataFrameRow<*>
// with generated extension properties
df.filter { sex == "female" }

// Deliberate failure kept for demonstration purposes:
df.filter { age > 50 } // compilation error, because 'age' is a nullable property

// filter rows where 'age' is not null.
val withAges = df.filterNotNull { age }
withAges

// Notebook walkthrough, part 2: filtering on the non-null 'age' column, sorting, adding and
// removing columns, grouping/summarising, list conversion, marker interfaces and the
// behaviour of typed wrappers for mutable variables.
// NOTE(review): the generic arguments in TypedDataFrame<SimplePerson>, TypedDataFrame<Person>,
// toList<Person>() and Iterable<TypedDataFrameRow<T>> were reconstructed from the surrounding
// comments (they appear to have been stripped from the text) -- confirm against the library.

// now filtration works
withAges.filter { age > 50 }

// find the oldest survived woman
withAges.filter { survived == 1 && sex == "female" }.maxBy { age }

// sort by single column
withAges.sortedByDesc { age }

// sort by several columns
withAges.sortedBy { columns(age, name) }

// another way
withAges.sortedBy(withAges.age, withAges.name)

// add new column and store result in a new field
val withYear = withAges.add("year") { 1912 - age }
withYear

// check new column
withYear.year

// add several columns
withAges.add {
    "year" { 1912 - age }
    "died" { survived == 0 }
}

// plus is overloaded for adding columns
withAges + {
    "year" { 1912 - age }
    "died" { survived == 0 }
}

// another way to build new column via column arithmetics
val birthYear = withAges.age * (-1) + 1912

// new column can be added to dataframe with '+' operator
withAges + birthYear.rename("year")

// Iterable of columns can also be added with '+'
withAges + withAges.columns.map { it.rename(it.name + " duplicate") }

// remove single column
df.remove { ticket }

// remove several columns
df.remove { columns(row, pclass, ticket, room, survived) }

// remove several columns by column instances
df.remove(df.row, df.pclass)

// '-' operator can also be used for removing columns
df - { row } - { pclass } - { room }

// group by single column
df.groupBy { embarked }.count()

// group by several columns
df.groupBy { columns(sex, survived) }.count()

// another way
df.groupBy(df.sex, df.survived).count()

// Various summarization operations on grouped data frame
withAges.groupBy { embarked }.summarize {
    // lambda expressions are computed for every group. Type of receiver: TypedDataFrame<*>
    "total count" { size }
    "survival rate" { count { survived == 1 }.toDouble() / size * 100 }
    // column operations are also supported
    "average age" { age.mean() }
    "median age" { age.median() }
    // 'find' builds data frame, collecting one row for every group
    val youngest = find { minBy { age }!! }
    // columns of collected data frame are passed in round parentheses '()'
    "youngest" (youngest.name)
    "youngest age" (youngest.age)
    val oldest = find { maxBy { age }!! }
    "oldest" (oldest.name)
    "oldest age" (oldest.age)
}

df.size

withAges.count { age > 50 }

withAges.sortedBy { age }.take(5)

withAges.sortedBy { age }.takeLast(5)

// 'rows' field is Iterable<TypedDataFrameRow<T>> so it can be used in any stdlib extensions for Iterable
df.rows.map { it.name }.take(5)

// Sample List
data class Item(val first: Int, val second: Double)

val itemsList = listOf(Item(1, 2.0), Item(2, 3.0), Item(3, 4.0))

// List -> DataFrame by reflection
itemsList.toDataFrame()

// List -> DataFrame by mappings
itemsList.toDataFrame {
    "a" { first }
    "b" { second }
    "c" { first * second }
}

// Convert data frame to a list of data class items
val passengers = df.toList("Passenger")

// Check type of the element
passengers[0].javaClass

// Do any list operations
passengers.maxBy { it.age ?: 0.0 }

// create marker interface to write column-specific extensions for data frame
@DataFrameType
interface SimplePerson {
    val name: String
    val age: Double
}

// create extension for any data frame with fields 'name' and 'age'
fun TypedDataFrame<SimplePerson>.getOlderThan(minAge: Double) = filter { age > minAge }

// extension works even for objects that were created before marker interface declaration
withAges.getOlderThan(50.0)

// code for marker interface can be auto-generated
// 'generateInterface' method returns generated code without execution
withAges.select { columns(name, age, home, sex) }.generateInterface("Person")

// 'extractInterface' method generates and executes code
withAges.select { columns(name, age, home, sex) }.extractInterface("Person")

// Now interface 'Person' is available, so we can write an extension method,
// that will work for any data frame with these four columns
fun TypedDataFrame<Person>.addSummary() = add("summary") { "$sex $name $age y.o. from $home" }

// for example, it works for 'withAges' data frame
withAges.addSummary()

// data frame can also be converted to a list of objects implementing 'Person' interface
// that was generated above
val persons = withAges.toList<Person>()

// check element type
persons[0].javaClass
persons

// When data frame variable is mutable, a strongly typed wrapper for it
// is generated only once after the first execution of a cell where it is declared
var nameAndSex = df.select(df.name, df.sex)
nameAndSex

// let's declare immutable variable, that contains all string columns
val strings = df.selectIf { valueClass == String::class }
strings

// 'nameAndSex' is assignable from 'strings',
// because 'strings' has all the columns that are required by type of 'nameAndSex'
nameAndSex = strings

// note, that the actual value of 'nameAndSex' is still a data frame of all string columns
nameAndSex

// but typed access to the fields works only for 'name' and 'sex'
nameAndSex.sex // this is OK

// Deliberate failure kept for demonstration purposes:
nameAndSex.home // this fails with compilation error

nameAndSex["home"] // the requested column is still available by column name string

// now let's create a variable with two other columns
val nameAndHome = df.select(df.name, df.home)
nameAndHome

// Deliberate failure kept for demonstration purposes:
nameAndSex = nameAndHome // this assignment doesn't work because of columns mismatch

// unfortunately, there is a way to get a runtime error here,
// because typed wrappers are generated only after execution of a cell
// so the following assignment will pass fine, because return type of 'select'
// is the same as in 'df' variable, although the set of columns was reduced
nameAndSex = df.select(df.name, df.home)

// if we try to access the column, we get runtime error
nameAndSex.sex