%use krangl
// in memory
val df: DataFrame = dataFrameOf(
"first_name", "last_name", "age", "weight")(
"Max", "Doe", 23, 55,
"Franz", "Smith", 23, 88,
"Horst", "Keanes", 12, 82
)
df
first_name | last_name | age | weight |
---|---|---|---|
Max | Doe | 23 | 55 |
Franz | Smith | 23 | 88 |
Horst | Keanes | 12 | 82 |
df.addColumn("salary_category") { 3 }
first_name | last_name | age | weight | salary_category |
---|---|---|---|---|
Max | Doe | 23 | 55 | 3 |
Franz | Smith | 23 | 88 | 3 |
Horst | Keanes | 12 | 82 | 3 |
df.addColumn("age_3y_later") { it["age"] + 3 }
first_name | last_name | age | weight | age_3y_later |
---|---|---|---|---|
Max | Doe | 23 | 55 | 26 |
Franz | Smith | 23 | 88 | 26 |
Horst | Keanes | 12 | 82 | 15 |
// Note: krangl dataframes are immutable so we need to (re)assign results to preserve changes.
val newDF = df.addColumn("full_name") { it["first_name"] + " " + it["last_name"] }
newDF
first_name | last_name | age | weight | full_name |
---|---|---|---|---|
Max | Doe | 23 | 55 | Max Doe |
Franz | Smith | 23 | 88 | Franz Smith |
Horst | Keanes | 12 | 82 | Horst Keanes |
// Also feel free to mix types here since krangl overloads arithmetic operators like + for dataframe-columns
df.addColumn("user_id") { it["last_name"] + "_id" + rowNumber }
first_name | last_name | age | weight | user_id |
---|---|---|---|---|
Max | Doe | 23 | 55 | Doe_id1 |
Franz | Smith | 23 | 88 | Smith_id2 |
Horst | Keanes | 12 | 82 | Keanes_id3 |
// Create new attributes with string operations like matching, splitting or extraction.
df.addColumn("with_anz") { it["first_name"].asStrings().map { it!!.contains("anz") } }
first_name | last_name | age | weight | with_anz |
---|---|---|---|---|
Max | Doe | 23 | 55 | false |
Franz | Smith | 23 | 88 | true |
Horst | Keanes | 12 | 82 | false |
// Note: krangl is using 'null' as missing value, and provides convenience methods to process non-NA bits
df.addColumn("first_name_initial") { it["first_name"].map<String>{ it.first() } }
first_name | last_name | age | weight | first_name_initial |
---|---|---|---|---|
Max | Doe | 23 | 55 | M |
Franz | Smith | 23 | 88 | F |
Horst | Keanes | 12 | 82 | H |
import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25
// or add multiple columns at once
df.addColumns(
"age_plus3" to { it["age"] + 3 },
"initials" to { it["first_name"].map<String> { it.first() } concat it["last_name"].map<String> { it.first() } }
)
first_name | last_name | age | weight | age_plus3 | initials |
---|---|---|---|---|---|
Max | Doe | 23 | 55 | 26 | M D |
Franz | Smith | 23 | 88 | 26 | F S |
Horst | Keanes | 12 | 82 | 15 | H K |
// Sort your data with sortedBy
df.sortedBy("age")
first_name | last_name | age | weight |
---|---|---|---|
Horst | Keanes | 12 | 82 |
Max | Doe | 23 | 55 |
Franz | Smith | 23 | 88 |
// and add secondary sorting attributes as varargs
df.sortedBy("age", "weight")
first_name | last_name | age | weight |
---|---|---|---|
Horst | Keanes | 12 | 82 |
Max | Doe | 23 | 55 |
Franz | Smith | 23 | 88 |
df.sortedByDescending("age")
first_name | last_name | age | weight |
---|---|---|---|
Franz | Smith | 23 | 88 |
Max | Doe | 23 | 55 |
Horst | Keanes | 12 | 82 |
df.sortedBy { it["weight"].asInts() }
first_name | last_name | age | weight |
---|---|---|---|
Max | Doe | 23 | 55 |
Horst | Keanes | 12 | 82 |
Franz | Smith | 23 | 88 |
// Subset columns with select
df.selectIf { it is IntCol } // functional style column selection
age | weight |
---|---|
23 | 55 |
23 | 88 |
12 | 82 |
df.select("last_name", "weight") // positive selection
last_name | weight |
---|---|
Doe | 55 |
Smith | 88 |
Keanes | 82 |
df.remove("weight", "age") // negative selection
first_name | last_name |
---|---|
Max | Doe |
Franz | Smith |
Horst | Keanes |
df.select({ endsWith("name") }) // selector mini-language
first_name | last_name |
---|---|
Max | Doe |
Franz | Smith |
Horst | Keanes |
// Subset rows with vectorized filter
df.filter { it["age"] eq 23 }
first_name | last_name | age | weight |
---|---|---|---|
Max | Doe | 23 | 55 |
Franz | Smith | 23 | 88 |
df.filter { it["weight"] gt 60 }
first_name | last_name | age | weight |
---|---|---|---|
Franz | Smith | 23 | 88 |
Horst | Keanes | 12 | 82 |
df.filter({ it["last_name"].isMatching<String> { startsWith("Do") }})
first_name | last_name | age | weight |
---|---|---|---|
Max | Doe | 23 | 55 |
// In case vectorized operations are not possible or available we can also filter tables by row
// which allows for scalar operators
df.filterByRow { it["age"] as Int > 20 }
first_name | last_name | age | weight |
---|---|---|---|
Max | Doe | 23 | 55 |
Franz | Smith | 23 | 88 |
df.filterByRow { (it["age"] as Int).rem(10) == 2 }
first_name | last_name | age | weight |
---|---|---|---|
Horst | Keanes | 12 | 82 |
// do simple cross tabulations
df.count("age", "last_name")
age | last_name | n |
---|---|---|
23 | Doe | 1 |
23 | Smith | 1 |
12 | Keanes | 1 |
import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25
// ... or calculate single summary statistic
df.summarize("mean_age" to { it["age"].mean(true) })
mean_age |
---|
19.333333333333332 |
import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25
// ... or multiple summary statistics
df.summarize(
"min_age" to { it["age"].min() },
"max_age" to { it["age"].max() }
)
min_age | max_age |
---|---|
12.0 | 23.0 |
// for sake of r and python adoptability you can also use `=` here
df.summarize(
"min_age" `=` { it["age"].min() },
"max_age" `=` { it["age"].max() }
)
min_age | max_age |
---|---|
12.0 | 23.0 |
import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25
// Grouped operations
val groupedDf: DataFrame = df.groupBy("age") // or provide multiple grouping attributes with varargs
val sumDF = groupedDf.summarize(
"mean_weight" to { it["weight"].mean(removeNA = true) },
"num_persons" to { nrow }
)
sumDF
age | mean_weight | num_persons |
---|---|---|
23 | 71.5 | 2 |
12 | 82.0 | 1 |
// Optionally ungroup the data
groupedDf.ungroup()
first_name | last_name | age | weight |
---|---|---|---|
Max | Doe | 23 | 55 |
Franz | Smith | 23 | 88 |
Horst | Keanes | 12 | 82 |
// Generate object bindings for kotlin.
data class Person(val age: Int, val mean_weight: Double, val num_persons: Int)
val records = sumDF.rowsAs<Person>()
records
[Person(age=23, mean_weight=71.5, num_persons=2), Person(age=12, mean_weight=82.0, num_persons=1)]
// Now we can use the krangl result table in a strongly typed way
records.first().mean_weight
71.5
// Vice versa we can also convert an existing set of objects into
records.asDataFrame()
age | mean_weight | num_persons |
---|---|---|
23 | 71.5 | 2 |
12 | 82.0 | 1 |
// to populate a data-frame with selected properties only, we can do
records.deparseRecords { mapOf("age" to it.age, "weight" to it.mean_weight) }
age | weight |
---|---|
23 | 71.5 |
12 | 82.0 |