In [1]:

%use krangl

Create¶

In [2]:

// in memory
val df: DataFrame = dataFrameOf(
    "first_name", "last_name", "age", "weight")(
    "Max", "Doe", 23, 55,
    "Franz", "Smith", 23, 88,
    "Horst", "Keanes", 12, 82
)
df

Out[2]:

first_name	last_name	age	weight
Max	Doe	23	55
Franz	Smith	23	88
Horst	Keanes	12	82

Add columns¶

In [3]:

df.addColumn("salary_category") { 3 }

Out[3]:

first_name	last_name	age	weight	salary_category
Max	Doe	23	55	3
Franz	Smith	23	88	3
Horst	Keanes	12	82	3

In [4]:

df.addColumn("age_3y_later") { it["age"] + 3 }

Out[4]:

first_name	last_name	age	weight	age_3y_later
Max	Doe	23	55	26
Franz	Smith	23	88	26
Horst	Keanes	12	82	15

In [5]:

// Note: krangl dataframes are immutable so we need to (re)assign results to preserve changes.
val newDF = df.addColumn("full_name") { it["first_name"] + " " + it["last_name"] }
newDF

Out[5]:

first_name	last_name	age	weight	full_name
Max	Doe	23	55	Max Doe
Franz	Smith	23	88	Franz Smith
Horst	Keanes	12	82	Horst Keanes

In [6]:

// Also feel free to mix types here since krangl overloads  arithmetic operators like + for dataframe-columns
df.addColumn("user_id") { it["last_name"] + "_id" + rowNumber }

Out[6]:

first_name	last_name	age	weight	user_id
Max	Doe	23	55	Doe_id1
Franz	Smith	23	88	Smith_id2
Horst	Keanes	12	82	Keanes_id3

In [7]:

// Create new attributes with string operations like matching, splitting or extraction.
df.addColumn("with_anz") { it["first_name"].asStrings().map { it!!.contains("anz") } }

Out[7]:

first_name	last_name	age	weight	with_anz
Max	Doe	23	55	false
Franz	Smith	23	88	true
Horst	Keanes	12	82	false

In [8]:

// Note: krangl is using 'null' as missing value, and provides convenience methods to process non-NA bits
df.addColumn("first_name_initial") { it["first_name"].map<String>{ it.first() } }

Out[8]:

first_name	last_name	age	weight	first_name_initial
Max	Doe	23	55	M
Franz	Smith	23	88	F
Horst	Keanes	12	82	H

In [9]:

import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25

// or add multiple columns at once
df.addColumns(
    "age_plus3" to { it["age"] + 3 },
    "initials" to { it["first_name"].map<String> { it.first() } concat it["last_name"].map<String> { it.first() } }
)

Out[9]:

first_name	last_name	age	weight	age_plus3	initials
Max	Doe	23	55	26	M D
Franz	Smith	23	88	26	F S
Horst	Keanes	12	82	15	H K

Sort¶

In [10]:

// Sort your data with sortedBy
df.sortedBy("age")

Out[10]:

first_name	last_name	age	weight
Horst	Keanes	12	82
Max	Doe	23	55
Franz	Smith	23	88

In [11]:

// and add secondary sorting attributes as varargs
df.sortedBy("age", "weight")

Out[11]:

first_name	last_name	age	weight
Horst	Keanes	12	82
Max	Doe	23	55
Franz	Smith	23	88

In [12]:

df.sortedByDescending("age")

Out[12]:

first_name	last_name	age	weight
Franz	Smith	23	88
Max	Doe	23	55
Horst	Keanes	12	82

In [13]:

df.sortedBy { it["weight"].asInts() }

Out[13]:

first_name	last_name	age	weight
Max	Doe	23	55
Horst	Keanes	12	82
Franz	Smith	23	88

Select¶

In [14]:

// Subset columns with select
df.selectIf { it is IntCol } // functional style column selection

Out[14]:

age	weight
23	55
23	88
12	82

In [15]:

df.select("last_name", "weight")    // positive selection

Out[15]:

last_name	weight
Doe	55
Smith	88
Keanes	82

In [16]:

df.remove("weight", "age")  // negative selection

Out[16]:

first_name	last_name
Max	Doe
Franz	Smith
Horst	Keanes

In [17]:

df.select({ endsWith("name") })    // selector mini-language

Out[17]:

first_name	last_name
Max	Doe
Franz	Smith
Horst	Keanes

In [18]:

// Subset rows with vectorized filter
df.filter { it["age"] eq 23 }

Out[18]:

first_name	last_name	age	weight
Max	Doe	23	55
Franz	Smith	23	88

In [19]:

df.filter { it["weight"] gt 60 }

Out[19]:

first_name	last_name	age	weight
Franz	Smith	23	88
Horst	Keanes	12	82

In [20]:

df.filter({ it["last_name"].isMatching<String> { startsWith("Do")  }})

Out[20]:

first_name	last_name	age	weight
Max	Doe	23	55

In [21]:

// In case vectorized operations are not possible or available we can also filter tables by row
// which allows for scalar operators
df.filterByRow { it["age"] as Int > 20 }

Out[21]:

first_name	last_name	age	weight
Max	Doe	23	55
Franz	Smith	23	88

In [22]:

df.filterByRow { (it["age"] as Int).rem(10) == 2 }

Out[22]:

first_name	last_name	age	weight
Horst	Keanes	12	82

Summarize¶

In [23]:

// do simple cross tabulations
df.count("age", "last_name")

Out[23]:

age	last_name	n
23	Doe	1
23	Smith	1
12	Keanes	1

In [24]:

import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25

// ... or calculate single summary statistic
df.summarize("mean_age" to { it["age"].mean(true) })

Out[24]:

mean_age
19.333333333333332

In [25]:

import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25

// ... or multiple summary statistics
df.summarize(
    "min_age" to { it["age"].min() },
    "max_age" to { it["age"].max() }
)

Out[25]:

min_age	max_age
12.0	23.0

In [26]:

// for sake of r and python adoptability you can also use `=` here
df.summarize(
    "min_age" `=` { it["age"].min() },
    "max_age" `=` { it["age"].max() }
)

Out[26]:

min_age	max_age
12.0	23.0

Group¶

In [27]:

import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25

// Grouped operations
val groupedDf: DataFrame = df.groupBy("age") // or provide multiple grouping attributes with varargs

val sumDF = groupedDf.summarize(
    "mean_weight" to { it["weight"].mean(removeNA = true) },
    "num_persons" to { nrow }
)
sumDF

Out[27]:

age	mean_weight	num_persons
23	71.5	2
12	82.0	1

In [28]:

// Optionally ungroup the data
groupedDf.ungroup()

Out[28]:

first_name	last_name	age	weight
Max	Doe	23	55
Franz	Smith	23	88
Horst	Keanes	12	82

In [29]:

// Generate object bindings for kotlin.
data class Person(val age: Int, val mean_weight: Double, val num_persons: Int)
val records = sumDF.rowsAs<Person>()

records

Out[29]:

[Person(age=23, mean_weight=71.5, num_persons=2), Person(age=12, mean_weight=82.0, num_persons=1)]

In [30]:

// Now we can use the krangl result table in a strongly typed way
records.first().mean_weight

Out[30]:

71.5

In [31]:

// Vice versa we can also convert an existing set of objects into
records.asDataFrame()

Out[31]:

age	mean_weight	num_persons
23	71.5	2
12	82.0	1

In [32]:

// to populate a data-frame with selected properties only, we can do
records.deparseRecords { mapOf("age" to it.age, "weight" to it.mean_weight) }

Out[32]:

age	weight
23	71.5
12	82.0