%use dataframe, khttp

// to see autogenerated code, uncomment the line below:
//%trackExecution -generated

// download the raw Titanic data set as text
val response = khttp.get("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")

// fail fast on a bad download: otherwise an HTTP error page would be silently parsed as passenger data
check(response.statusCode in 200..299) {
    "Failed to download titanic.txt: HTTP ${response.statusCode}"
}

// clean the text before it is parsed:
//  - drop the quotes around the nickname "Molly" (presumably they break the delimited-text parser -- TODO confirm)
//  - replace 'row.names' with 'row' and 'home.dest' with 'home', the names this notebook uses for these columns
val cleanedText = response.text
    .replace("\"Molly\"", "Molly")
    .replace("row.names", "row")
    .replace("home.dest", "home")

// convert the cleaned text to a data frame; for the resulting variable the notebook generates
// a marker interface for the typed data frame wrapper and extension properties for its columns
val df = DataFrame.readDelim(cleanedText.byteInputStream())

// evaluate the variable to show the data frame
df

// get a typed column through its generated extension property
df.name

// select a single column -> the result is still a DataFrame
df.select{name}

// select several columns
df.select{columns(name, age, embarked)}

// another way to select columns without compile-time check: pass the column instances as arguments
df.select(df.name, df.age, df.sex)

// select the columns that satisfy a predicate (here: columns whose value class is String)
df.selectIf{valueClass == String::class}

// Row indexing: take a row of the data frame by its index
df[1]

// Column indexing: take the 'name' column first, then the value at index 0
df.name[0]

// Same result, the other way round: take row 0 first, then its 'name' value
df[0].name

// filter rows by predicate. Predicate receiver is of type TypedDataFrameRow<*> with generated extension properties
df.filter {sex == "female"}

// NOTE: the next line is an intentional demonstration of a compile-time failure
df.filter { age > 50 } // compilation error, because 'age' is a nullable property

// keep only the rows where 'age' is not null
val withAges = df.filterNotNull {age}
withAges

// now filtering by 'age' works
withAges.filter {age > 50}

// find the oldest woman who survived
withAges.filter {survived == 1 && sex == "female"}.maxBy{age}

// sort by a single column using the descending variant 'sortedByDesc'
withAges.sortedByDesc {age}

// sort by several columns (here: 'age' and 'name')
withAges.sortedBy {columns(age, name)}

// another way: pass the column instances as arguments
withAges.sortedBy(withAges.age, withAges.name)

// add a new column 'year' computed from 'age' and store the resulting data frame in a new variable
val withYear = withAges.add("year") {1912 - age}
withYear

// check the new column: it is accessible as a typed property of 'withYear'
withYear.year

// add several columns at once: every entry is a column name followed by the expression that computes its values
withAges.add {
    "year" {1912-age}
    "died" {survived == 0}
}

// '+' is overloaded for adding columns and accepts the same kind of block
withAges + {
    "year" {1912-age}
    "died" {survived == 0}
}

// another way to build a new column: arithmetic on an existing column
val birthYear = withAges.age * (-1) + 1912

// a column can be added to a data frame with the '+' operator (renamed to 'year' first)
withAges + birthYear.rename("year")

// an Iterable of columns can also be added with '+': here a renamed copy of every column
withAges + withAges.columns.map {it.rename(it.name + " duplicate")}

// remove a single column
df.remove{ticket}

// remove several columns
df.remove {columns(row, pclass, ticket, room, survived)}

// remove several columns, passing the column instances as arguments
df.remove(df.row, df.pclass)

// the '-' operator can also be used for removing columns; it can be chained to remove several of them
df - {row} - {pclass} - {room}

// group by a single column and count the groups' rows
df.groupBy{ embarked }.count()

// group by several columns
df.groupBy{ columns(sex, survived) }.count()

// another way: pass the column instances as arguments
df.groupBy(df.sex, df.survived).count()

// Various summarization operations on a grouped data frame:
// every entry is the name of a result column followed by the way its values are obtained
withAges.groupBy{ embarked }.summarize {
    
    "total count" { size } // lambda expressions are computed for every group. Type of receiver: TypedDataFrame<*>
    "survival rate" { count { survived == 1 }.toDouble() / size * 100 } // share of rows with survived == 1, in percent
    
    "average age" { age.mean() } // column operations are also supported
    "median age" { age.median() }
    
    // NOTE(review): '!!' assumes minBy/maxBy always find a row, i.e. that no group is empty -- confirm
    val youngest = find { minBy {age}!! } // 'find' builds data frame, collecting one row for every group
    "youngest" (youngest.name) // columns of the collected data frame are passed in round parentheses '()'
    "youngest age" (youngest.age)
    
     val oldest = find { maxBy {age}!! }
    "oldest" (oldest.name)
    "oldest age" (oldest.age)
}

// size of the data frame
df.size

// count the rows that satisfy a predicate
withAges.count {age > 50 }

// first 5 rows of the data frame sorted by 'age'
withAges.sortedBy{age}.take(5)

// last 5 rows of the data frame sorted by 'age'
withAges.sortedBy{age}.takeLast(5)

// 'rows' field is Iterable<TypedDataFrameRow<*>> so it can be used in any stdlib extensions for Iterable
df.rows.map {it.name}.take(5)

// Sample list of plain Kotlin objects: a small data class and three instances of it
data class Item(
    val first: Int,
    val second: Double
)

val itemsList = listOf(
    Item(first = 1, second = 2.0),
    Item(first = 2, second = 3.0),
    Item(first = 3, second = 4.0)
)

// List -> DataFrame by reflection (presumably one column per property of the element class -- TODO confirm)
itemsList.toDataFrame()

// List -> DataFrame by explicit mappings:
// every entry is a column name followed by the expression that computes its value from a list element
itemsList.toDataFrame {
    "a" {first}
    "b" {second}
    "c" {first*second}
}

// Convert the data frame to a list of data class items
// (the string argument is presumably the name of the generated data class -- TODO confirm)
val passengers = df.toList("Passenger")

// Check the type of the elements
passengers[0].javaClass

// Any list operation can be applied now; 'age' is nullable here, so a missing age is compared as 0.0
passengers.maxBy {it.age ?: .0}

// create a marker interface to write column-specific extensions for data frames;
// its properties name the fields ('name' and 'age') that such extensions can rely on
@DataFrameType
interface SimplePerson {
    val name: String
    val age: Double // declared non-nullable, so it can be compared directly in extensions
}

// extension for any data frame with fields 'name' and 'age':
// keeps the rows whose 'age' is greater than 'minAge'
fun TypedDataFrame<SimplePerson>.getOlderThan(minAge: Double) =
    this.filter { age > minAge }

// the extension works even for objects that were created before the marker interface was declared
withAges.getOlderThan(50.0)

// code for a marker interface can be auto-generated
// 'generateInterface' method returns generated code without execution
withAges.select{columns(name,age,home,sex)}.generateInterface("Person")

// 'extractInterface' method generates and executes code
withAges.select{columns(name,age,home,sex)}.extractInterface("Person")

// The 'Person' interface is available at this point, so an extension method can target it.
// It works for any data frame with these four columns (name, age, home, sex)
// and adds a "summary" column that combines them into one string per row
fun TypedDataFrame<Person>.addSummary() =
    this.add("summary") { "$sex $name $age y.o. from $home" }

// for example, it works for the 'withAges' data frame
withAges.addSummary()

// a data frame can also be converted to a list of objects implementing the 'Person' interface generated above
val persons = withAges.toList<Person>()

// check the element type
persons[0].javaClass

// show the whole list
persons

// When a data frame variable is mutable, a strongly typed wrapper for it
// is generated only once, after the first execution of the cell where it is declared
var nameAndSex = df.select(df.name, df.sex)
nameAndSex

// let's declare an immutable variable that contains all string columns
val strings = df.selectIf{valueClass == String::class}
strings

// 'nameAndSex' is assignable from 'strings',
// because 'strings' has all the columns that are required by the type of 'nameAndSex'
nameAndSex = strings

// note that the actual value of 'nameAndSex' is still a data frame of all string columns
nameAndSex

// but typed access to the fields works only for 'name' and 'sex'
nameAndSex.sex // this is OK

// NOTE: the next line is an intentional demonstration of a compile-time failure
nameAndSex.home // this fails with compilation error

nameAndSex["home"] // the requested column is still available by column name string

// now let's create a variable with two other columns
val nameAndHome = df.select(df.name, df.home)
nameAndHome

// NOTE: the next line is an intentional demonstration of a compile-time failure
nameAndSex = nameAndHome // this assignment doesn't work because of columns mismatch

// unfortunately, there is a way to get a runtime error here,
// because typed wrappers are generated only after execution of a cell

// so the following assignment will pass fine, because the return type of 'select' is the same as in the 'df' variable,
// although the set of columns was reduced
nameAndSex = df.select(df.name, df.home) 

// NOTE: intentional demonstration of a runtime failure --
// if we try to access the column, we get a runtime error
nameAndSex.sex