%use dataframe(0.14.0-RC1)
DataFrame.castTo(df)
DataFrame.readExcel new StringColumns parameter
DataFrame.generateDataClasses()
DataRowSchema API
Iterable
castTo
helps to reuse code in notebooks
Let's say you have several dataframes with the same schema. You will often need to run the same code on each of them, for example to draw a plot for different periods of time, or to process and print information from different sources.
// A one-row sample parsed from inline CSV text. It is used below as the schema template
// (column names and inferred column types) that other dataframes are cast to with castTo.
val sample = DataFrame.readDelimStr("""full_name,html_url,stargazers_count,topics,watchers
JetBrains/JPS,https://github.com/JetBrains/JPS,23,"[build-system]",23
""")
// Show the parsed sample as the cell output.
sample
full_name | html_url | stargazers_count | topics | watchers |
---|---|---|---|---|
JetBrains/JPS | https://github.com/JetBrains/JPS | 23 | [build-system] | 23 |
// Casts the receiver to the schema of `sample`, sorts it by stargazers_count in descending order
// and keeps the first 10 rows.
// The function throws an exception if the schema of `this` doesn't match the schema of `sample`.
fun AnyFrame.top10() = castTo(sample).sortByDesc { stargazers_count }.take(10)
// Load the JetBrains repositories CSV from a URL pinned to a specific commit of Kotlin/dataframe.
val repos = DataFrame.read("https://raw.githubusercontent.com/Kotlin/dataframe/f72655be9a6235eefa183de22f4e1c94ac539f02/data/jetbrains_repositories.csv")
// Reuse the helper defined against `sample` on the real data.
repos.top10()
full_name | html_url | stargazers_count | topics | watchers |
---|---|---|---|---|
JetBrains/kotlin | https://github.com/JetBrains/kotlin | 39402 | [compiler, gradle-plugin, intellij-pl... | 39402 |
JetBrains/intellij-community | https://github.com/JetBrains/intellij... | 12926 | [code-editor, ide, intellij, intellij... | 12926 |
JetBrains/kotlin-native | https://github.com/JetBrains/kotlin-n... | 7101 | [c, compiler, kotlin, llvm, objective-c] | 7101 |
JetBrains/compose-jb | https://github.com/JetBrains/compose-jb | 6805 | [android, awt, compose, declarative-u... | 6805 |
JetBrains/ideavim | https://github.com/JetBrains/ideavim | 6120 | [ideavim, intellij, intellij-platform... | 6120 |
JetBrains/JetBrainsMono | https://github.com/JetBrains/JetBrain... | 6059 | [coding-font, font, ligatures, monosp... | 6059 |
JetBrains/Exposed | https://github.com/JetBrains/Exposed | 5688 | [dao, kotlin, orm, sql] | 5688 |
JetBrains/ring-ui | https://github.com/JetBrains/ring-ui | 2836 | [components, jetbrains-ui, react] | 2836 |
JetBrains/kotlinconf-app | https://github.com/JetBrains/kotlinco... | 2628 | [] | 2628 |
JetBrains/create-react-kotlin-app | https://github.com/JetBrains/create-r... | 2424 | [create-react-app, jetbrains-ui, kotl... | 2424 |
// If the dataframe differs from `sample` only in columns the function doesn't use,
// pass verify = false so that castTo does not reject it.
fun AnyFrame.top10_noVerify() = castTo(sample, verify = false).sortByDesc { stargazers_count }.take(10)
// Here `watchers` is replaced with nulls, so it no longer matches the sample,
// but the call still succeeds because top10_noVerify never reads that column.
repos.update { watchers }.withNull().top10_noVerify()
full_name | html_url | stargazers_count | topics | watchers |
---|---|---|---|---|
JetBrains/kotlin | https://github.com/JetBrains/kotlin | 39402 | [compiler, gradle-plugin, intellij-pl... | null |
JetBrains/intellij-community | https://github.com/JetBrains/intellij... | 12926 | [code-editor, ide, intellij, intellij... | null |
JetBrains/kotlin-native | https://github.com/JetBrains/kotlin-n... | 7101 | [c, compiler, kotlin, llvm, objective-c] | null |
JetBrains/compose-jb | https://github.com/JetBrains/compose-jb | 6805 | [android, awt, compose, declarative-u... | null |
JetBrains/ideavim | https://github.com/JetBrains/ideavim | 6120 | [ideavim, intellij, intellij-platform... | null |
JetBrains/JetBrainsMono | https://github.com/JetBrains/JetBrain... | 6059 | [coding-font, font, ligatures, monosp... | null |
JetBrains/Exposed | https://github.com/JetBrains/Exposed | 5688 | [dao, kotlin, orm, sql] | null |
JetBrains/ring-ui | https://github.com/JetBrains/ring-ui | 2836 | [components, jetbrains-ui, react] | null |
JetBrains/kotlinconf-app | https://github.com/JetBrains/kotlinco... | 2628 | [] | null |
JetBrains/create-react-kotlin-app | https://github.com/JetBrains/create-r... | 2424 | [create-react-app, jetbrains-ui, kotl... | null |
// If the types of columns that the function actually uses don't match and verify = false,
// the mismatch is not reported by castTo; expect NullPointerException or ClassCastException
// at the point where the column values are accessed.
fun AnyFrame.filter_noVerify() = castTo(sample, verify = false).filter { watchers > 10 }.take(10)
// An exception is expected here: `watchers` contains nulls, but the filter reads it as a non-null Int.
repos.update { watchers }.withNull().filter_noVerify()
java.lang.NullPointerException: null cannot be cast to non-null type kotlin.Int at Line_7_jupyter._DataFrameType_watchers(Line_7.jupyter.kts:19) at Cell In[0], line 19 at Line_16_jupyter$filter_noVerify$1.invoke(Line_16.jupyter.kts:2) at Cell In[6], line 2 at Line_16_jupyter$filter_noVerify$1.invoke(Line_16.jupyter.kts:2) at Cell In[6], line 2 at org.jetbrains.kotlinx.dataframe.api.FilterKt.filter(filter.kt:39) at Line_16_jupyter.filter_noVerify(Line_16.jupyter.kts:2) at Cell In[6], line 2 at Line_16_jupyter.<init>(Line_16.jupyter.kts:5) at Cell In[6], line 5 at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:77) at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:499) at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:480) at kotlin.script.experimental.jvm.BasicJvmScriptEvaluator.evalWithConfigAndOtherScriptsResults(BasicJvmScriptEvaluator.kt:122) at kotlin.script.experimental.jvm.BasicJvmScriptEvaluator.invoke$suspendImpl(BasicJvmScriptEvaluator.kt:48) at kotlin.script.experimental.jvm.BasicJvmScriptEvaluator.invoke(BasicJvmScriptEvaluator.kt) at kotlin.script.experimental.jvm.BasicJvmReplEvaluator.eval(BasicJvmReplEvaluator.kt:49) at org.jetbrains.kotlinx.jupyter.repl.impl.InternalEvaluatorImpl$eval$resultWithDiagnostics$1.invokeSuspend(InternalEvaluatorImpl.kt:133) at kotlin.coroutines.jvm.internal.BaseContinuationImpl.resumeWith(ContinuationImpl.kt:33) at kotlinx.coroutines.DispatchedTask.run(DispatchedTask.kt:104) at kotlinx.coroutines.EventLoopImplBase.processNextEvent(EventLoop.common.kt:277) at kotlinx.coroutines.BlockingCoroutine.joinBlocking(Builders.kt:95) at kotlinx.coroutines.BuildersKt__BuildersKt.runBlocking(Builders.kt:69) at 
kotlinx.coroutines.BuildersKt.runBlocking(Unknown Source) at kotlinx.coroutines.BuildersKt__BuildersKt.runBlocking$default(Builders.kt:48) at kotlinx.coroutines.BuildersKt.runBlocking$default(Unknown Source) at org.jetbrains.kotlinx.jupyter.repl.impl.InternalEvaluatorImpl.eval(InternalEvaluatorImpl.kt:133) at org.jetbrains.kotlinx.jupyter.repl.impl.CellExecutorImpl$execute$1$result$1.invoke(CellExecutorImpl.kt:80) at org.jetbrains.kotlinx.jupyter.repl.impl.CellExecutorImpl$execute$1$result$1.invoke(CellExecutorImpl.kt:78) at org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl.withHost(ReplForJupyterImpl.kt:742) at org.jetbrains.kotlinx.jupyter.repl.impl.CellExecutorImpl.execute-L4Nmkdk(CellExecutorImpl.kt:78) at org.jetbrains.kotlinx.jupyter.repl.execution.CellExecutor$DefaultImpls.execute-L4Nmkdk$default(CellExecutor.kt:13) at org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl.evaluateUserCode-wNURfNM(ReplForJupyterImpl.kt:565) at org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl.evalExImpl(ReplForJupyterImpl.kt:423) at org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl.access$evalExImpl(ReplForJupyterImpl.kt:139) at org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl$evalEx$1.invoke(ReplForJupyterImpl.kt:416) at org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl$evalEx$1.invoke(ReplForJupyterImpl.kt:415) at org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl.withEvalContext(ReplForJupyterImpl.kt:396) at org.jetbrains.kotlinx.jupyter.repl.impl.ReplForJupyterImpl.evalEx(ReplForJupyterImpl.kt:415) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor$processExecuteRequest$1$response$1$1.invoke(IdeCompatibleMessageRequestProcessor.kt:170) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor$processExecuteRequest$1$response$1$1.invoke(IdeCompatibleMessageRequestProcessor.kt:169) at 
org.jetbrains.kotlinx.jupyter.streams.BlockingSubstitutionEngine.withDataSubstitution(SubstitutionEngine.kt:70) at org.jetbrains.kotlinx.jupyter.streams.StreamSubstitutionManager.withSubstitutedStreams(StreamSubstitutionManager.kt:118) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor.withForkedIn(IdeCompatibleMessageRequestProcessor.kt:342) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor.access$withForkedIn(IdeCompatibleMessageRequestProcessor.kt:66) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor$evalWithIO$1$1.invoke(IdeCompatibleMessageRequestProcessor.kt:356) at org.jetbrains.kotlinx.jupyter.streams.BlockingSubstitutionEngine.withDataSubstitution(SubstitutionEngine.kt:70) at org.jetbrains.kotlinx.jupyter.streams.StreamSubstitutionManager.withSubstitutedStreams(StreamSubstitutionManager.kt:118) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor.withForkedErr(IdeCompatibleMessageRequestProcessor.kt:331) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor.access$withForkedErr(IdeCompatibleMessageRequestProcessor.kt:66) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor$evalWithIO$1.invoke(IdeCompatibleMessageRequestProcessor.kt:355) at org.jetbrains.kotlinx.jupyter.streams.BlockingSubstitutionEngine.withDataSubstitution(SubstitutionEngine.kt:70) at org.jetbrains.kotlinx.jupyter.streams.StreamSubstitutionManager.withSubstitutedStreams(StreamSubstitutionManager.kt:118) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor.withForkedOut(IdeCompatibleMessageRequestProcessor.kt:323) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor.evalWithIO(IdeCompatibleMessageRequestProcessor.kt:354) at 
org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor$processExecuteRequest$1$response$1.invoke(IdeCompatibleMessageRequestProcessor.kt:169) at org.jetbrains.kotlinx.jupyter.messaging.IdeCompatibleMessageRequestProcessor$processExecuteRequest$1$response$1.invoke(IdeCompatibleMessageRequestProcessor.kt:168) at org.jetbrains.kotlinx.jupyter.execution.JupyterExecutorImpl$Task.execute(JupyterExecutorImpl.kt:41) at org.jetbrains.kotlinx.jupyter.execution.JupyterExecutorImpl$executorThread$1.invoke(JupyterExecutorImpl.kt:81) at org.jetbrains.kotlinx.jupyter.execution.JupyterExecutorImpl$executorThread$1.invoke(JupyterExecutorImpl.kt:79) at kotlin.concurrent.ThreadsKt$thread$thread$1.run(Thread.kt:30) java.lang.NullPointerException: null cannot be cast to non-null type kotlin.Int at Cell In[0], line 19
// Group repositories by a computed key column named "thousands" (stargazers_count / 1000)
// and convert the grouping into a dataframe that keeps each group's rows in a nested `group` column.
val repos1 = repos.groupBy { expr("thousands") { stargazers_count / 1000 } }.toDataFrame()
// Show the resulting schema, including the columns nested under `group`.
repos1.schema()
thousands: Int group: * full_name: String html_url: URL stargazers_count: Int topics: String watchers: Int
// Generate @DataSchema data class declarations that mirror the schema of repos1.
// markerName is used as the name of the top-level class; the nested `group` schema gets its own class.
repos1.generateDataClasses(markerName = "RepositoriesData")
@DataSchema data class RepositoriesData1( @ColumnName("full_name") val fullName: String, @ColumnName("html_url") val htmlUrl: java.net.URL, @ColumnName("stargazers_count") val stargazersCount: Int, val topics: String, val watchers: Int ) @DataSchema data class RepositoriesData( val group: List<RepositoriesData1>, val thousands: Int )
// There's no need to copy-paste the generated code into the notebook. It can be executed directly:
// NOTE(review): `.value` presumably exposes the generated declarations as source text — confirm.
EXECUTE(repos1.generateDataClasses(markerName = "RepositoriesData").value)
A dataframe can then be converted to these data classes
// Materialize the rows of repos1 as instances of the generated RepositoriesData class,
// then print the `thousands` key of every instance, one per line.
val repositoriesDataList = repos1.toListOf<RepositoriesData>()
for (repositoriesData in repositoriesDataList) {
    println(repositoriesData.thousands)
}
0 6 1 12 39 5 7 2
A list of data class instances can be converted to DataFrame too
// Build one repository entry by hand.
// NOTE(review): despite its name, `dataframe` holds a single RepositoriesData1 instance, not a DataFrame.
val dataframe = RepositoriesData1(
"Kotlin/dataframe",
URL("https://github.com/Kotlin/dataframe"),
stargazersCount = 773,
topics = "[kotlin, data-science, data-analysis, dataframe]",
watchers = 16
)
// Wrap the entry into a one-element list of RepositoriesData and convert that list to a DataFrame.
val repos2 = listOf(RepositoriesData(listOf(dataframe), 0)).toDataFrame()
// Show the resulting dataframe as the cell output.
repos2
group | thousands | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|
DataFrame [1 x 5]
| 0 |
// Check that the dataframe built from data class instances has the same schema as repos1.
repos2.schema() == repos1.schema()
true
We introduce a marker interface, DataRowSchema, that data schema classes can implement. It enables two new APIs: dataFrameOf and append. With the compiler plugin, this interface will be automatically added to any declaration annotated with @DataSchema. For now, let's see what it does:
/**
 * Schema of a single repository entry.
 *
 * Properties whose Kotlin names differ from the column names are mapped
 * to the original snake_case columns with [ColumnName].
 */
@DataSchema
data class RepositoriesData1(
@ColumnName("full_name")
val fullName: String,
@ColumnName("html_url")
val htmlUrl: java.net.URL,
@ColumnName("stargazers_count")
val stargazersCount: Int,
val topics: String,
val watchers: Int
)
/**
 * Schema of one grouped row: a `thousands` key together with the repository entries of that group.
 *
 * Unlike [RepositoriesData1], this class also implements the DataRowSchema marker interface.
 */
@DataSchema
data class RepositoriesData(
val group: List<RepositoriesData1>,
val thousands: Int
) : DataRowSchema // <<-- New marker interface
// A single hand-built repository entry used as the nested group content below.
val element = RepositoriesData1(
"Kotlin/dataframe",
URL("https://github.com/Kotlin/dataframe"),
stargazersCount = 773,
topics = "[kotlin, data-science, data-analysis, dataframe]",
watchers = 16
)
// Both calls take RepositoriesData instances directly, which implement DataRowSchema.
val df = dataFrameOf(RepositoriesData(listOf(element), 0)) // shorter compared to listOf().toDataFrame() above
// The result of append is the cell output; `df` itself is not reassigned.
df.append(RepositoriesData(listOf(element), 0)) // new typed flavor of append
group | thousands | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|
DataFrame [1 x 5]
| 0 | ||||||||||
DataFrame [1 x 5]
| 0 |
// By default readExcel relies on cell value types,
// so a column that mixes numeric and text cells is read with mixed value types.
val excel1 = DataFrame.readExcel("mixed_column.xlsx")
excel1
2024-09-17T13:59:21.670459871Z Execution of code '// By default readEx...' ERROR Log4j2 could not find a logging implementation. Please add log4j-core to the classpath. Using SimpleLogger to log to the console...
col1 |
---|
100.000000 |
A100 |
B100 |
C100 |
// col1 holds both Double and String values, so its column type is reported as Comparable<*>
excel1.schema()
col1: Comparable<*>
// Use the new stringColumns parameter to read the selected column as String regardless of cell types.
// NOTE(review): "A" presumably refers to the Excel column letter — confirm against the readExcel docs.
val excel2 = DataFrame.readExcel("mixed_column.xlsx", stringColumns = StringColumns("A"))
excel2
col1 |
---|
100 |
A100 |
B100 |
C100 |
// With stringColumns applied, col1 is now reported as String
excel2.schema()
col1: String
This is an easy way to create a DataFrame when you have a list of Files, URLs, or a structure you want to extract data from.
In a notebook, it can be convenient to start from a column of these values, to see the number of rows and their toString representation in a table, and then iteratively add columns with the parts of the data you're interested in.
These could be a File's content, a specific section of an HTML document, some metadata, etc.
import kotlin.io.path.Path
import kotlin.io.path.listDirectoryEntries
// List the entries of the current directory matching "*.csv" and convert that list
// into a dataframe with a single column named "file".
val csvs = Path(".").listDirectoryEntries("*.csv").toDataFrame(columnName = "file")
csvs
file |
---|
./data.csv |
./data1.csv |
import kotlin.io.path.fileSize
// Add two columns computed from each `file` value:
// "size" is the file size and "data" is the file content read as a nested dataframe.
csvs.add {
"size" from { file.fileSize() }
"data" from { file.toFile().readDataFrame() }
}
file | size | data | ||||||
---|---|---|---|---|---|---|---|---|
./data.csv | 12 | DataFrame [1 x 3]
| ||||||
./data1.csv | 27 | DataFrame [1 x 3]
|