spark.version
2.4.8
import java.sql.Timestamp.valueOf
import org.apache.spark.sql.functions.{date_add, date_sub}
// note: valueOf parses each string literal into a java.sql.Timestamp value
// Sample purchase events: two (device, purchase_time) rows per device.
val purchaseRows = Seq(
  ("notebook", valueOf("2019-01-29 12:00:00")),
  ("notebook", valueOf("2019-01-01 00:00:00")),
  ("small_phone", valueOf("2019-01-15 23:00:00")),
  ("small_phone", valueOf("2019-01-01 09:00:00"))
)
// Name the columns, then fix a deterministic display order.
val df = purchaseRows
  .toDF("device", "purchase_time")
  .sort($"device", $"purchase_time")
df = [device: string, purchase_time: timestamp]
[device: string, purchase_time: timestamp]
%%dataframe
df
device | purchase_time |
---|---|
notebook | 2019-01-01 00:00:00.0 |
notebook | 2019-01-29 12:00:00.0 |
small_phone | 2019-01-01 09:00:00.0 |
small_phone | 2019-01-15 23:00:00.0 |
%%dataframe
// date_add shifts by whole days and returns a DateType column, so the
// time-of-day is dropped (the plus_2_days output below shows dates only).
df.withColumn("plus_2_days", date_add($"purchase_time",2))
device | purchase_time | plus_2_days |
---|---|---|
notebook | 2019-01-01 00:00:00.0 | 2019-01-03 |
notebook | 2019-01-29 12:00:00.0 | 2019-01-31 |
small_phone | 2019-01-01 09:00:00.0 | 2019-01-03 |
small_phone | 2019-01-15 23:00:00.0 | 2019-01-17 |
import java.sql.Date.valueOf
import org.apache.spark.sql.functions.datediff
// Sample orders as (device, purchase_date, arrival_date) rows; one row
// deliberately has the arrival before the purchase to show signed diffs.
val orderRows = Seq(
  ("notebook", valueOf("2019-01-29"), valueOf("2019-02-10")),
  ("notebook", valueOf("2019-01-01"), valueOf("2019-01-15")),
  ("small_phone", valueOf("2019-01-15"), valueOf("2019-01-05")),
  ("small_phone", valueOf("2019-01-01"), valueOf("2019-01-20"))
)
// Name the columns, then fix a deterministic display order.
val df = orderRows
  .toDF("device", "purchase_date", "arrival_date")
  .sort($"device", $"purchase_date")
df = [device: string, purchase_date: date ... 1 more field]
[device: string, purchase_date: date ... 1 more field]
%%dataframe
df
device | purchase_date | arrival_date |
---|---|---|
notebook | 2019-01-01 | 2019-01-15 |
notebook | 2019-01-29 | 2019-02-10 |
small_phone | 2019-01-01 | 2019-01-20 |
small_phone | 2019-01-15 | 2019-01-05 |
%%dataframe
// datediff counts whole days as (arrival_date - purchase_date); the result
// is signed, e.g. -10 when the arrival date precedes the purchase date.
df.withColumn("days_to_arrive",datediff($"arrival_date", $"purchase_date"))
device | purchase_date | arrival_date | days_to_arrive |
---|---|---|---|
notebook | 2019-01-01 | 2019-01-15 | 14 |
notebook | 2019-01-29 | 2019-02-10 | 12 |
small_phone | 2019-01-01 | 2019-01-20 | 19 |
small_phone | 2019-01-15 | 2019-01-05 | -10 |
import java.sql.Timestamp.valueOf
import org.apache.spark.sql.functions.unix_timestamp
// Pairs of timestamps at known offsets, used to demonstrate elapsed-seconds
// arithmetic: foo = 1 hour, bar = 24 hours, baz = 7 days.
val intervalRows = Seq(
  ("foo", valueOf("2019-01-01 00:00:00"), valueOf("2019-01-01 01:00:00")),
  ("bar", valueOf("2019-01-01 00:00:00"), valueOf("2019-01-02 00:00:00")),
  ("baz", valueOf("2019-01-01 00:00:00"), valueOf("2019-01-07 00:00:00"))
)
// Name the columns, then fix a deterministic display order.
val df = intervalRows
  .toDF("col1", "purchase_time", "arrival_time")
  .sort($"col1", $"purchase_time")
df = [col1: string, purchase_time: timestamp ... 1 more field]
[col1: string, purchase_time: timestamp ... 1 more field]
%%dataframe
df
col1 | purchase_time | arrival_time |
---|---|---|
bar | 2019-01-01 00:00:00.0 | 2019-01-02 00:00:00.0 |
baz | 2019-01-01 00:00:00.0 | 2019-01-07 00:00:00.0 |
foo | 2019-01-01 00:00:00.0 | 2019-01-01 01:00:00.0 |
%%dataframe
// unix_timestamp converts each timestamp to epoch seconds (a long), so the
// subtraction yields the elapsed time in whole seconds; sub-second precision
// is lost at this step.
df.withColumn("diff_in_seconds_2", unix_timestamp($"arrival_time") - unix_timestamp($"purchase_time"))
col1 | purchase_time | arrival_time | diff_in_seconds_2 |
---|---|---|---|
bar | 2019-01-01 00:00:00.0 | 2019-01-02 00:00:00.0 | 86400 |
baz | 2019-01-01 00:00:00.0 | 2019-01-07 00:00:00.0 | 518400 |
foo | 2019-01-01 00:00:00.0 | 2019-01-01 01:00:00.0 | 3600 |
import java.sql.Timestamp.valueOf
import org.apache.spark.sql.functions.to_timestamp
// Timestamp pairs that differ at sub-second resolution (400 ms, 650 ms)
// plus one full-minute gap, to demonstrate millisecond-level differences.
val subSecondRows = Seq(
  ("foo", valueOf("2019-01-01 00:00:00.000"), valueOf("2019-01-01 00:00:00.400")),
  ("bar", valueOf("2019-01-01 00:00:00.000"), valueOf("2019-01-01 00:00:00.650")),
  ("baz", valueOf("2019-01-01 00:00:00.000"), valueOf("2019-01-01 00:01:00.000"))
)
val df = subSecondRows.toDF("col1", "time_before", "time_after")
df = [col1: string, time_before: timestamp ... 1 more field]
[col1: string, time_before: timestamp ... 1 more field]
%%dataframe
df
col1 | time_before | time_after |
---|---|---|
foo | 2019-01-01 00:00:00.0 | 2019-01-01 00:00:00.4 |
bar | 2019-01-01 00:00:00.0 | 2019-01-01 00:00:00.65 |
baz | 2019-01-01 00:00:00.0 | 2019-01-01 00:01:00.0 |
%%dataframe
%%scan
// Casting a timestamp to double yields epoch seconds INCLUDING the fractional
// part, so (after - before) keeps sub-second precision; the second withColumn
// overwrites diff_millis in place, scaling to milliseconds and truncating to
// a long.
(df
.withColumn("diff_millis", ($"time_after".cast("double") - $"time_before".cast("double")))
.withColumn("diff_millis", ($"diff_millis"*1000).cast("long")))
col1 | time_before | time_after | diff_millis |
---|---|---|---|
foo | 2019-01-01 00:00:00.0 | 2019-01-01 00:00:00.4 | 400 |
bar | 2019-01-01 00:00:00.0 | 2019-01-01 00:00:00.65 | 650 |
baz | 2019-01-01 00:00:00.0 | 2019-01-01 00:01:00.0 | 60000 |
import java.sql.Timestamp.valueOf
import org.apache.spark.sql.functions.expr
// Timestamp pairs for the fractional-hours demo: exactly 1 hour,
// 1 hour + 123 ms, and 25.5 hours + 123 ms apart.
val hourGapRows = Seq(
  ("foo", valueOf("2019-10-10 00:00:00.000"), valueOf("2019-10-10 01:00:00.000")),
  ("bar", valueOf("2019-10-10 00:00:00.000"), valueOf("2019-10-10 01:00:00.123")),
  ("baz", valueOf("2019-10-10 00:00:00.000"), valueOf("2019-10-11 01:30:00.123"))
)
val df = hourGapRows.toDF("col1", "purchase_time", "arrival_time")
df = [col1: string, purchase_time: timestamp ... 1 more field]
[col1: string, purchase_time: timestamp ... 1 more field]
%%dataframe
df
col1 | purchase_time | arrival_time |
---|---|---|
foo | 2019-10-10 00:00:00.0 | 2019-10-10 01:00:00.0 |
bar | 2019-10-10 00:00:00.0 | 2019-10-10 01:00:00.123 |
baz | 2019-10-10 00:00:00.0 | 2019-10-11 01:30:00.123 |
// Seconds in one hour (60 min * 60 s); used to convert a seconds
// difference into fractional hours. NOTE(review): Scala convention would
// name this secondsInHour, but renaming would break the later cell.
val seconds_in_hour = 60 * 60
seconds_in_hour = 3600
3600
%%dataframe
%%scan
// Elapsed time in fractional hours: cast both timestamps to double
// (epoch seconds with sub-second fraction preserved), subtract, then
// divide by 3600; select drops the intermediate seconds column.
(df
.withColumn("difference_in_seconds", $"arrival_time".cast("double") - $"purchase_time".cast("double"))
.withColumn("difference_in_hours", $"difference_in_seconds" / seconds_in_hour)
.select("col1", "purchase_time", "arrival_time", "difference_in_hours"))
col1 | purchase_time | arrival_time | difference_in_hours |
---|---|---|---|
foo | 2019-10-10 00:00:00.0 | 2019-10-10 01:00:00.0 | 1.0 |
bar | 2019-10-10 00:00:00.0 | 2019-10-10 01:00:00.123 | 1.0000341666407055 |
baz | 2019-10-10 00:00:00.0 | 2019-10-11 01:30:00.123 | 25.500034166640706 |
import java.sql.Timestamp.valueOf
import org.apache.spark.sql.functions.expr
// Single-timestamp rows used to demonstrate INTERVAL arithmetic.
val timestampRows = Seq(
  ("foo", valueOf("2019-10-10 00:45:00")),
  ("bar", valueOf("2019-10-10 12:34:56")),
  ("baz", valueOf("2019-10-10 23:59:00"))
)
val df = timestampRows.toDF("col1", "timestamp_col")
df = [col1: string, timestamp_col: timestamp]
[col1: string, timestamp_col: timestamp]
%%dataframe
df
col1 | timestamp_col |
---|---|
foo | 2019-10-10 00:45:00.0 |
bar | 2019-10-10 12:34:56.0 |
baz | 2019-10-10 23:59:00.0 |
%%dataframe
%%scan
// Subtract a SQL calendar interval from a timestamp column; here each
// timestamp is shifted back 24 hours. NOTE(review): for time-zone-aware
// data, "minus 24 hours" can differ from "same wall-clock time yesterday"
// across a DST change — confirm if that matters for this dataset.
df
.withColumn("timestamp_minus_24_hours", $"timestamp_col" - expr("INTERVAL 24 HOURS"))
col1 | timestamp_col | timestamp_minus_24_hours |
---|---|---|
foo | 2019-10-10 00:45:00.0 | 2019-10-09 00:45:00.0 |
bar | 2019-10-10 12:34:56.0 | 2019-10-09 12:34:56.0 |
baz | 2019-10-10 23:59:00.0 | 2019-10-09 23:59:00.0 |