spark.version
2.4.8
import java.sql.Timestamp.valueOf
import org.apache.spark.sql.functions.{date_add, date_sub}
// note: valueOf parses each string literal into a java.sql.Timestamp value
// Sample purchase events: two (device, purchase_time) rows per device.
val purchaseRows = Seq(
  ("notebook", valueOf("2019-01-29 12:00:00")),
  ("notebook", valueOf("2019-01-01 00:00:00")),
  ("small_phone", valueOf("2019-01-15 23:00:00")),
  ("small_phone", valueOf("2019-01-01 09:00:00"))
)
// Name the columns, then fix a deterministic display order.
val df = purchaseRows
  .toDF("device", "purchase_time")
  .sort($"device", $"purchase_time")
df = [device: string, purchase_time: timestamp]
[device: string, purchase_time: timestamp]
%%dataframe
df
device | purchase_time |
---|---|
notebook | 2019-01-01 00:00:00.0 |
notebook | 2019-01-29 12:00:00.0 |
small_phone | 2019-01-01 09:00:00.0 |
small_phone | 2019-01-15 23:00:00.0 |
%%dataframe
// date_add shifts by whole days and returns a DateType column, so the
// time-of-day is dropped (the plus_2_days output below shows dates only).
df.withColumn("plus_2_days", date_add($"purchase_time",2))
device | purchase_time | plus_2_days |
---|---|---|
notebook | 2019-01-01 00:00:00.0 | 2019-01-03 |
notebook | 2019-01-29 12:00:00.0 | 2019-01-31 |
small_phone | 2019-01-01 09:00:00.0 | 2019-01-03 |
small_phone | 2019-01-15 23:00:00.0 | 2019-01-17 |
import java.sql.Date.valueOf
import org.apache.spark.sql.functions.datediff
// Sample orders as (device, purchase_date, arrival_date) rows; one row
// deliberately has the arrival before the purchase to show signed diffs.
val orderRows = Seq(
  ("notebook", valueOf("2019-01-29"), valueOf("2019-02-10")),
  ("notebook", valueOf("2019-01-01"), valueOf("2019-01-15")),
  ("small_phone", valueOf("2019-01-15"), valueOf("2019-01-05")),
  ("small_phone", valueOf("2019-01-01"), valueOf("2019-01-20"))
)
// Name the columns, then fix a deterministic display order.
val df = orderRows
  .toDF("device", "purchase_date", "arrival_date")
  .sort($"device", $"purchase_date")
df = [device: string, purchase_date: date ... 1 more field]
[device: string, purchase_date: date ... 1 more field]
%%dataframe
df
device | purchase_date | arrival_date |
---|---|---|
notebook | 2019-01-01 | 2019-01-15 |
notebook | 2019-01-29 | 2019-02-10 |
small_phone | 2019-01-01 | 2019-01-20 |
small_phone | 2019-01-15 | 2019-01-05 |
%%dataframe
// datediff counts whole days as (arrival_date - purchase_date); the result
// is signed, e.g. -10 when the arrival date precedes the purchase date.
df.withColumn("days_to_arrive",datediff($"arrival_date", $"purchase_date"))
device | purchase_date | arrival_date | days_to_arrive |
---|---|---|---|
notebook | 2019-01-01 | 2019-01-15 | 14 |
notebook | 2019-01-29 | 2019-02-10 | 12 |
small_phone | 2019-01-01 | 2019-01-20 | 19 |
small_phone | 2019-01-15 | 2019-01-05 | -10 |
import java.sql.Timestamp.valueOf
import org.apache.spark.sql.functions.unix_timestamp
// Pairs of timestamps at known offsets, used to demonstrate elapsed-seconds
// arithmetic: foo = 1 hour, bar = 24 hours, baz = 7 days.
val intervalRows = Seq(
  ("foo", valueOf("2019-01-01 00:00:00"), valueOf("2019-01-01 01:00:00")),
  ("bar", valueOf("2019-01-01 00:00:00"), valueOf("2019-01-02 00:00:00")),
  ("baz", valueOf("2019-01-01 00:00:00"), valueOf("2019-01-07 00:00:00"))
)
// Name the columns, then fix a deterministic display order.
val df = intervalRows
  .toDF("col1", "purchase_time", "arrival_time")
  .sort($"col1", $"purchase_time")
df = [col1: string, purchase_time: timestamp ... 1 more field]
[col1: string, purchase_time: timestamp ... 1 more field]
%%dataframe
df
col1 | purchase_time | arrival_time |
---|---|---|
bar | 2019-01-01 00:00:00.0 | 2019-01-02 00:00:00.0 |
baz | 2019-01-01 00:00:00.0 | 2019-01-07 00:00:00.0 |
foo | 2019-01-01 00:00:00.0 | 2019-01-01 01:00:00.0 |
%%dataframe
// unix_timestamp converts each timestamp to epoch seconds (a long), so the
// subtraction yields the elapsed time in whole seconds; sub-second precision
// is lost at this step.
df.withColumn("diff_in_seconds_2", unix_timestamp($"arrival_time") - unix_timestamp($"purchase_time"))
col1 | purchase_time | arrival_time | diff_in_seconds_2 |
---|---|---|---|
bar | 2019-01-01 00:00:00.0 | 2019-01-02 00:00:00.0 | 86400 |
baz | 2019-01-01 00:00:00.0 | 2019-01-07 00:00:00.0 | 518400 |
foo | 2019-01-01 00:00:00.0 | 2019-01-01 01:00:00.0 | 3600 |
import java.sql.Timestamp.valueOf
import org.apache.spark.sql.functions.to_timestamp
// Timestamp pairs that differ at sub-second resolution (400 ms, 650 ms)
// plus one full-minute gap, to demonstrate millisecond-level differences.
val subSecondRows = Seq(
  ("foo", valueOf("2019-01-01 00:00:00.000"), valueOf("2019-01-01 00:00:00.400")),
  ("bar", valueOf("2019-01-01 00:00:00.000"), valueOf("2019-01-01 00:00:00.650")),
  ("baz", valueOf("2019-01-01 00:00:00.000"), valueOf("2019-01-01 00:01:00.000"))
)
val df = subSecondRows.toDF("col1", "time_before", "time_after")
df = [col1: string, time_before: timestamp ... 1 more field]
[col1: string, time_before: timestamp ... 1 more field]
%%dataframe
df
col1 | time_before | time_after |
---|---|---|
foo | 2019-01-01 00:00:00.0 | 2019-01-01 00:00:00.4 |
bar | 2019-01-01 00:00:00.0 | 2019-01-01 00:00:00.65 |
baz | 2019-01-01 00:00:00.0 | 2019-01-01 00:01:00.0 |
%%dataframe
%%scan
// Casting a timestamp to double yields epoch seconds INCLUDING the fractional
// part, so (after - before) keeps sub-second precision; the second withColumn
// overwrites diff_millis in place, scaling to milliseconds and truncating to
// a long.
(df
.withColumn("diff_millis", ($"time_after".cast("double") - $"time_before".cast("double")))
.withColumn("diff_millis", ($"diff_millis"*1000).cast("long")))
col1 | time_before | time_after | diff_millis |
---|---|---|---|
foo | 2019-01-01 00:00:00.0 | 2019-01-01 00:00:00.4 | 400 |
bar | 2019-01-01 00:00:00.0 | 2019-01-01 00:00:00.65 | 650 |
baz | 2019-01-01 00:00:00.0 | 2019-01-01 00:01:00.0 | 60000 |
import java.sql.Timestamp.valueOf
import org.apache.spark.sql.functions.expr
// Timestamp pairs for the fractional-hours demo: exactly 1 hour,
// 1 hour + 123 ms, and 25.5 hours + 123 ms apart.
val hourGapRows = Seq(
  ("foo", valueOf("2019-10-10 00:00:00.000"), valueOf("2019-10-10 01:00:00.000")),
  ("bar", valueOf("2019-10-10 00:00:00.000"), valueOf("2019-10-10 01:00:00.123")),
  ("baz", valueOf("2019-10-10 00:00:00.000"), valueOf("2019-10-11 01:30:00.123"))
)
val df = hourGapRows.toDF("col1", "purchase_time", "arrival_time")
df = [col1: string, purchase_time: timestamp ... 1 more field]
[col1: string, purchase_time: timestamp ... 1 more field]
%%dataframe
df
col1 | purchase_time | arrival_time |
---|---|---|
foo | 2019-10-10 00:00:00.0 | 2019-10-10 01:00:00.0 |
bar | 2019-10-10 00:00:00.0 | 2019-10-10 01:00:00.123 |
baz | 2019-10-10 00:00:00.0 | 2019-10-11 01:30:00.123 |
// Seconds in one hour (60 min * 60 s); used to convert a seconds
// difference into fractional hours. NOTE(review): Scala convention would
// name this secondsInHour, but renaming would break the later cell.
val seconds_in_hour = 60 * 60
seconds_in_hour = 3600
3600
%%dataframe
%%scan
// Elapsed time in fractional hours: cast both timestamps to double
// (epoch seconds with sub-second fraction preserved), subtract, then
// divide by 3600; select drops the intermediate seconds column.
(df
.withColumn("difference_in_seconds", $"arrival_time".cast("double") - $"purchase_time".cast("double"))
.withColumn("difference_in_hours", $"difference_in_seconds" / seconds_in_hour)
.select("col1", "purchase_time", "arrival_time", "difference_in_hours"))
col1 | purchase_time | arrival_time | difference_in_hours |
---|---|---|---|
foo | 2019-10-10 00:00:00.0 | 2019-10-10 01:00:00.0 | 1.0 |
bar | 2019-10-10 00:00:00.0 | 2019-10-10 01:00:00.123 | 1.0000341666407055 |
baz | 2019-10-10 00:00:00.0 | 2019-10-11 01:30:00.123 | 25.500034166640706 |
import java.sql.Timestamp.valueOf
import org.apache.spark.sql.functions.expr
// Single-timestamp rows used to demonstrate INTERVAL arithmetic.
val timestampRows = Seq(
  ("foo", valueOf("2019-10-10 00:45:00")),
  ("bar", valueOf("2019-10-10 12:34:56")),
  ("baz", valueOf("2019-10-10 23:59:00"))
)
val df = timestampRows.toDF("col1", "timestamp_col")
df = [col1: string, timestamp_col: timestamp]
[col1: string, timestamp_col: timestamp]
%%dataframe
df
col1 | timestamp_col |
---|---|
foo | 2019-10-10 00:45:00.0 |
bar | 2019-10-10 12:34:56.0 |
baz | 2019-10-10 23:59:00.0 |
%%dataframe
%%scan
// Subtract a SQL calendar interval from a timestamp column; here each
// timestamp is shifted back 24 hours. NOTE(review): for time-zone-aware
// data, "minus 24 hours" can differ from "same wall-clock time yesterday"
// across a DST change — confirm if that matters for this dataset.
df
.withColumn("timestamp_minus_24_hours", $"timestamp_col" - expr("INTERVAL 24 HOURS"))
col1 | timestamp_col | timestamp_minus_24_hours |
---|---|---|
foo | 2019-10-10 00:45:00.0 | 2019-10-09 00:45:00.0 |
bar | 2019-10-10 12:34:56.0 | 2019-10-09 12:34:56.0 |
baz | 2019-10-10 23:59:00.0 | 2019-10-09 23:59:00.0 |