import java.sql.{Date,Timestamp}
spark.version
2.4.3
import org.apache.spark.sql.functions.when
// Sample purchase records: (device, purchase_date as an ISO-8601 string).
val purchaseRows = Seq(
  ("notebook", "2019-01-19"),
  ("notebook", "2019-01-10"),
  ("small_phone", "2019-01-15"),
  ("small_phone", "2019-01-30"),
  ("tv_set", "2019-01-22"),
  ("large_phone", "2019-01-30"),
  ("medium_phone", "2019-01-22")
)
// Build the frame, put purchase_date first, and order rows for stable display.
val df = purchaseRows
  .toDF("device", "purchase_date")
  .select("purchase_date", "device")
  .sort("device", "purchase_date")
df = [purchase_date: string, device: string]
lastException: Throwable = null
[purchase_date: string, device: string]
%%dataframe
df
purchase_date | device |
---|---|
2019-01-30 | large_phone |
2019-01-22 | medium_phone |
2019-01-10 | notebook |
2019-01-19 | notebook |
2019-01-15 | small_phone |
2019-01-30 | small_phone |
2019-01-22 | tv_set |
%%dataframe
df.withColumn("is_phone",when($"device".endsWith("phone"),true).otherwise(false))
purchase_date | device | is_phone |
---|---|---|
2019-01-30 | large_phone | true |
2019-01-22 | medium_phone | true |
2019-01-10 | notebook | false |
2019-01-19 | notebook | false |
2019-01-15 | small_phone | true |
2019-01-30 | small_phone | true |
2019-01-22 | tv_set | false |
import org.apache.spark.sql.functions.{element_at,split,when}
// (device, purchase_date) sample data; dates are plain "yyyy-MM-dd" strings.
val rows = Seq(
  ("notebook", "2019-01-19"),
  ("notebook", "2019-01-10"),
  ("small_phone", "2019-01-15"),
  ("small_phone", "2019-01-30"),
  ("tv_set", "2019-01-22"),
  ("large_phone", "2019-01-30"),
  ("medium_phone", "2019-01-22")
)
// Reorder columns (purchase_date first) and sort for deterministic output.
val df =
  rows.toDF("device", "purchase_date").select("purchase_date", "device").sort("device", "purchase_date")
df = [purchase_date: string, device: string]
[purchase_date: string, device: string]
%%dataframe
df
purchase_date | device |
---|---|
2019-01-30 | large_phone |
2019-01-22 | medium_phone |
2019-01-10 | notebook |
2019-01-19 | notebook |
2019-01-15 | small_phone |
2019-01-30 | small_phone |
2019-01-22 | tv_set |
%%dataframe
df.withColumn("phone_size", when($"device".endsWith("phone"), element_at(split($"device","_"),1)))
purchase_date | device | phone_size |
---|---|---|
2019-01-30 | large_phone | large |
2019-01-22 | medium_phone | medium |
2019-01-10 | notebook | null |
2019-01-19 | notebook | null |
2019-01-15 | small_phone | small |
2019-01-30 | small_phone | small |
2019-01-22 | tv_set | null |
import org.apache.spark.sql.functions.when
// Same sample purchases as above: device name paired with its purchase date.
val deviceDates: Seq[(String, String)] = Seq(
  "notebook" -> "2019-01-19",
  "notebook" -> "2019-01-10",
  "small_phone" -> "2019-01-15",
  "small_phone" -> "2019-01-30",
  "tv_set" -> "2019-01-22",
  "large_phone" -> "2019-01-30",
  "medium_phone" -> "2019-01-22"
)
// Swap column order and sort so repeated runs render identically.
val df = deviceDates
  .toDF("device", "purchase_date")
  .select("purchase_date", "device")
  .sort("device", "purchase_date")
df = [purchase_date: string, device: string]
[purchase_date: string, device: string]
%%dataframe
df
purchase_date | device |
---|---|
2019-01-30 | large_phone |
2019-01-22 | medium_phone |
2019-01-10 | notebook |
2019-01-19 | notebook |
2019-01-15 | small_phone |
2019-01-30 | small_phone |
2019-01-22 | tv_set |
%%dataframe
// shipping_rate: flat fee per device category. The branches are mutually
// exclusive, so their order does not affect the result.
df.withColumn(
  "shipping_rate",
  when($"device".isin("small_phone", "medium_phone"), 10)
    .when($"device".isin("large_phone", "notebook"), 20)
    .when($"device" === "tv_set", 30)
    .otherwise(25) // fallback rate in case we sell other devices
)
purchase_date | device | shipping_rate |
---|---|---|
2019-01-30 | large_phone | 20 |
2019-01-22 | medium_phone | 10 |
2019-01-10 | notebook | 20 |
2019-01-19 | notebook | 20 |
2019-01-15 | small_phone | 10 |
2019-01-30 | small_phone | 10 |
2019-01-22 | tv_set | 30 |