-
Notifications
You must be signed in to change notification settings - Fork 909
/
Copy pathpyspark-datediff.py
40 lines (31 loc) · 1.25 KB
/
pyspark-datediff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# -*- coding: utf-8 -*-
"""
PySpark examples of computing date differences in days, months and years.

Author: SparkByExamples.com
"""
from pyspark.sql import SparkSession
# Build (or reuse) the SparkSession shared by every example in this script.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Sample rows: a string id and a date string written as yyyy-MM-dd.
data = [
    ("1", "2019-07-01"),
    ("2", "2019-06-24"),
    ("3", "2019-08-24"),
]
df = spark.createDataFrame(data=data, schema=["id", "date"])
from pyspark.sql.functions import *
# Show each stored date beside today's date and the day count between them.
today = current_date()
date_col = col("date")
df.select(
    date_col,
    today.alias("current_date"),
    datediff(today, date_col).alias("datediff"),
).show()
# Derive day, month and year differences between today and each stored date.
# The month difference is built once and reused for the rounded and yearly
# variants; years are simply months divided by 12.
today = current_date()
date_col = col("date")
months_elapsed = months_between(today, date_col)
years_elapsed = months_elapsed / lit(12)

(
    df.withColumn("datesDiff", datediff(today, date_col))
    .withColumn("montsDiff", months_elapsed)
    .withColumn("montsDiff_round", round(months_elapsed, 2))
    .withColumn("yearsDiff", years_elapsed)
    .withColumn("yearsDiff_round", round(years_elapsed, 2))
    .show()
)
# These sample dates are written as MM-dd-yyyy, so they are converted to a
# date column with to_date() and an explicit pattern.
data2 = [("1", "07-01-2019"), ("2", "06-24-2019"), ("3", "08-24-2019")]
df2 = spark.createDataFrame(data=data2, schema=["id", "date"])

# show() is the action that actually runs and prints the projection; without
# it the DataFrame returned by select() is simply discarded.
df2.select(
    to_date(col("date"), "MM-dd-yyyy").alias("date"),
    current_date().alias("endDate"),
).show()
# SQL
# Same years-difference calculation expressed in Spark SQL.
# months_between(date1, date2) is positive when date1 is the later date, so
# current_date() is passed first, matching the argument order used by the
# DataFrame examples earlier in this script (otherwise years_diff is negative).
spark.sql("select round(months_between(current_date(),'2019-07-01')/12,2) as years_diff").show()