|
19 | 19 | ("Rosy", "7", "8", "63"), # 1963
|
20 | 20 | ("Abdul", "23", "5", "81")] # 1981
|
21 | 21 |
|
| 22 | + # Build a new DataFrame from the in-memory list with createDataFrame |
22 | 23 | raw_df = spark.createDataFrame(data_list).toDF("name", "day", "month", "year").repartition(3)
|
23 | 24 | raw_df.printSchema()
|
24 | 25 |
|
| 26 | + # Use cast() to convert the string day/month/year columns to integers |
25 | 27 | final_df = raw_df.withColumn("id", monotonically_increasing_id()) \
|
26 | 28 | .withColumn("day", col("day").cast(IntegerType())) \
|
27 | 29 | .withColumn("month", col("month").cast(IntegerType())) \
|
28 | 30 | .withColumn("year", col("year").cast(IntegerType())) \
|
29 |
| - .withColumn("year", when(col("year") < 20, col("year") + 2000) |
| 31 | + .withColumn("year", \ |
| 32 | + when(col("year") < 20, col("year") + 2000) |
30 | 33 | .when(col("year") < 100, col("year") + 1900)
|
31 | 34 | .otherwise(col("year"))) \
|
32 | 35 | .withColumn("dob", expr("to_date(concat(day,'/',month,'/',year), 'd/M/y')")) \
|
33 | 36 | .drop("day", "month", "year") \
|
34 | 37 | .dropDuplicates(["name", "dob"]) \
|
35 |
| - # .sort(expr("dob desc")) This doesn't seem to be working |
36 | 38 | .sort(col("dob").desc())
|
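| 39 | + # NOTE: .sort(expr("dob desc")) does not seem to work, so col("dob").desc() is used instead (presumably expr() parses "dob desc" as a column alias rather than a sort order - verify) |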
37 | 40 |
|
38 |
| - final_df.show() |
| 41 | +final_df.show() |
0 commit comments