made some changes to SparkSQLTableDemo.py

cmy113 · cmy113 · commit 3d018ae893b8 · 2021-06-06T13:09:28.000+08:00
1. Use partition
2. Use bucket with sortby
diff --git a/06-SparkSQLTableDemo/SparkSQLTableDemo.py b/06-SparkSQLTableDemo/SparkSQLTableDemo.py
@@ -19,8 +19,25 @@
     spark.sql("CREATE DATABASE IF NOT EXISTS AIRLINE_DB")
     spark.catalog.setCurrentDatabase("AIRLINE_DB")
 
+    # flightTimeParquetDF.write \
+    #     .mode("overwrite") \
+    #     .saveAsTable("flight_data_tbl")
+
+    # Partition by ORIGIN, OP_CARRIER
+    # flightTimeParquetDF.write \
+    #     .mode("overwrite") \
+    #     .partitionBy("ORIGIN", "OP_CARRIER") \
+    #     .saveAsTable("flight_data_tbl")
+
+    # Partitioning by ORIGIN and OP_CARRIER as above would create too many partitions.
+    # Let's bucket instead, using only 5 buckets; each row's bucket is its hash modulo 5.
+    # Rows with the same ORIGIN/OP_CARRIER combination always fall into the same bucket,
+    # so we also sort the data within each bucket.
     flightTimeParquetDF.write \
+        .format("csv") \
         .mode("overwrite") \
+        .bucketBy(5, "ORIGIN", "OP_CARRIER") \
+        .sortBy("OP_CARRIER", "ORIGIN") \
         .saveAsTable("flight_data_tbl")
 
     logger.info(spark.catalog.listTables("AIRLINE_DB"))