File tree Expand file tree Collapse file tree 1 file changed +17
-0
lines changed Expand file tree Collapse file tree 1 file changed +17
-0
lines changed Original file line number Diff line number Diff line change 19
19
spark .sql ("CREATE DATABASE IF NOT EXISTS AIRLINE_DB" )
20
20
spark .catalog .setCurrentDatabase ("AIRLINE_DB" )
21
21
22
+ # flightTimeParquetDF.write \
23
+ # .mode("overwrite") \
24
+ # .saveAsTable("flight_data_tbl")
25
+
26
+ # Partition by ORIGIN, OP_CARRIER
27
+ # flightTimeParquetDF.write \
28
+ # .mode("overwrite") \
29
+ # .partitionBy("ORIGIN", "OP_CARRIER") \
30
+ # .saveAsTable("flight_data_tbl")
31
+
32
+ # Above implementation will cause too many partition
33
+ # Lets use bucket instead, choose 5 buckets only, it will be computed based on hash and modulus
34
+ # Since the unique combination of ORIGIN & OP_CARRIER will fall into same bucket
35
+ # We will sort it as well
22
36
flightTimeParquetDF .write \
37
+ .format ("csv" ) \
23
38
.mode ("overwrite" ) \
39
+ .bucketBy (5 , "ORIGIN" , "OP_CARRIER" ) \
40
+ .sortBy ("OP_CARRIER" , "ORIGIN" ) \
24
41
.saveAsTable ("flight_data_tbl" )
25
42
26
43
logger .info (spark .catalog .listTables ("AIRLINE_DB" ))
You can’t perform that action at this time.
0 commit comments