
Commit 7898525

Add BigQueryInsertJobOperator (#8868)
* Add BigQueryInsertJobOperator
* fixup! Add BigQueryInsertJobOperator
* fixup! fixup! Add BigQueryInsertJobOperator
* fixup! fixup! fixup! Add BigQueryInsertJobOperator
1 parent 7c0e6ed commit 7898525

File tree: 5 files changed, +266 −53 lines changed

airflow/providers/google/cloud/example_dags/example_bigquery_queries.py

Lines changed: 20 additions & 13 deletions

@@ -27,7 +27,7 @@
 from airflow.providers.google.cloud.operators.bigquery import (
     BigQueryCheckOperator, BigQueryCreateEmptyDatasetOperator, BigQueryCreateEmptyTableOperator,
     BigQueryDeleteDatasetOperator, BigQueryExecuteQueryOperator, BigQueryGetDataOperator,
-    BigQueryIntervalCheckOperator, BigQueryValueCheckOperator,
+    BigQueryInsertJobOperator, BigQueryIntervalCheckOperator, BigQueryValueCheckOperator,
 )
 from airflow.utils.dates import days_ago

@@ -40,10 +40,10 @@
 INSERT_DATE = datetime.now().strftime("%Y-%m-%d")

 # [START howto_operator_bigquery_query]
-INSERT_ROWS_QUERY = f"""
-INSERT INTO {DATASET_NAME}.{TABLE_1} VALUES (42, "monthy python", "{INSERT_DATE}");
-INSERT INTO {DATASET_NAME}.{TABLE_1} VALUES (42, "fishy fish", "{INSERT_DATE}");
-"""
+INSERT_ROWS_QUERY = \
+    f"INSERT {DATASET_NAME}.{TABLE_1} VALUES " \
+    f"(42, 'monthy python', '{INSERT_DATE}'), " \
+    f"(42, 'fishy fish', '{INSERT_DATE}');"
 # [END howto_operator_bigquery_query]

 SCHEMA = [
@@ -84,13 +84,22 @@
         task_id="delete_dataset", dataset_id=DATASET_NAME, delete_contents=True
     )

-    # [START howto_operator_bigquery_execute_query]
+    # [START howto_operator_bigquery_insert_job]
+    insert_query_job = BigQueryInsertJobOperator(
+        task_id="insert_query_job",
+        configuration={
+            "query": {
+                "query": INSERT_ROWS_QUERY,
+                "useLegacySql": False,
+            }
+        },
+    )
+    # [END howto_operator_bigquery_insert_job]
+
     execute_insert_query = BigQueryExecuteQueryOperator(
         task_id="execute_insert_query", sql=INSERT_ROWS_QUERY, use_legacy_sql=False
     )
-    # [END howto_operator_bigquery_execute_query]

-    # [START howto_operator_bigquery_execute_query_list]
     bigquery_execute_multi_query = BigQueryExecuteQueryOperator(
         task_id="execute_multi_query",
         sql=[
@@ -99,16 +108,13 @@
         ],
         use_legacy_sql=False,
     )
-    # [END howto_operator_bigquery_execute_query_list]

-    # [START howto_operator_bigquery_execute_query_save]
     execute_query_save = BigQueryExecuteQueryOperator(
         task_id="execute_query_save",
         sql=f"SELECT * FROM {DATASET_NAME}.{TABLE_1}",
         use_legacy_sql=False,
         destination_dataset_table=f"{DATASET_NAME}.{TABLE_2}",
     )
-    # [END howto_operator_bigquery_execute_query_save]

     # [START howto_operator_bigquery_get_data]
     get_data = BigQueryGetDataOperator(
@@ -137,7 +143,7 @@
     check_value = BigQueryValueCheckOperator(
         task_id="check_value",
         sql=f"SELECT COUNT(*) FROM {DATASET_NAME}.{TABLE_1}",
-        pass_value=2,
+        pass_value=4,
         use_legacy_sql=False,
     )
     # [END howto_operator_bigquery_value_check]
@@ -152,8 +158,9 @@
     )
     # [END howto_operator_bigquery_interval_check]

-    [create_table_1, create_table_2] >> execute_insert_query
+    [create_table_1, create_table_2] >> insert_query_job

+    insert_query_job >> execute_insert_query
     execute_insert_query >> get_data >> get_data_result >> delete_dataset
     execute_insert_query >> execute_query_save >> bigquery_execute_multi_query >> delete_dataset
     execute_insert_query >> [check_count, check_value, check_interval] >> delete_dataset
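
For context, here is a minimal, self-contained sketch of how the new operator slots into a DAG, distilled from the example above. The dag id, dataset, and table names are placeholders, not values from the commit:

from datetime import datetime

from airflow import models
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator
from airflow.utils.dates import days_ago

DATASET_NAME = "test_dataset"  # placeholder
TABLE_1 = "test_table"         # placeholder
INSERT_DATE = datetime.now().strftime("%Y-%m-%d")

with models.DAG(
    "example_bigquery_insert_job_sketch",  # placeholder dag id
    schedule_interval=None,
    start_date=days_ago(1),
) as dag:
    # `configuration` maps directly to BigQuery's job resource, so any job
    # type (query, load, copy, extract) can be submitted via one operator.
    insert_query_job = BigQueryInsertJobOperator(
        task_id="insert_query_job",
        configuration={
            "query": {
                "query": f"INSERT {DATASET_NAME}.{TABLE_1} VALUES "
                         f"(42, 'monty python', '{INSERT_DATE}');",
                "useLegacySql": False,
            }
        },
    )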

airflow/providers/google/cloud/hooks/bigquery.py

Lines changed: 61 additions & 15 deletions

@@ -1397,13 +1397,42 @@ def cancel_job(
                 self.log.info('Waiting for canceled job with id %s to finish.', job_id)
                 time.sleep(5)

+    @GoogleBaseHook.fallback_to_default_project_id
+    def get_job(
+        self,
+        job_id: Optional[str] = None,
+        project_id: Optional[str] = None,
+        location: Optional[str] = None,
+    ) -> Union[CopyJob, QueryJob, LoadJob, ExtractJob]:
+        """
+        Retrieves a BigQuery job. For more information see:
+        https://cloud.google.com/bigquery/docs/reference/v2/jobs
+
+        :param job_id: The ID of the job. The ID must contain only letters (a-z, A-Z),
+            numbers (0-9), underscores (_), or dashes (-). The maximum length is 1,024
+            characters. If not provided then a uuid will be generated.
+        :type job_id: str
+        :param project_id: Google Cloud Project where the job is running
+        :type project_id: str
+        :param location: location the job is running
+        :type location: str
+        """
+        client = self.get_client(project_id=project_id, location=location)
+        job = client.get_job(
+            job_id=job_id,
+            project=project_id,
+            location=location
+        )
+        return job
+
     @GoogleBaseHook.fallback_to_default_project_id
     def insert_job(
         self,
         configuration: Dict,
+        job_id: Optional[str] = None,
         project_id: Optional[str] = None,
         location: Optional[str] = None,
-    ) -> str:
+    ) -> Union[CopyJob, QueryJob, LoadJob, ExtractJob]:
         """
         Executes a BigQuery job. Waits for the job to complete and returns job id.
         See here:
@@ -1414,17 +1443,23 @@ def insert_job(
             BigQuery's configuration field in the job object. See
             https://cloud.google.com/bigquery/docs/reference/v2/jobs for
             details.
+        :type configuration: Dict[str, Any]
+        :param job_id: The ID of the job. The ID must contain only letters (a-z, A-Z),
+            numbers (0-9), underscores (_), or dashes (-). The maximum length is 1,024
+            characters. If not provided then a uuid will be generated.
+        :type job_id: str
         :param project_id: Google Cloud Project where the job is running
         :type project_id: str
         :param location: location the job is running
         :type location: str
         """
+        job_id = job_id or str(uuid.uuid4())
         location = location or self.location
         client = self.get_client(project_id=project_id, location=location)
         job_data = {
             "configuration": configuration,
             "jobReference": {
-                "jobId": str(uuid.uuid4()),
+                "jobId": job_id,
                 "projectId": project_id,
                 "location": location
             }
@@ -1446,9 +1481,7 @@ def insert_job(
         if not job:
             raise AirflowException(f"Unknown job type. Supported types: {supported_jobs.keys()}")
         job = job.from_api_repr(job_data, client)
-        # Start the job and wait for it to complete and get the result.
-        job.result()
-        return job.job_id
+        return job

     def run_with_configuration(self, configuration: Dict) -> str:
         """
@@ -1467,8 +1500,11 @@ def run_with_configuration(self, configuration: Dict) -> str:
             "This method is deprecated. Please use `BigQueryHook.insert_job`",
             DeprecationWarning
         )
-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        self.running_job_id = job.job_id
+        return job.job_id

     def run_load(self,  # pylint: disable=too-many-locals,too-many-arguments,invalid-name
                  destination_project_dataset_table: str,
@@ -1709,8 +1745,11 @@ def run_load(self,  # pylint: disable=too-many-locals,too-many-arguments,invalid-name
         if allow_jagged_rows:
             configuration['load']['allowJaggedRows'] = allow_jagged_rows

-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        self.running_job_id = job.job_id
+        return job.job_id

     def run_copy(self,  # pylint: disable=invalid-name
                  source_project_dataset_tables: Union[List, str],
@@ -1803,8 +1842,11 @@ def run_copy(self,  # pylint: disable=invalid-name
             "destinationEncryptionConfiguration"
         ] = encryption_configuration

-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        self.running_job_id = job.job_id
+        return job.job_id

     def run_extract(
         self,
@@ -1878,8 +1920,9 @@ def run_extract(
         configuration['extract']['fieldDelimiter'] = field_delimiter
         configuration['extract']['printHeader'] = print_header

-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        self.running_job_id = job.job_id
+        return job.job_id

     # pylint: disable=too-many-locals,too-many-arguments, too-many-branches
     def run_query(self,
@@ -2123,8 +2166,11 @@ def run_query(self,
             "destinationEncryptionConfiguration"
         ] = encryption_configuration

-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        self.running_job_id = job.job_id
+        return job.job_id


 class BigQueryPandasConnector(GbqConnector):
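
The behavioural shift in the hook is easiest to see at a call site: insert_job no longer blocks, it returns a google-cloud-bigquery job object, and most of the run_* wrappers now call job.result() themselves. A short sketch of the new contract, assuming default credentials and a hypothetical project id:

from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook

hook = BigQueryHook(gcp_conn_id="google_cloud_default")

# insert_job() now returns an unstarted job object instead of a job id string.
job = hook.insert_job(
    configuration={"query": {"query": "SELECT 1", "useLegacySql": False}},
    job_id="example_job_id",  # optional; a uuid is generated when omitted
    project_id="my-project",  # hypothetical project
)

# Starting the job and waiting for completion is now the caller's decision.
job.result()
print(job.job_id)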

airflow/providers/google/cloud/operators/bigquery.py

Lines changed: 82 additions & 0 deletions

@@ -23,10 +23,12 @@
 import enum
 import json
 import warnings
+from time import sleep
 from typing import Any, Dict, Iterable, List, Optional, SupportsAbs, Union

 import attr
 from google.api_core.exceptions import Conflict
+from google.api_core.retry import exponential_sleep_generator
 from google.cloud.bigquery import TableReference

 from airflow.exceptions import AirflowException
@@ -546,6 +548,11 @@ def __init__(self,
                           "the gcp_conn_id parameter.", DeprecationWarning, stacklevel=3)
             gcp_conn_id = bigquery_conn_id

+        warnings.warn(
+            "This operator is deprecated. Please use `BigQueryInsertJobOperator`.",
+            DeprecationWarning, stacklevel=3,
+        )
+
         self.sql = sql
         self.destination_dataset_table = destination_dataset_table
         self.write_disposition = write_disposition
@@ -1570,3 +1577,78 @@ def execute(self, context):
             table_resource=self.table_resource,
             project_id=self.project_id,
         )
+
+
+class BigQueryInsertJobOperator(BaseOperator):
+    """
+    Executes a BigQuery job. Waits for the job to complete and returns job id.
+    See here:
+
+    https://cloud.google.com/bigquery/docs/reference/v2/jobs
+
+    :param configuration: The configuration parameter maps directly to BigQuery's
+        configuration field in the job object. For more details see
+        https://cloud.google.com/bigquery/docs/reference/v2/jobs
+    :type configuration: Dict[str, Any]
+    :param job_id: The ID of the job. The ID must contain only letters (a-z, A-Z),
+        numbers (0-9), underscores (_), or dashes (-). The maximum length is 1,024
+        characters. If not provided then a uuid will be generated.
+    :type job_id: str
+    :param project_id: Google Cloud Project where the job is running
+    :type project_id: str
+    :param location: location the job is running
+    :type location: str
+    :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud Platform.
+    :type gcp_conn_id: str
+    """
+
+    template_fields = ("configuration", "job_id")
+    ui_color = BigQueryUIColors.QUERY.value
+
+    def __init__(
+        self,
+        configuration: Dict[str, Any],
+        project_id: Optional[str] = None,
+        location: Optional[str] = None,
+        job_id: Optional[str] = None,
+        gcp_conn_id: str = 'google_cloud_default',
+        delegate_to: Optional[str] = None,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+        self.configuration = configuration
+        self.location = location
+        self.job_id = job_id
+        self.project_id = project_id
+        self.gcp_conn_id = gcp_conn_id
+        self.delegate_to = delegate_to
+
+    def execute(self, context: Any):
+        hook = BigQueryHook(
+            gcp_conn_id=self.gcp_conn_id,
+            delegate_to=self.delegate_to,
+        )
+
+        try:
+            job = hook.insert_job(
+                configuration=self.configuration,
+                project_id=self.project_id,
+                location=self.location,
+                job_id=self.job_id,
+            )
+            # Start the job and wait for it to complete and get the result.
+            job.result()
+        except Conflict:
+            job = hook.get_job(
+                project_id=self.project_id,
+                location=self.location,
+                job_id=self.job_id,
+            )
+            # Get the existing job and wait for it to be ready.
+            for time_to_wait in exponential_sleep_generator(initial=10, maximum=120):
+                sleep(time_to_wait)
+                job.reload()
+                if job.done():
+                    break
+        return job.job_id
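
The try/except Conflict block above is the operator's idempotency mechanism: resubmitting a job_id that already exists raises Conflict, and the operator then polls the existing job (get_job plus exponential_sleep_generator) instead of failing. A hedged sketch of how a DAG author might lean on that, with an illustrative fixed job id:

from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator

# Inside a DAG context. With a fixed job_id, a retried task reattaches to the
# job already running (or finished) in BigQuery rather than submitting a duplicate.
select_job = BigQueryInsertJobOperator(
    task_id="select_job",
    job_id="nightly_select_job",  # illustrative fixed id, not from the commit
    configuration={
        "query": {"query": "SELECT 1", "useLegacySql": False}
    },
)

Note that with a truly fixed id a second DAG run would also reattach to the first run's job; templating the id (see the docs sketch below) keeps idempotency scoped to a single run.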

docs/howto/operator/gcp/bigquery.rst

Lines changed: 14 additions & 23 deletions

@@ -241,10 +241,10 @@
     :start-after: [START howto_operator_bigquery_delete_view]
     :end-before: [END howto_operator_bigquery_delete_view]

-.. _howto/operator:BigQueryExecuteQueryOperator:
+.. _howto/operator:BigQueryInsertJobOperator:

-Execute queries
-^^^^^^^^^^^^^^^
+Execute BigQuery jobs
+^^^^^^^^^^^^^^^^^^^^^

 Let's say you would like to execute the following query.

@@ -255,32 +255,23 @@ Let's say you would like to execute the following query.
     :end-before: [END howto_operator_bigquery_query]

 To execute the SQL query in a specific BigQuery database you can use
-:class:`~airflow.providers.google.cloud.operators.bigquery.BigQueryExecuteQueryOperator`.
+:class:`~airflow.providers.google.cloud.operators.bigquery.BigQueryInsertJobOperator` with
+proper query job configuration.

 .. exampleinclude:: ../../../../airflow/providers/google/cloud/example_dags/example_bigquery_queries.py
     :language: python
     :dedent: 4
-    :start-after: [START howto_operator_bigquery_execute_query]
-    :end-before: [END howto_operator_bigquery_execute_query]
-
-``sql`` argument can receive a str representing a sql statement, a list of str
-(sql statements), or reference to a template file. Template reference are recognized
-by str ending in '.sql'.
+    :start-after: [START howto_operator_bigquery_insert_job]
+    :end-before: [END howto_operator_bigquery_insert_job]

-.. exampleinclude:: ../../../../airflow/providers/google/cloud/example_dags/example_bigquery_queries.py
-    :language: python
-    :dedent: 4
-    :start-after: [START howto_operator_bigquery_execute_query_list]
-    :end-before: [END howto_operator_bigquery_execute_query_list]
+For more information on the types of BigQuery jobs please check the
+`documentation <https://cloud.google.com/bigquery/docs/reference/v2/jobs>`__.

-You can store the results of the query in a table by specifying
-``destination_dataset_table``.
-
-.. exampleinclude:: ../../../../airflow/providers/google/cloud/example_dags/example_bigquery_queries.py
-    :language: python
-    :dedent: 4
-    :start-after: [START howto_operator_bigquery_execute_query_save]
-    :end-before: [END howto_operator_bigquery_execute_query_save]
+Additionally you can use the ``job_id`` parameter of
+:class:`~airflow.providers.google.cloud.operators.bigquery.BigQueryInsertJobOperator` to improve
+idempotency. If this parameter is not passed then a uuid will be used as ``job_id``. If provided,
+the operator will try to submit a new job with this ``job_id``. If there is already a job with
+such a ``job_id`` then it will reattach to the existing job.

 Validate data
 ^^^^^^^^^^^^^
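Since ``configuration`` and ``job_id`` are both template fields, one way to get the per-run idempotency described in the docs hunk above is to embed the logical date in the id. A sketch under that assumption; the id format is illustrative, not part of the commit:

from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator

insert_query_job = BigQueryInsertJobOperator(
    task_id="insert_query_job",
    # Renders to e.g. "insert_query_job_20200530": stable across retries of the
    # same run, unique across runs (letters, digits, dashes, underscores only).
    job_id="insert_query_job_{{ ds_nodash }}",
    configuration={
        "query": {
            "query": "SELECT 1",  # placeholder query
            "useLegacySql": False,
        }
    },
)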