
Commit 7898525

Add BigQueryInsertJobOperator (#8868)
* Add BigQueryInsertJobOperator
* fixup! Add BigQueryInsertJobOperator
* fixup! fixup! Add BigQueryInsertJobOperator
* fixup! fixup! fixup! Add BigQueryInsertJobOperator
1 parent 7c0e6ed commit 7898525

File tree: 5 files changed, +266 −53 lines changed

airflow/providers/google/cloud/example_dags/example_bigquery_queries.py

Lines changed: 20 additions & 13 deletions

@@ -27,7 +27,7 @@
 from airflow.providers.google.cloud.operators.bigquery import (
     BigQueryCheckOperator, BigQueryCreateEmptyDatasetOperator, BigQueryCreateEmptyTableOperator,
     BigQueryDeleteDatasetOperator, BigQueryExecuteQueryOperator, BigQueryGetDataOperator,
-    BigQueryIntervalCheckOperator, BigQueryValueCheckOperator,
+    BigQueryInsertJobOperator, BigQueryIntervalCheckOperator, BigQueryValueCheckOperator,
 )
 from airflow.utils.dates import days_ago

@@ -40,10 +40,10 @@
 INSERT_DATE = datetime.now().strftime("%Y-%m-%d")

 # [START howto_operator_bigquery_query]
-INSERT_ROWS_QUERY = f"""
-INSERT INTO {DATASET_NAME}.{TABLE_1} VALUES (42, "monthy python", "{INSERT_DATE}");
-INSERT INTO {DATASET_NAME}.{TABLE_1} VALUES (42, "fishy fish", "{INSERT_DATE}");
-"""
+INSERT_ROWS_QUERY = \
+    f"INSERT {DATASET_NAME}.{TABLE_1} VALUES " \
+    f"(42, 'monthy python', '{INSERT_DATE}'), " \
+    f"(42, 'fishy fish', '{INSERT_DATE}');"
 # [END howto_operator_bigquery_query]

 SCHEMA = [
@@ -84,13 +84,22 @@
         task_id="delete_dataset", dataset_id=DATASET_NAME, delete_contents=True
     )

-    # [START howto_operator_bigquery_execute_query]
+    # [START howto_operator_bigquery_insert_job]
+    insert_query_job = BigQueryInsertJobOperator(
+        task_id="insert_query_job",
+        configuration={
+            "query": {
+                "query": INSERT_ROWS_QUERY,
+                "useLegacySql": False,
+            }
+        },
+    )
+    # [END howto_operator_bigquery_insert_job]
+
     execute_insert_query = BigQueryExecuteQueryOperator(
         task_id="execute_insert_query", sql=INSERT_ROWS_QUERY, use_legacy_sql=False
     )
-    # [END howto_operator_bigquery_execute_query]

-    # [START howto_operator_bigquery_execute_query_list]
     bigquery_execute_multi_query = BigQueryExecuteQueryOperator(
         task_id="execute_multi_query",
         sql=[
@@ -99,16 +108,13 @@
         ],
         use_legacy_sql=False,
     )
-    # [END howto_operator_bigquery_execute_query_list]

-    # [START howto_operator_bigquery_execute_query_save]
     execute_query_save = BigQueryExecuteQueryOperator(
         task_id="execute_query_save",
         sql=f"SELECT * FROM {DATASET_NAME}.{TABLE_1}",
         use_legacy_sql=False,
         destination_dataset_table=f"{DATASET_NAME}.{TABLE_2}",
     )
-    # [END howto_operator_bigquery_execute_query_save]

     # [START howto_operator_bigquery_get_data]
     get_data = BigQueryGetDataOperator(
@@ -137,7 +143,7 @@
     check_value = BigQueryValueCheckOperator(
         task_id="check_value",
         sql=f"SELECT COUNT(*) FROM {DATASET_NAME}.{TABLE_1}",
-        pass_value=2,
+        pass_value=4,
         use_legacy_sql=False,
     )
     # [END howto_operator_bigquery_value_check]
@@ -152,8 +158,9 @@
     )
     # [END howto_operator_bigquery_interval_check]

-    [create_table_1, create_table_2] >> execute_insert_query
+    [create_table_1, create_table_2] >> insert_query_job

+    insert_query_job >> execute_insert_query
     execute_insert_query >> get_data >> get_data_result >> delete_dataset
     execute_insert_query >> execute_query_save >> bigquery_execute_multi_query >> delete_dataset
     execute_insert_query >> [check_count, check_value, check_interval] >> delete_dataset
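
For context, here is a minimal, self-contained sketch of how the new operator slots into a DAG, distilled from the example above. The dag id, dataset, and table names are placeholders, not values from the commit:

from datetime import datetime

from airflow import models
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator
from airflow.utils.dates import days_ago

DATASET_NAME = "test_dataset"  # placeholder
TABLE_1 = "test_table"         # placeholder
INSERT_DATE = datetime.now().strftime("%Y-%m-%d")

with models.DAG(
    "example_bigquery_insert_job_sketch",  # placeholder dag id
    schedule_interval=None,
    start_date=days_ago(1),
) as dag:
    # `configuration` maps directly to BigQuery's job resource, so any job
    # type (query, load, copy, extract) can be submitted via one operator.
    insert_query_job = BigQueryInsertJobOperator(
        task_id="insert_query_job",
        configuration={
            "query": {
                "query": f"INSERT {DATASET_NAME}.{TABLE_1} VALUES "
                         f"(42, 'monty python', '{INSERT_DATE}');",
                "useLegacySql": False,
            }
        },
    )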

airflow/providers/google/cloud/hooks/bigquery.py

Lines changed: 61 additions & 15 deletions

@@ -1397,13 +1397,42 @@ def cancel_job(
                 self.log.info('Waiting for canceled job with id %s to finish.', job_id)
                 time.sleep(5)

+    @GoogleBaseHook.fallback_to_default_project_id
+    def get_job(
+        self,
+        job_id: Optional[str] = None,
+        project_id: Optional[str] = None,
+        location: Optional[str] = None,
+    ) -> Union[CopyJob, QueryJob, LoadJob, ExtractJob]:
+        """
+        Retrieves a BigQuery job. For more information see:
+        https://cloud.google.com/bigquery/docs/reference/v2/jobs
+
+        :param job_id: The ID of the job. The ID must contain only letters (a-z, A-Z),
+            numbers (0-9), underscores (_), or dashes (-). The maximum length is 1,024
+            characters. If not provided then a uuid will be generated.
+        :type job_id: str
+        :param project_id: Google Cloud Project where the job is running
+        :type project_id: str
+        :param location: location the job is running
+        :type location: str
+        """
+        client = self.get_client(project_id=project_id, location=location)
+        job = client.get_job(
+            job_id=job_id,
+            project=project_id,
+            location=location
+        )
+        return job
+
     @GoogleBaseHook.fallback_to_default_project_id
     def insert_job(
         self,
         configuration: Dict,
+        job_id: Optional[str] = None,
         project_id: Optional[str] = None,
         location: Optional[str] = None,
-    ) -> str:
+    ) -> Union[CopyJob, QueryJob, LoadJob, ExtractJob]:
         """
         Executes a BigQuery job. Waits for the job to complete and returns job id.
         See here:
@@ -1414,17 +1443,23 @@ def insert_job(
             BigQuery's configuration field in the job object. See
             https://cloud.google.com/bigquery/docs/reference/v2/jobs for
             details.
+        :type configuration: Dict[str, Any]
+        :param job_id: The ID of the job. The ID must contain only letters (a-z, A-Z),
+            numbers (0-9), underscores (_), or dashes (-). The maximum length is 1,024
+            characters. If not provided then a uuid will be generated.
+        :type job_id: str
         :param project_id: Google Cloud Project where the job is running
         :type project_id: str
         :param location: location the job is running
         :type location: str
         """
+        job_id = job_id or str(uuid.uuid4())
         location = location or self.location
         client = self.get_client(project_id=project_id, location=location)
         job_data = {
             "configuration": configuration,
             "jobReference": {
-                "jobId": str(uuid.uuid4()),
+                "jobId": job_id,
                 "projectId": project_id,
                 "location": location
             }
@@ -1446,9 +1481,7 @@ def insert_job(
         if not job:
             raise AirflowException(f"Unknown job type. Supported types: {supported_jobs.keys()}")
         job = job.from_api_repr(job_data, client)
-        # Start the job and wait for it to complete and get the result.
-        job.result()
-        return job.job_id
+        return job

     def run_with_configuration(self, configuration: Dict) -> str:
         """
@@ -1467,8 +1500,11 @@ def run_with_configuration(self, configuration: Dict) -> str:
             "This method is deprecated. Please use `BigQueryHook.insert_job`",
             DeprecationWarning
         )
-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        self.running_job_id = job.job_id
+        return job.job_id

     def run_load(self,  # pylint: disable=too-many-locals,too-many-arguments,invalid-name
                  destination_project_dataset_table: str,
@@ -1709,8 +1745,11 @@ def run_load(self,  # pylint: disable=too-many-locals,too-many-arguments,invalid-name
         if allow_jagged_rows:
             configuration['load']['allowJaggedRows'] = allow_jagged_rows

-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        self.running_job_id = job.job_id
+        return job.job_id

     def run_copy(self,  # pylint: disable=invalid-name
                  source_project_dataset_tables: Union[List, str],
@@ -1803,8 +1842,11 @@ def run_copy(self,  # pylint: disable=invalid-name
             "destinationEncryptionConfiguration"
         ] = encryption_configuration

-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        self.running_job_id = job.job_id
+        return job.job_id

     def run_extract(
         self,
@@ -1878,8 +1920,9 @@ def run_extract(
         configuration['extract']['fieldDelimiter'] = field_delimiter
         configuration['extract']['printHeader'] = print_header

-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        self.running_job_id = job.job_id
+        return job.job_id

     # pylint: disable=too-many-locals,too-many-arguments, too-many-branches
     def run_query(self,
@@ -2123,8 +2166,11 @@ def run_query(self,
             "destinationEncryptionConfiguration"
         ] = encryption_configuration

-        self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
-        return self.running_job_id
+        job = self.insert_job(configuration=configuration, project_id=self.project_id)
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        self.running_job_id = job.job_id
+        return job.job_id


 class BigQueryPandasConnector(GbqConnector):
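
The behavioural shift in the hook is easiest to see at a call site: insert_job no longer blocks, it returns a google-cloud-bigquery job object, and most of the run_* wrappers now call job.result() themselves. A short sketch of the new contract, assuming default credentials and a hypothetical project id:

from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook

hook = BigQueryHook(gcp_conn_id="google_cloud_default")

# insert_job() now returns an unstarted job object instead of a job id string.
job = hook.insert_job(
    configuration={"query": {"query": "SELECT 1", "useLegacySql": False}},
    job_id="example_job_id",  # optional; a uuid is generated when omitted
    project_id="my-project",  # hypothetical project
)

# Starting the job and waiting for completion is now the caller's decision.
job.result()
print(job.job_id)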

airflow/providers/google/cloud/operators/bigquery.py

Lines changed: 82 additions & 0 deletions

@@ -23,10 +23,12 @@
 import enum
 import json
 import warnings
+from time import sleep
 from typing import Any, Dict, Iterable, List, Optional, SupportsAbs, Union

 import attr
 from google.api_core.exceptions import Conflict
+from google.api_core.retry import exponential_sleep_generator
 from google.cloud.bigquery import TableReference

 from airflow.exceptions import AirflowException
@@ -546,6 +548,11 @@ def __init__(self,
                           "the gcp_conn_id parameter.", DeprecationWarning, stacklevel=3)
             gcp_conn_id = bigquery_conn_id

+        warnings.warn(
+            "This operator is deprecated. Please use `BigQueryInsertJobOperator`.",
+            DeprecationWarning, stacklevel=3,
+        )
+
         self.sql = sql
         self.destination_dataset_table = destination_dataset_table
         self.write_disposition = write_disposition
@@ -1570,3 +1577,78 @@ def execute(self, context):
             table_resource=self.table_resource,
             project_id=self.project_id,
         )
+
+
+class BigQueryInsertJobOperator(BaseOperator):
+    """
+    Executes a BigQuery job. Waits for the job to complete and returns job id.
+    See here:
+
+    https://cloud.google.com/bigquery/docs/reference/v2/jobs
+
+    :param configuration: The configuration parameter maps directly to BigQuery's
+        configuration field in the job object. For more details see
+        https://cloud.google.com/bigquery/docs/reference/v2/jobs
+    :type configuration: Dict[str, Any]
+    :param job_id: The ID of the job. The ID must contain only letters (a-z, A-Z),
+        numbers (0-9), underscores (_), or dashes (-). The maximum length is 1,024
+        characters. If not provided then a uuid will be generated.
+    :type job_id: str
+    :param project_id: Google Cloud Project where the job is running
+    :type project_id: str
+    :param location: location the job is running
+    :type location: str
+    :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud Platform.
+    :type gcp_conn_id: str
+    """
+
+    template_fields = ("configuration", "job_id")
+    ui_color = BigQueryUIColors.QUERY.value
+
+    def __init__(
+        self,
+        configuration: Dict[str, Any],
+        project_id: Optional[str] = None,
+        location: Optional[str] = None,
+        job_id: Optional[str] = None,
+        gcp_conn_id: str = 'google_cloud_default',
+        delegate_to: Optional[str] = None,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+        self.configuration = configuration
+        self.location = location
+        self.job_id = job_id
+        self.project_id = project_id
+        self.gcp_conn_id = gcp_conn_id
+        self.delegate_to = delegate_to
+
+    def execute(self, context: Any):
+        hook = BigQueryHook(
+            gcp_conn_id=self.gcp_conn_id,
+            delegate_to=self.delegate_to,
+        )
+
+        try:
+            job = hook.insert_job(
+                configuration=self.configuration,
+                project_id=self.project_id,
+                location=self.location,
+                job_id=self.job_id,
+            )
+            # Start the job and wait for it to complete and get the result.
+            job.result()
+        except Conflict:
+            job = hook.get_job(
+                project_id=self.project_id,
+                location=self.location,
+                job_id=self.job_id,
+            )
+            # Get the existing job and wait for it to be ready.
+            for time_to_wait in exponential_sleep_generator(initial=10, maximum=120):
+                sleep(time_to_wait)
+                job.reload()
+                if job.done():
+                    break
+        return job.job_id
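
The try/except Conflict block above is the operator's idempotency mechanism: resubmitting a job_id that already exists raises Conflict, and the operator then polls the existing job (get_job plus exponential_sleep_generator) instead of failing. A hedged sketch of how a DAG author might lean on that, with an illustrative fixed job id:

from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator

# Inside a DAG context. With a fixed job_id, a retried task reattaches to the
# job already running (or finished) in BigQuery rather than submitting a duplicate.
select_job = BigQueryInsertJobOperator(
    task_id="select_job",
    job_id="nightly_select_job",  # illustrative fixed id, not from the commit
    configuration={
        "query": {"query": "SELECT 1", "useLegacySql": False}
    },
)

Note that with a truly fixed id a second DAG run would also reattach to the first run's job; templating the id (see the docs sketch below) keeps idempotency scoped to a single run.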

docs/howto/operator/gcp/bigquery.rst

Lines changed: 14 additions & 23 deletions

@@ -241,10 +241,10 @@
     :start-after: [START howto_operator_bigquery_delete_view]
     :end-before: [END howto_operator_bigquery_delete_view]

-.. _howto/operator:BigQueryExecuteQueryOperator:
+.. _howto/operator:BigQueryInsertJobOperator:

-Execute queries
-^^^^^^^^^^^^^^^
+Execute BigQuery jobs
+^^^^^^^^^^^^^^^^^^^^^

 Let's say you would like to execute the following query.

@@ -255,32 +255,23 @@ Let's say you would like to execute the following query.
     :end-before: [END howto_operator_bigquery_query]

 To execute the SQL query in a specific BigQuery database you can use
-:class:`~airflow.providers.google.cloud.operators.bigquery.BigQueryExecuteQueryOperator`.
+:class:`~airflow.providers.google.cloud.operators.bigquery.BigQueryInsertJobOperator` with
+proper query job configuration.

 .. exampleinclude:: ../../../../airflow/providers/google/cloud/example_dags/example_bigquery_queries.py
     :language: python
     :dedent: 4
-    :start-after: [START howto_operator_bigquery_execute_query]
-    :end-before: [END howto_operator_bigquery_execute_query]
-
-``sql`` argument can receive a str representing a sql statement, a list of str
-(sql statements), or reference to a template file. Template reference are recognized
-by str ending in '.sql'.
+    :start-after: [START howto_operator_bigquery_insert_job]
+    :end-before: [END howto_operator_bigquery_insert_job]

-.. exampleinclude:: ../../../../airflow/providers/google/cloud/example_dags/example_bigquery_queries.py
-    :language: python
-    :dedent: 4
-    :start-after: [START howto_operator_bigquery_execute_query_list]
-    :end-before: [END howto_operator_bigquery_execute_query_list]
+For more information on the types of BigQuery jobs please check the
+`documentation <https://cloud.google.com/bigquery/docs/reference/v2/jobs>`__.

-You can store the results of the query in a table by specifying
-``destination_dataset_table``.
-
-.. exampleinclude:: ../../../../airflow/providers/google/cloud/example_dags/example_bigquery_queries.py
-    :language: python
-    :dedent: 4
-    :start-after: [START howto_operator_bigquery_execute_query_save]
-    :end-before: [END howto_operator_bigquery_execute_query_save]
+Additionally you can use the ``job_id`` parameter of
+:class:`~airflow.providers.google.cloud.operators.bigquery.BigQueryInsertJobOperator` to improve
+idempotency. If this parameter is not passed then a uuid will be used as ``job_id``. If provided,
+the operator will try to submit a new job with this ``job_id``. If there is already a job with
+such a ``job_id`` then it will reattach to the existing job.

 Validate data
 ^^^^^^^^^^^^^
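Since ``configuration`` and ``job_id`` are both template fields, one way to get the per-run idempotency described in the docs hunk above is to embed the logical date in the id. A sketch under that assumption; the id format is illustrative, not part of the commit:

from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator

insert_query_job = BigQueryInsertJobOperator(
    task_id="insert_query_job",
    # Renders to e.g. "insert_query_job_20200530": stable across retries of the
    # same run, unique across runs (letters, digits, dashes, underscores only).
    job_id="insert_query_job_{{ ds_nodash }}",
    configuration={
        "query": {
            "query": "SELECT 1",  # placeholder query
            "useLegacySql": False,
        }
    },
)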