Skip to content

Commit b777514

Browse files
danielvdendeDaniel van der Ende
andauthored
Add handling state of existing Dataproc batch (#24924)
This change avoids Airflow marking tasks as 'Success' even if the existing Batch is in a 'Failed' state. We check the various states, and ensure that the Airflow task state reflects the actual state of the Dataproc Batch. Co-authored-by: Daniel van der Ende <daniel.van.der.ende@mollie.com>
1 parent 1c7a4ac commit b777514

File tree

2 files changed

+46
-0
lines changed

2 files changed

+46
-0
lines changed

β€Žairflow/providers/google/cloud/operators/dataproc.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2049,6 +2049,22 @@ def execute(self, context: 'Context'):
20492049
timeout=self.timeout,
20502050
metadata=self.metadata,
20512051
)
2052+
2053+
# The existing batch may be a in a number of states other than 'SUCCEEDED'
2054+
if result.state != Batch.State.SUCCEEDED:
2055+
if result.state == Batch.State.FAILED or result.state == Batch.State.CANCELLED:
2056+
raise AirflowException(
2057+
f"Existing Batch {self.batch_id} failed or cancelled. "
2058+
f"Error: {result.state_message}"
2059+
)
2060+
else:
2061+
# Batch state is either: RUNNING, PENDING, CANCELLING, or UNSPECIFIED
2062+
self.log.info(
2063+
f"Batch {self.batch_id} is in state {result.state.name}."
2064+
"Waiting for state change..."
2065+
)
2066+
result = hook.wait_for_operation(timeout=self.timeout, operation=result)
2067+
20522068
batch_id = self.batch_id or result.name.split('/')[-1]
20532069
DataprocLink.persist(context=context, task_instance=self, url=DATAPROC_BATCH_LINK, resource=batch_id)
20542070
return Batch.to_dict(result)

β€Žtests/providers/google/cloud/operators/test_dataproc.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import pytest
2424
from google.api_core.exceptions import AlreadyExists, NotFound
2525
from google.api_core.retry import Retry
26+
from google.cloud.dataproc_v1 import Batch
2627

2728
from airflow import AirflowException
2829
from airflow.exceptions import AirflowTaskTimeout
@@ -1658,6 +1659,35 @@ def test_execute(self, mock_hook, to_dict_mock):
16581659
metadata=METADATA,
16591660
)
16601661

1662+
@mock.patch(DATAPROC_PATH.format("Batch.to_dict"))
1663+
@mock.patch(DATAPROC_PATH.format("DataprocHook"))
1664+
def test_execute_batch_failed(self, mock_hook, to_dict_mock):
1665+
op = DataprocCreateBatchOperator(
1666+
task_id=TASK_ID,
1667+
gcp_conn_id=GCP_CONN_ID,
1668+
impersonation_chain=IMPERSONATION_CHAIN,
1669+
region=GCP_LOCATION,
1670+
project_id=GCP_PROJECT,
1671+
batch=BATCH,
1672+
batch_id=BATCH_ID,
1673+
request_id=REQUEST_ID,
1674+
retry=RETRY,
1675+
timeout=TIMEOUT,
1676+
metadata=METADATA,
1677+
)
1678+
mock_hook.return_value.create_batch.side_effect = AlreadyExists("")
1679+
mock_hook.return_value.get_batch.return_value.state = Batch.State.FAILED
1680+
with pytest.raises(AirflowException):
1681+
op.execute(context=MagicMock())
1682+
mock_hook.return_value.get_batch.assert_called_once_with(
1683+
batch_id=BATCH_ID,
1684+
region=GCP_LOCATION,
1685+
project_id=GCP_PROJECT,
1686+
retry=RETRY,
1687+
timeout=TIMEOUT,
1688+
metadata=METADATA,
1689+
)
1690+
16611691

16621692
class TestDataprocDeleteBatchOperator:
16631693
@mock.patch(DATAPROC_PATH.format("DataprocHook"))

0 commit comments

Comments
 (0)