Skip to content

Commit c281979

Browse files
rachael-ds and mik-laj
authored
Documentation and example dag for CloudDLPDeidentifyContentOperator, GCSObjectExistenceSensor, GCSObjectsWithPrefixExistenceSensor (#14033)
* Add documentation and example dag for: CloudDLPDeidentifyContentOperator, GCSObjectExistenceSensor, GCSObjectsWtihPrefixExistenceSensor * Moving gcs sensor docs and example dags to gcs operators docs/example dags * Add system tests for dlp and gcs * Adding further information on DLPDeidentifyContent operators * Pre-Commit tidyup: Renamed gcs/dlp system tests * Apply suggestions from code review Co-authored-by: Kamil Breguła <mik-laj@users.noreply.github.com> * reverting some changes following code review * removed redundant @pytest.mark.system("google.cloud") * removed operators with newly added examples from missing examples list (pytest fix) * updated all references to GCSObjectsWtihPrefixExistenceSensor (typo) to newly fixed: GCSObjectsWithPrefixExistenceSensor * fixing merge issue: including deprecated operator to be excluded from test suite of operators Co-authored-by: rachael-ds <rachael_ds@outlook.com> Co-authored-by: Kamil Breguła <mik-laj@users.noreply.github.com>
1 parent b995127 commit c281979

File tree

8 files changed

+152
-16
lines changed

8 files changed

+152
-16
lines changed

airflow/providers/google/cloud/example_dags/example_dlp.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
CloudDLPCreateInspectTemplateOperator,
3434
CloudDLPCreateJobTriggerOperator,
3535
CloudDLPCreateStoredInfoTypeOperator,
36+
CloudDLPDeidentifyContentOperator,
3637
CloudDLPDeleteInspectTemplateOperator,
3738
CloudDLPDeleteJobTriggerOperator,
3839
CloudDLPDeleteStoredInfoTypeOperator,
@@ -177,3 +178,33 @@
177178
)
178179
# [END howto_operator_dlp_delete_job_trigger]
179180
create_trigger >> update_trigger >> delete_trigger
181+
182+
# [START dlp_deidentify_config_example]
183+
DEIDENTIFY_CONFIG = {
184+
"info_type_transformations": {
185+
"transformations": [
186+
{
187+
"primitive_transformation": {
188+
"replace_config": {"new_value": {"string_value": "[deidentified_number]"}}
189+
}
190+
}
191+
]
192+
}
193+
}
194+
# [END dlp_deidentify_config_example]
195+
196+
with models.DAG(
197+
"example_gcp_dlp_deidentify_content",
198+
schedule_interval=None,
199+
start_date=days_ago(1),
200+
tags=["example", "dlp", "deidentify"],
201+
) as dag4:
202+
# [START _howto_operator_dlp_deidentify_content]
203+
deidentify_content = CloudDLPDeidentifyContentOperator(
204+
project_id=GCP_PROJECT,
205+
item=ITEM,
206+
deidentify_config=DEIDENTIFY_CONFIG,
207+
inspect_config=INSPECT_CONFIG,
208+
task_id="deidentify_content",
209+
)
210+
# [END _howto_operator_dlp_deidentify_content]

airflow/providers/google/cloud/example_dags/example_gcs.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@
3232
GCSListObjectsOperator,
3333
GCSObjectCreateAclEntryOperator,
3434
)
35+
from airflow.providers.google.cloud.sensors.gcs import (
36+
GCSObjectExistenceSensor,
37+
GCSObjectsWithPrefixExistenceSensor,
38+
)
3539
from airflow.providers.google.cloud.transfers.gcs_to_gcs import GCSToGCSOperator
3640
from airflow.providers.google.cloud.transfers.gcs_to_local import GCSToLocalFilesystemOperator
3741
from airflow.providers.google.cloud.transfers.local_to_gcs import LocalFilesystemToGCSOperator
@@ -48,6 +52,7 @@
4852

4953
PATH_TO_TRANSFORM_SCRIPT = os.environ.get('GCP_GCS_PATH_TO_TRANSFORM_SCRIPT', 'test.py')
5054
PATH_TO_UPLOAD_FILE = os.environ.get("GCP_GCS_PATH_TO_UPLOAD_FILE", "test-gcs-example.txt")
55+
PATH_TO_UPLOAD_FILE_PREFIX = os.environ.get("GCP_GCS_PATH_TO_UPLOAD_FILE_PREFIX", "test-gcs-")
5156
PATH_TO_SAVED_FILE = os.environ.get("GCP_GCS_PATH_TO_SAVED_FILE", "test-gcs-example-download.txt")
5257

5358
BUCKET_FILE_LOCATION = PATH_TO_UPLOAD_FILE.rpartition("/")[-1]
@@ -151,6 +156,41 @@
151156
copy_file >> delete_bucket_2
152157
delete_files >> delete_bucket_1
153158

159+
with models.DAG(
160+
"example_gcs_sensors",
161+
start_date=days_ago(1),
162+
schedule_interval=None,
163+
tags=['example'],
164+
) as dag2:
165+
create_bucket = GCSCreateBucketOperator(
166+
task_id="create_bucket", bucket_name=BUCKET_1, project_id=PROJECT_ID
167+
)
168+
upload_file = LocalFilesystemToGCSOperator(
169+
task_id="upload_file",
170+
src=PATH_TO_UPLOAD_FILE,
171+
dst=BUCKET_FILE_LOCATION,
172+
bucket=BUCKET_1,
173+
)
174+
# [START howto_sensor_object_exists_task]
175+
gcs_object_exists = GCSObjectExistenceSensor(
176+
bucket=BUCKET_1,
177+
object=PATH_TO_UPLOAD_FILE,
178+
mode='poke',
179+
task_id="gcs_object_exists_task",
180+
)
181+
# [END howto_sensor_object_exists_task]
182+
# [START howto_sensor_object_with_prefix_exists_task]
183+
gcs_object_with_prefix_exists = GCSObjectsWithPrefixExistenceSensor(
184+
bucket=BUCKET_1,
185+
prefix=PATH_TO_UPLOAD_FILE_PREFIX,
186+
mode='poke',
187+
task_id="gcs_object_with_prefix_exists_task",
188+
)
189+
# [END howto_sensor_object_with_prefix_exists_task]
190+
delete_bucket = GCSDeleteBucketOperator(task_id="delete_bucket", bucket_name=BUCKET_1)
191+
192+
create_bucket >> upload_file >> [gcs_object_exists, gcs_object_with_prefix_exists] >> delete_bucket
193+
154194

155195
if __name__ == '__main__':
156196
dag.clear(dag_run_state=State.NONE)

docs/apache-airflow-providers-google/operators/cloud/data_loss_prevention.rst

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -296,10 +296,25 @@ Unlike storage methods (Jobs) content method are synchronous, stateless methods.
296296

297297
De-identify Content
298298
"""""""""""""""""""
299+
De-identification is the process of removing identifying information from data.
300+
Configuration information defines how you want the sensitive data de-identified.
299301

300-
To de-identify potentially sensitive info from a content item, you can use
302+
This config can either be saved and persisted in de-identification templates or defined in a :class:`~google.cloud.dlp_v2.types.DeidentifyConfig` object:
303+
304+
.. literalinclude:: /../../airflow/providers/google/cloud/example_dags/example_dlp.py
305+
:language: python
306+
:start-after: [START dlp_deidentify_config_example]
307+
:end-before: [END dlp_deidentify_config_example]
308+
309+
To de-identify potentially sensitive information from a content item, you can use
301310
:class:`~airflow.providers.google.cloud.operators.dlp.CloudDLPDeidentifyContentOperator`.
302311

312+
.. exampleinclude:: /../../airflow/providers/google/cloud/example_dags/example_dlp.py
313+
:language: python
314+
:dedent: 4
315+
:start-after: [START _howto_operator_dlp_deidentify_content]
316+
:end-before: [END _howto_operator_dlp_deidentify_content]
317+
303318
.. _howto/operator:CloudDLPReidentifyContentOperator:
304319

305320
Re-identify Content

docs/apache-airflow-providers-google/operators/cloud/gcs.rst

Lines changed: 54 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@
2020
Google Cloud Storage Operators
2121
==============================
2222

23+
Cloud Storage allows world-wide storage and retrieval of any amount of data at any time.
24+
You can use Cloud Storage for a range of scenarios including serving website content,
25+
storing data for archival and disaster recovery, or distributing large data objects to users via direct download.
26+
2327
.. contents::
2428
:depth: 1
2529
:local:
@@ -29,6 +33,9 @@ Prerequisite Tasks
2933

3034
.. include:: /operators/_partials/prerequisite_tasks.rst
3135
36+
Operators
37+
^^^^^^^^^
38+
3239
.. _howto/operator:GCSToBigQueryOperator:
3340

3441
GCSToBigQueryOperator
@@ -111,13 +118,6 @@ More information
111118
See Google Cloud Storage insert documentation to `create an ACL entry for ObjectAccess
112119
<https://cloud.google.com/storage/docs/json_api/v1/objectAccessControls/insert>`_.
113120

114-
Reference
115-
---------
116-
117-
For further information, look at:
118-
119-
* `Client Library Documentation <https://googleapis.github.io/google-cloud-python/latest/storage/index.html>`__
120-
* `Product Documentation <https://cloud.google.com/storage/docs/>`__
121121

122122
.. _howto/operator:GCSDeleteBucketOperator:
123123

@@ -134,14 +134,60 @@ It is performed through the
134134
:start-after: [START howto_operator_gcs_delete_bucket]
135135
:end-before: [END howto_operator_gcs_delete_bucket]
136136

137+
137138
You can use :ref:`Jinja templating <jinja-templating>` with
138139
:template-fields:`airflow.providers.google.cloud.operators.gcs.GCSDeleteBucketOperator`
139140
parameters which allows you to dynamically determine values.
140141

141142
Reference
142-
^^^^^^^^^
143+
---------
143144

144145
For further information, look at:
145146

146147
* `Client Library Documentation <https://googleapis.dev/python/storage/latest/buckets.html>`__
147148
* `Product Documentation <https://cloud.google.com/storage/docs/json_api/v1/buckets>`__
149+
150+
Sensors
151+
^^^^^^^
152+
153+
.. _howto/sensor:GCSObjectExistenceSensor:
154+
155+
GCSObjectExistenceSensor
156+
------------------------
157+
158+
Use the :class:`~airflow.providers.google.cloud.sensors.gcs.GCSObjectExistenceSensor` to wait (poll) for the existence of a file in Google Cloud Storage.
159+
160+
.. exampleinclude:: /../../airflow/providers/google/cloud/example_dags/example_gcs.py
161+
:language: python
162+
:dedent: 4
163+
:start-after: [START howto_sensor_object_exists_task]
164+
:end-before: [END howto_sensor_object_exists_task]
165+
166+
.. _howto/sensor:GCSObjectsWithPrefixExistenceSensor:
167+
168+
GCSObjectsWithPrefixExistenceSensor
169+
-----------------------------------
170+
171+
Use the :class:`~airflow.providers.google.cloud.sensors.gcs.GCSObjectsWithPrefixExistenceSensor` to wait (poll) for the existence of a file with a specified prefix in Google Cloud Storage.
172+
173+
.. exampleinclude:: /../../airflow/providers/google/cloud/example_dags/example_gcs.py
174+
:language: python
175+
:dedent: 4
176+
:start-after: [START howto_sensor_object_with_prefix_exists_task]
177+
:end-before: [END howto_sensor_object_with_prefix_exists_task]
178+
179+
More information
180+
""""""""""""""""
181+
182+
Sensors have different modes that determine the behaviour of resources while the task is executing.
183+
See `Airflow sensors documentation
184+
<https://airflow.apache.org/docs/apache-airflow/stable/concepts.html#sensors>`_ for best practices when using sensors.
185+
186+
187+
Reference
188+
^^^^^^^^^
189+
190+
For further information, look at:
191+
192+
* `Client Library Documentation <https://googleapis.github.io/google-cloud-python/latest/storage/index.html>`__
193+
* `Product Documentation <https://cloud.google.com/storage/docs/>`__

docs/conf.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -152,9 +152,7 @@
152152
'README.rst',
153153
]
154154
elif PACKAGE_NAME.startswith('apache-airflow-providers-'):
155-
exclude_patterns = [
156-
'operators/_partials',
157-
]
155+
exclude_patterns = ['operators/_partials']
158156
else:
159157
exclude_patterns = []
160158

tests/always/test_project_structure.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,8 @@ class TestGoogleProviderProjectStructure(unittest.TestCase):
169169
# Deprecated operator. Ignore it.
170170
'airflow.providers.google.cloud.operators.cloud_storage_transfer_service'
171171
'.CloudDataTransferServiceGCSToGCSOperator',
172+
# Deprecated operator. Ignore it.
173+
'airflow.providers.google.cloud.sensors.gcs.GCSObjectsWtihPrefixExistenceSensor',
172174
# Base operator. Ignore it.
173175
'airflow.providers.google.cloud.operators.cloud_sql.CloudSQLBaseOperator',
174176
# Deprecated operator. Ignore it
@@ -198,7 +200,6 @@ class TestGoogleProviderProjectStructure(unittest.TestCase):
198200
'airflow.providers.google.cloud.operators.dlp.CloudDLPCreateDeidentifyTemplateOperator',
199201
'airflow.providers.google.cloud.operators.dlp.CloudDLPCreateDLPJobOperator',
200202
'airflow.providers.google.cloud.operators.dlp.CloudDLPUpdateDeidentifyTemplateOperator',
201-
'airflow.providers.google.cloud.operators.dlp.CloudDLPDeidentifyContentOperator',
202203
'airflow.providers.google.cloud.operators.dlp.CloudDLPGetDLPJobTriggerOperator',
203204
'airflow.providers.google.cloud.operators.dlp.CloudDLPListDeidentifyTemplatesOperator',
204205
'airflow.providers.google.cloud.operators.dlp.CloudDLPGetDeidentifyTemplateOperator',
@@ -218,10 +219,7 @@ class TestGoogleProviderProjectStructure(unittest.TestCase):
218219
'airflow.providers.google.cloud.operators.datastore.CloudDatastoreGetOperationOperator',
219220
# Base operator. Ignore it
220221
'airflow.providers.google.cloud.operators.compute.ComputeEngineBaseOperator',
221-
'airflow.providers.google.cloud.sensors.gcs.GCSObjectExistenceSensor',
222222
'airflow.providers.google.cloud.sensors.gcs.GCSObjectUpdateSensor',
223-
'airflow.providers.google.cloud.sensors.gcs.GCSObjectsWithPrefixExistenceSensor',
224-
'airflow.providers.google.cloud.sensors.gcs.GCSObjectsWtihPrefixExistenceSensor',
225223
'airflow.providers.google.cloud.sensors.gcs.GCSUploadSessionCompleteSensor',
226224
}
227225

tests/providers/google/cloud/operators/test_dlp_system.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,7 @@ def test_run_example_info_types(self):
5151
@provide_gcp_context(GCP_DLP_KEY)
5252
def test_run_example_dlp_job(self):
5353
self.run_dag('example_gcp_dlp_job', CLOUD_DAG_FOLDER)
54+
55+
@provide_gcp_context(GCP_DLP_KEY)
56+
def test_run_example_dlp_deidentify_content(self):
57+
self.run_dag('example_gcp_dlp_deidentify_content', CLOUD_DAG_FOLDER)

tests/providers/google/cloud/operators/test_gcs_system.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,3 +41,7 @@ def tearDown(self):
4141
@provide_gcp_context(GCP_GCS_KEY)
4242
def test_run_example_dag(self):
4343
self.run_dag('example_gcs', CLOUD_DAG_FOLDER)
44+
45+
@provide_gcp_context(GCP_GCS_KEY)
46+
def test_run_example_gcs_sensor_dag(self):
47+
self.run_dag('example_gcs_sensors', CLOUD_DAG_FOLDER)

0 commit comments

Comments
 (0)