19
19
20
20
from unittest import mock
21
21
22
+ import pytest
23
+
22
24
from airflow .providers .google .cloud .transfers .s3_to_gcs import S3ToGCSOperator
23
25
24
26
# Identifiers and S3-side settings shared by every test in this module.
TASK_ID = "test-s3-gcs-operator"
S3_BUCKET = "test-bucket"
S3_PREFIX = "TEST"
S3_DELIMITER = "/"

# GCS destination, built up from its parts so tests can assert on the bucket
# name and the object prefix separately.
GCS_BUCKET = "gcs-bucket"
GCS_BUCKET_URI = "gs://" + GCS_BUCKET
GCS_PREFIX = "data/"
GCS_PATH_PREFIX = GCS_BUCKET_URI + "/" + GCS_PREFIX

# Object names returned by the mocked S3 listing.
MOCK_FILE_1 = "TEST1.csv"
MOCK_FILE_2 = "TEST2.csv"
MOCK_FILE_3 = "TEST3.csv"
MOCK_FILES = [MOCK_FILE_1, MOCK_FILE_2, MOCK_FILE_3]

AWS_CONN_ID = "aws_default"
GCS_CONN_ID = "google_cloud_default"
IMPERSONATION_CHAIN = ["ACCOUNT_1", "ACCOUNT_2", "ACCOUNT_3"]
APPLY_GCS_PREFIX = False

# Shared (argnames, argvalues) pair, unpacked into ``pytest.mark.parametrize``.
# Each case is (apply_gcs_prefix, s3_prefix, s3_object, gcs_destination,
# gcs_object). In these cases the expected GCS object keeps the S3 prefix when
# apply_gcs_prefix is False and drops it when apply_gcs_prefix is True.
PARAMETRIZED_OBJECT_PATHS = (
    "apply_gcs_prefix, s3_prefix, s3_object, gcs_destination, gcs_object",
    [
        (False, "", MOCK_FILE_1, GCS_PATH_PREFIX, GCS_PREFIX + MOCK_FILE_1),
        (False, S3_PREFIX, MOCK_FILE_1, GCS_PATH_PREFIX, GCS_PREFIX + S3_PREFIX + MOCK_FILE_1),
        (False, "", MOCK_FILE_1, GCS_BUCKET_URI, MOCK_FILE_1),
        (False, S3_PREFIX, MOCK_FILE_1, GCS_BUCKET_URI, S3_PREFIX + MOCK_FILE_1),
        (True, "", MOCK_FILE_1, GCS_PATH_PREFIX, GCS_PREFIX + MOCK_FILE_1),
        (True, S3_PREFIX, MOCK_FILE_1, GCS_PATH_PREFIX, GCS_PREFIX + MOCK_FILE_1),
        (True, "", MOCK_FILE_1, GCS_BUCKET_URI, MOCK_FILE_1),
        (True, S3_PREFIX, MOCK_FILE_1, GCS_BUCKET_URI, MOCK_FILE_1),
    ],
)
33
55
34
56
35
57
class TestS3ToGoogleCloudStorageOperator :
@@ -44,6 +66,7 @@ def test_init(self):
44
66
gcp_conn_id = GCS_CONN_ID ,
45
67
dest_gcs = GCS_PATH_PREFIX ,
46
68
google_impersonation_chain = IMPERSONATION_CHAIN ,
69
+ apply_gcs_prefix = APPLY_GCS_PREFIX ,
47
70
)
48
71
49
72
assert operator .task_id == TASK_ID
@@ -53,6 +76,7 @@ def test_init(self):
53
76
assert operator .gcp_conn_id == GCS_CONN_ID
54
77
assert operator .dest_gcs == GCS_PATH_PREFIX
55
78
assert operator .google_impersonation_chain == IMPERSONATION_CHAIN
79
+ assert operator .apply_gcs_prefix == APPLY_GCS_PREFIX
56
80
57
81
@mock .patch ("airflow.providers.google.cloud.transfers.s3_to_gcs.S3Hook" )
58
82
@mock .patch ("airflow.providers.amazon.aws.operators.s3.S3Hook" )
@@ -73,12 +97,12 @@ def test_execute(self, gcs_mock_hook, s3_one_mock_hook, s3_two_mock_hook):
73
97
s3_one_mock_hook .return_value .list_keys .return_value = MOCK_FILES
74
98
s3_two_mock_hook .return_value .list_keys .return_value = MOCK_FILES
75
99
76
- uploaded_files = operator .execute (None )
100
+ uploaded_files = operator .execute (context = {} )
77
101
gcs_mock_hook .return_value .upload .assert_has_calls (
78
102
[
79
- mock .call ("gcs-bucket" , "data/TEST1.csv" , mock .ANY , gzip = False ),
80
- mock .call ("gcs-bucket" , "data/TEST3.csv" , mock .ANY , gzip = False ),
81
- mock .call ("gcs-bucket" , "data/TEST2.csv" , mock .ANY , gzip = False ),
103
+ mock .call (GCS_BUCKET , GCS_PREFIX + MOCK_FILE_1 , mock .ANY , gzip = False ),
104
+ mock .call (GCS_BUCKET , GCS_PREFIX + MOCK_FILE_2 , mock .ANY , gzip = False ),
105
+ mock .call (GCS_BUCKET , GCS_PREFIX + MOCK_FILE_3 , mock .ANY , gzip = False ),
82
106
],
83
107
any_order = True ,
84
108
)
@@ -112,16 +136,118 @@ def test_execute_with_gzip(self, gcs_mock_hook, s3_one_mock_hook, s3_two_mock_ho
112
136
s3_one_mock_hook .return_value .list_keys .return_value = MOCK_FILES
113
137
s3_two_mock_hook .return_value .list_keys .return_value = MOCK_FILES
114
138
115
- operator .execute (None )
139
+ operator .execute (context = {} )
116
140
gcs_mock_hook .assert_called_once_with (
117
141
gcp_conn_id = GCS_CONN_ID ,
118
142
impersonation_chain = None ,
119
143
)
120
144
gcs_mock_hook .return_value .upload .assert_has_calls (
121
145
[
122
- mock .call ("gcs-bucket" , "data/TEST2.csv" , mock .ANY , gzip = True ),
123
- mock .call ("gcs-bucket" , "data/TEST1.csv" , mock .ANY , gzip = True ),
124
- mock .call ("gcs-bucket" , "data/TEST3.csv" , mock .ANY , gzip = True ),
146
+ mock .call (GCS_BUCKET , GCS_PREFIX + MOCK_FILE_1 , mock .ANY , gzip = True ),
147
+ mock .call (GCS_BUCKET , GCS_PREFIX + MOCK_FILE_2 , mock .ANY , gzip = True ),
148
+ mock .call (GCS_BUCKET , GCS_PREFIX + MOCK_FILE_3 , mock .ANY , gzip = True ),
125
149
],
126
150
any_order = True ,
127
151
)
152
+
153
@pytest.mark.parametrize(
    "source_objects, existing_objects, objects_expected",
    [
        (MOCK_FILES, [], MOCK_FILES),
        (MOCK_FILES, [MOCK_FILE_1], [MOCK_FILE_2, MOCK_FILE_3]),
        (MOCK_FILES, [MOCK_FILE_1, MOCK_FILE_2], [MOCK_FILE_3]),
        (MOCK_FILES, [MOCK_FILE_3, MOCK_FILE_2], [MOCK_FILE_1]),
        (MOCK_FILES, MOCK_FILES, []),
    ],
)
@mock.patch("airflow.providers.google.cloud.transfers.s3_to_gcs.GCSHook")
def test_exclude_existing_objects(
    self, mock_gcs_hook, source_objects, existing_objects, objects_expected
):
    """Check that ``exclude_existing_objects`` drops objects the GCS hook lists.

    Each case supplies the S3 object names, the names the mocked hook's
    ``list`` call returns, and the names expected to remain. The comparison
    is done on sets, so ordering of the result is not checked.
    """
    operator = S3ToGCSOperator(
        task_id=TASK_ID,
        bucket=S3_BUCKET,
        prefix=S3_PREFIX,
        delimiter=S3_DELIMITER,
        gcp_conn_id=GCS_CONN_ID,
        dest_gcs=GCS_PATH_PREFIX,
        gzip=True,
    )
    # The patched class mock itself is handed over as the hook, so ``list``
    # is stubbed directly on it rather than on ``return_value``.
    mock_gcs_hook.list.return_value = existing_objects
    files_reduced = operator.exclude_existing_objects(s3_objects=source_objects, gcs_hook=mock_gcs_hook)
    assert set(files_reduced) == set(objects_expected)
179
+
180
@pytest.mark.parametrize(*PARAMETRIZED_OBJECT_PATHS)
def test_s3_to_gcs_object(self, apply_gcs_prefix, s3_prefix, s3_object, gcs_destination, gcs_object):
    """Check the S3 key -> GCS object name mapping for every shared path case.

    The S3 key passed in is ``s3_prefix + s3_object``; the result must equal
    the case's expected ``gcs_object``.
    """
    operator = S3ToGCSOperator(
        task_id=TASK_ID,
        bucket=S3_BUCKET,
        prefix=s3_prefix,
        delimiter=S3_DELIMITER,
        gcp_conn_id=GCS_CONN_ID,
        dest_gcs=gcs_destination,
        gzip=True,
        apply_gcs_prefix=apply_gcs_prefix,
    )
    assert operator.s3_to_gcs_object(s3_object=s3_prefix + s3_object) == gcs_object
193
+
194
@pytest.mark.parametrize(*PARAMETRIZED_OBJECT_PATHS)
def test_gcs_to_s3_object(self, apply_gcs_prefix, s3_prefix, s3_object, gcs_destination, gcs_object):
    """Check the GCS object name -> S3 key mapping for every shared path case.

    This is the reverse direction of ``s3_to_gcs_object``: given the case's
    ``gcs_object`` the operator must return ``s3_prefix + s3_object``.
    """
    operator = S3ToGCSOperator(
        task_id=TASK_ID,
        bucket=S3_BUCKET,
        prefix=s3_prefix,
        delimiter=S3_DELIMITER,
        gcp_conn_id=GCS_CONN_ID,
        dest_gcs=gcs_destination,
        gzip=True,
        apply_gcs_prefix=apply_gcs_prefix,
    )
    assert operator.gcs_to_s3_object(gcs_object=gcs_object) == s3_prefix + s3_object
207
+
208
@pytest.mark.parametrize(*PARAMETRIZED_OBJECT_PATHS)
@mock.patch("airflow.providers.google.cloud.transfers.s3_to_gcs.S3Hook")
@mock.patch("airflow.providers.amazon.aws.operators.s3.S3Hook")
@mock.patch("airflow.providers.google.cloud.transfers.s3_to_gcs.GCSHook")
def test_execute_apply_gcs_prefix(
    self,
    gcs_mock_hook,
    s3_one_mock_hook,
    s3_two_mock_hook,
    apply_gcs_prefix,
    s3_prefix,
    s3_object,
    gcs_destination,
    gcs_object,
):
    """Run ``execute`` for every shared path case and check the upload target.

    Patch decorators are applied bottom-up, so the mock arguments arrive in
    reverse decorator order: ``gcs_mock_hook`` is the GCSHook patch,
    ``s3_one_mock_hook`` the amazon-provider S3Hook patch and
    ``s3_two_mock_hook`` the s3_to_gcs S3Hook patch.
    """
    operator = S3ToGCSOperator(
        task_id=TASK_ID,
        bucket=S3_BUCKET,
        prefix=s3_prefix,
        delimiter=S3_DELIMITER,
        gcp_conn_id=GCS_CONN_ID,
        dest_gcs=gcs_destination,
        google_impersonation_chain=IMPERSONATION_CHAIN,
        apply_gcs_prefix=apply_gcs_prefix,
    )

    # Both patched S3 hooks report the same single key.
    s3_key = s3_prefix + s3_object
    s3_one_mock_hook.return_value.list_keys.return_value = [s3_key]
    s3_two_mock_hook.return_value.list_keys.return_value = [s3_key]

    uploaded_files = operator.execute(context={})
    gcs_mock_hook.return_value.upload.assert_has_calls(
        [
            mock.call(GCS_BUCKET, gcs_object, mock.ANY, gzip=False),
        ],
        any_order=True,
    )

    s3_one_mock_hook.assert_called_once_with(aws_conn_id=AWS_CONN_ID, verify=None)
    s3_two_mock_hook.assert_called_once_with(aws_conn_id=AWS_CONN_ID, verify=None)
    gcs_mock_hook.assert_called_once_with(
        gcp_conn_id=GCS_CONN_ID,
        impersonation_chain=IMPERSONATION_CHAIN,
    )

    # execute() is expected to return the S3 keys it uploaded.
    assert sorted([s3_key]) == sorted(uploaded_files)
0 commit comments