Skip to content

Commit dc03000

Browse files
author
Wojciech Januszek
authored
Datafusion assets (#21518)
1 parent 56365b1 commit dc03000

File tree

4 files changed

+242
-56
lines changed

4 files changed

+242
-56
lines changed

airflow/providers/google/cloud/example_dags/example_datafusion.py

Lines changed: 30 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -55,30 +55,31 @@
5555

5656
PIPELINE_NAME = os.environ.get("GCP_DATAFUSION_PIPELINE_NAME", "airflow_test")
5757
PIPELINE = {
58-
"name": "test-pipe",
58+
"artifact": {
59+
"name": "cdap-data-pipeline",
60+
"version": "6.5.1",
61+
"scope": "SYSTEM",
62+
"label": "Data Pipeline - System Test",
63+
},
5964
"description": "Data Pipeline Application",
60-
"artifact": {"name": "cdap-data-pipeline", "version": "6.4.1", "scope": "SYSTEM"},
65+
"name": "test-pipe",
6166
"config": {
6267
"resources": {"memoryMB": 2048, "virtualCores": 1},
6368
"driverResources": {"memoryMB": 2048, "virtualCores": 1},
6469
"connections": [{"from": "GCS", "to": "GCS2"}],
6570
"comments": [],
6671
"postActions": [],
6772
"properties": {},
68-
"processTimingEnabled": True,
69-
"stageLoggingEnabled": False,
73+
"processTimingEnabled": "true",
74+
"stageLoggingEnabled": "false",
7075
"stages": [
7176
{
7277
"name": "GCS",
7378
"plugin": {
7479
"name": "GCSFile",
7580
"type": "batchsource",
7681
"label": "GCS",
77-
"artifact": {
78-
"name": "google-cloud",
79-
"version": "0.17.3",
80-
"scope": "SYSTEM",
81-
},
82+
"artifact": {"name": "google-cloud", "version": "0.18.1", "scope": "SYSTEM"},
8283
"properties": {
8384
"project": "auto-detect",
8485
"format": "text",
@@ -87,62 +88,57 @@
8788
"filenameOnly": "false",
8889
"recursive": "false",
8990
"encrypted": "false",
90-
"schema": '{"type":"record","name":"etlSchemaBody","fields":'
91-
'[{"name":"offset","type":"long"},{"name":"body","type":"string"}]}',
91+
"schema": "{\"type\":\"record\",\"name\":\"textfile\",\"fields\":[{\"name\"\
92+
:\"offset\",\"type\":\"long\"},{\"name\":\"body\",\"type\":\"string\"}]}",
9293
"path": BUCKET_1_URI,
9394
"referenceName": "foo_bucket",
95+
"useConnection": "false",
96+
"serviceAccountType": "filePath",
97+
"sampleSize": "1000",
98+
"fileEncoding": "UTF-8",
9499
},
95100
},
96-
"outputSchema": [
97-
{
98-
"name": "etlSchemaBody",
99-
"schema": '{"type":"record","name":"etlSchemaBody","fields":'
100-
'[{"name":"offset","type":"long"},{"name":"body","type":"string"}]}',
101-
}
102-
],
101+
"outputSchema": "{\"type\":\"record\",\"name\":\"textfile\",\"fields\"\
102+
:[{\"name\":\"offset\",\"type\":\"long\"},{\"name\":\"body\",\"type\":\"string\"}]}",
103+
"id": "GCS",
103104
},
104105
{
105106
"name": "GCS2",
106107
"plugin": {
107108
"name": "GCS",
108109
"type": "batchsink",
109110
"label": "GCS2",
110-
"artifact": {
111-
"name": "google-cloud",
112-
"version": "0.17.3",
113-
"scope": "SYSTEM",
114-
},
111+
"artifact": {"name": "google-cloud", "version": "0.18.1", "scope": "SYSTEM"},
115112
"properties": {
116113
"project": "auto-detect",
117114
"suffix": "yyyy-MM-dd-HH-mm",
118115
"format": "json",
119116
"serviceFilePath": "auto-detect",
120117
"location": "us",
121-
"schema": '{"type":"record","name":"etlSchemaBody","fields":'
122-
'[{"name":"offset","type":"long"},{"name":"body","type":"string"}]}',
118+
"schema": "{\"type\":\"record\",\"name\":\"textfile\",\"fields\":[{\"name\"\
119+
:\"offset\",\"type\":\"long\"},{\"name\":\"body\",\"type\":\"string\"}]}",
123120
"referenceName": "bar",
124121
"path": BUCKET_2_URI,
122+
"serviceAccountType": "filePath",
123+
"contentType": "application/octet-stream",
125124
},
126125
},
127-
"outputSchema": [
128-
{
129-
"name": "etlSchemaBody",
130-
"schema": '{"type":"record","name":"etlSchemaBody","fields":'
131-
'[{"name":"offset","type":"long"},{"name":"body","type":"string"}]}',
132-
}
133-
],
126+
"outputSchema": "{\"type\":\"record\",\"name\":\"textfile\",\"fields\"\
127+
:[{\"name\":\"offset\",\"type\":\"long\"},{\"name\":\"body\",\"type\":\"string\"}]}",
134128
"inputSchema": [
135129
{
136130
"name": "GCS",
137-
"schema": '{"type":"record","name":"etlSchemaBody","fields":'
138-
'[{"name":"offset","type":"long"},{"name":"body","type":"string"}]}',
131+
"schema": "{\"type\":\"record\",\"name\":\"textfile\",\"fields\":[{\"name\"\
132+
:\"offset\",\"type\":\"long\"},{\"name\":\"body\",\"type\":\"string\"}]}",
139133
}
140134
],
135+
"id": "GCS2",
141136
},
142137
],
143138
"schedule": "0 * * * *",
144139
"engine": "spark",
145140
"numOfRecordsPreview": 100,
141+
"description": "Data Pipeline Application",
146142
"maxConcurrentRuns": 1,
147143
},
148144
}

0 commit comments

Comments
 (0)