|
| 1 | +# |
| 2 | +# Licensed to the Apache Software Foundation (ASF) under one |
| 3 | +# or more contributor license agreements. See the NOTICE file |
| 4 | +# distributed with this work for additional information |
| 5 | +# regarding copyright ownership. The ASF licenses this file |
| 6 | +# to you under the Apache License, Version 2.0 (the |
| 7 | +# "License"); you may not use this file except in compliance |
| 8 | +# with the License. You may obtain a copy of the License at |
| 9 | +# |
| 10 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | +# |
| 12 | +# Unless required by applicable law or agreed to in writing, |
| 13 | +# software distributed under the License is distributed on an |
| 14 | +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 15 | +# KIND, either express or implied. See the License for the |
| 16 | +# specific language governing permissions and limitations |
| 17 | +# under the License. |
| 18 | +import os |
| 19 | +from urllib.parse import urlparse |
| 20 | + |
| 21 | +from cached_property import cached_property |
| 22 | + |
| 23 | +from airflow.configuration import conf |
| 24 | +from airflow.exceptions import AirflowException |
| 25 | +from airflow.utils.log.file_task_handler import FileTaskHandler |
| 26 | +from airflow.utils.log.logging_mixin import LoggingMixin |
| 27 | + |
| 28 | + |
class GCSTaskHandler(FileTaskHandler, LoggingMixin):
    """
    GCSTaskHandler is a python log handler that handles and reads
    task instance logs. It extends airflow FileTaskHandler and
    uploads to and reads from GCS remote storage. Upon log reading
    failure, it reads from host machine's local disk.

    :param base_log_folder: forwarded unchanged to ``FileTaskHandler``
    :param gcs_log_folder: remote base location; rendered log paths are
        joined onto it and the result is parsed as ``gs://<bucket>/<blob>``
    :param filename_template: forwarded unchanged to ``FileTaskHandler``
    """

    def __init__(self, base_log_folder, gcs_log_folder, filename_template):
        super().__init__(base_log_folder, filename_template)
        self.remote_base = gcs_log_folder
        # Rendered per task instance in set_context().
        self.log_relative_path = ''
        # Not read anywhere in this class; kept so the attribute set of
        # existing instances does not change.
        self._hook = None
        self.closed = False
        self.upload_on_close = True

    @cached_property
    def hook(self):
        """
        Returns GCS hook.

        The hook is created on first access using the connection id
        configured under ``[logging] REMOTE_LOG_CONN_ID``. If it cannot
        be created, the error is logged and ``None`` is returned.
        """
        remote_conn_id = conf.get('logging', 'REMOTE_LOG_CONN_ID')
        try:
            # Imported lazily so that a missing provider package is reported
            # through the log message below rather than at module import.
            from airflow.providers.google.cloud.hooks.gcs import GCSHook
            return GCSHook(
                google_cloud_storage_conn_id=remote_conn_id
            )
        except Exception as e:  # pylint: disable=broad-except
            self.log.error(
                'Could not create a GoogleCloudStorageHook with connection id '
                '"%s". %s\n\nPlease make sure that airflow[gcp] is installed '
                'and the GCS connection exists.', remote_conn_id, str(e)
            )
            return None

    def set_context(self, ti):
        """
        Bind the handler to a task instance.

        :param ti: task instance object
        """
        super().set_context(ti)
        # Log relative path is used to construct local and remote
        # log path to upload log files into GCS and read from the
        # remote location.
        self.log_relative_path = self._render_filename(ti, ti.try_number)
        # Upload on close only when the task instance is not ``raw``.
        self.upload_on_close = not ti.raw

    def close(self):
        """
        Close and upload local log file to remote storage GCS.
        """
        # When application exit, system shuts down all handlers by
        # calling close method. Here we check if logger is already
        # closed to prevent uploading the log to remote storage multiple
        # times when `logging.shutdown` is called.
        if self.closed:
            return

        super().close()

        if not self.upload_on_close:
            return

        local_loc = os.path.join(self.local_base, self.log_relative_path)
        remote_loc = os.path.join(self.remote_base, self.log_relative_path)
        if os.path.exists(local_loc):
            # Read the whole local log file; gcs_write() appends it to
            # whatever is already stored at the remote location.
            with open(local_loc, 'r') as logfile:
                log = logfile.read()
            self.gcs_write(log, remote_loc)

        # Mark closed so we don't double write if close is called twice
        self.closed = True

    def _read(self, ti, try_number, metadata=None):
        """
        Read logs of given task instance and try_number from GCS.
        If failed, read the log from task instance host machine.

        :param ti: task instance object
        :param try_number: task instance try_number to read logs from
        :param metadata: log metadata,
                         can be used for streaming log reading and
                         auto-tailing. It is not read by this method and
                         is not forwarded to the local fallback.
        :return: a tuple of the log text and a metadata dict
        """
        # Explicitly getting log relative path is necessary as the given
        # task instance might be different than task instance passed in
        # in set_context method.
        log_relative_path = self._render_filename(ti, try_number)
        remote_loc = os.path.join(self.remote_base, log_relative_path)

        try:
            remote_log = self.gcs_read(remote_loc)
            log = '*** Reading remote log from {}.\n{}\n'.format(
                remote_loc, remote_log)
            return log, {'end_of_log': True}
        except Exception as e:  # pylint: disable=broad-except
            # Any failure falls back to the parent handler's reader; the
            # failure notice is kept at the top of the returned text.
            log = '*** Unable to read remote log from {}\n*** {}\n\n'.format(
                remote_loc, str(e))
            self.log.error(log)
            local_log, metadata = super()._read(ti, try_number)
            log += local_log
            return log, metadata

    def gcs_read(self, remote_log_location):
        """
        Returns the log found at the remote_log_location.

        Errors are not handled here; they propagate to the caller.

        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: str (path)
        :return: the downloaded content decoded as UTF-8
        """
        bkt, blob = self.parse_gcs_url(remote_log_location)
        return self.hook.download(bkt, blob).decode('utf-8')

    def gcs_write(self, log, remote_log_location, append=True):
        """
        Writes the log to the remote_log_location. Upload failures are
        logged and not raised.

        :param log: the log to write to the remote_log_location
        :type log: str
        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: str (path)
        :param append: if False, any existing log file is overwritten. If True,
            the new log is appended to any existing logs.
        :type append: bool
        """
        if append:
            try:
                old_log = self.gcs_read(remote_log_location)
                log = '\n'.join([old_log, log]) if old_log else log
            except Exception as e:  # pylint: disable=broad-except
                # An error carrying a ``resp`` with status '404' is treated
                # as "no previous log"; anything else is noted in the
                # uploaded text so the loss of earlier content is visible.
                if not hasattr(e, 'resp') or e.resp.get('status') != '404':  # pylint: disable=no-member
                    log = '*** Previous log discarded: {}\n\n'.format(str(e)) + log

        try:
            bkt, blob = self.parse_gcs_url(remote_log_location)
            from tempfile import NamedTemporaryFile
            # Write UTF-8 explicitly: gcs_read() always decodes UTF-8, and the
            # platform default encoding may be unable to encode the log text.
            with NamedTemporaryFile(mode='w+', encoding='utf-8') as tmpfile:
                tmpfile.write(log)
                # Force the file to be flushed, since we're doing the
                # upload from within the file context (it hasn't been
                # closed).
                tmpfile.flush()
                self.hook.upload(bkt, blob, tmpfile.name)
        except Exception as e:  # pylint: disable=broad-except
            self.log.error('Could not write logs to %s: %s', remote_log_location, e)

    @staticmethod
    def parse_gcs_url(gsurl):
        """
        Given a Google Cloud Storage URL (gs://<bucket>/<blob>), returns a
        tuple containing the corresponding bucket and blob.

        :param gsurl: the URL to split
        :return: ``(bucket, blob)``; leading and trailing slashes are
            stripped from the blob
        :raises AirflowException: if the URL has no bucket (network
            location) component
        """
        parsed_url = urlparse(gsurl)
        if not parsed_url.netloc:
            raise AirflowException('Please provide a bucket name')

        bucket = parsed_url.netloc
        blob = parsed_url.path.strip('/')
        return bucket, blob
0 commit comments