gcpdiag.queries.bigquery

Queries related to BigQuery.
BIGQUERY_REGIONS = ['me-central1', 'me-central2', 'me-west1', 'africa-south1', 'us', 'eu', 'us-east1', 'us-east4', 'us-east5', 'us-west1', 'us-west2', 'us-west3', 'us-west4', 'us-central1', 'us-south1', 'northamerica-northeast1', 'northamerica-northeast2', 'southamerica-east1', 'southamerica-west1', 'asia-east1', 'asia-east2', 'asia-south1', 'asia-south2', 'asia-northeast1', 'asia-northeast2', 'asia-northeast3', 'asia-southeast1', 'asia-southeast2', 'australia-southeast1', 'australia-southeast2', 'europe-north1', 'europe-southwest1', 'europe-central2', 'europe-west1', 'europe-west10', 'europe-west2', 'europe-west3', 'europe-west4', 'europe-west6', 'europe-west8', 'europe-west9', 'europe-west12']
C_NOT_AVAILABLE = 'N/A'
def get_project_policy(project_id: str):
def get_project_policy(project_id: str):
  """Fetches the IAM policy object for a project."""
  root_logger = logging.getLogger()
  original_level = root_logger.level

  try:
    root_logger.setLevel(logging.ERROR)
    policy = iam.get_project_policy(project_id, raise_error_if_fails=False)
    return policy
  except utils.GcpApiError:
    return None
  finally:
    root_logger.setLevel(original_level)

Fetches the IAM policy object for a project.

def get_organization_policy(organization_id: str):
def get_organization_policy(organization_id: str):
  """Fetches the IAM policy object for an organization."""
  root_logger = logging.getLogger()
  original_level = root_logger.level

  try:
    root_logger.setLevel(logging.ERROR)
    policy = iam.get_organization_policy(organization_id,
                                         raise_error_if_fails=False)
    return policy
  except utils.GcpApiError as err:
    if 'doesn\'t have access to' in err.message.lower(
    ) or 'denied on resource' in err.message.lower():
      op.info(
          'User does not have access to the organization policy. Investigation'
          ' completeness and accuracy might depend on the presence of'
          ' organization level permissions.')
    return None
  finally:
    root_logger.setLevel(original_level)

Fetches the IAM policy object for an organization.

def check_permissions_for_principal( policy: Union[gcpdiag.queries.iam.ProjectPolicy, gcpdiag.queries.iam.OrganizationPolicy], principal: str, permissions_to_check: Set[str]) -> Dict[str, bool]:
def check_permissions_for_principal(
    policy: PolicyObject, principal: str,
    permissions_to_check: Set[str]) -> Dict[str, bool]:
  """Uses a policy object to check a set of permissions for a principal.

  Returns a dictionary mapping each permission to a boolean indicating its
  presence.
  """
  return {
      permission: policy.has_permission(principal, permission)
      for permission in permissions_to_check
  }

Uses a policy object to check a set of permissions for a principal.

Returns a dictionary mapping each permission to a boolean indicating its presence.

def get_missing_permissions( required_permissions: Set[str], actual_permissions: Dict[str, bool]) -> Set[str]:
def get_missing_permissions(required_permissions: Set[str],
                            actual_permissions: Dict[str, bool]) -> Set[str]:
  """Compares a set of required permissions against a dictionary of actual

  permissions and returns the set of missing ones.
  """
  return {
      perm for perm in required_permissions if not actual_permissions.get(perm)
  }

Compares a set of required permissions against a dictionary of actual permissions and returns the set of missing ones.
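
For illustration, a minimal sketch of how these three helpers compose; the project ID and principal below are hypothetical:

# Check two BigQuery job permissions for a principal and report what is missing.
required = {'bigquery.jobs.create', 'bigquery.jobs.listAll'}
policy = get_project_policy('example-project')
if policy:
  actual = check_permissions_for_principal(policy, 'user:analyst@example.com',
                                           required)
  missing = get_missing_permissions(required, actual)
  if missing:
    print('missing permissions:', sorted(missing))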

class BigQueryTable:
class BigQueryTable:
  """Represents a BigQuery Table object."""

  project_id: str
  dataset_id: str
  table_id: str

  def __init__(self, project_id: str, dataset_id: str, table_id: str):
    self.project_id = project_id
    self.dataset_id = dataset_id
    self.table_id = table_id

  @property
  def table_identifier(self) -> str:
    return f'{self.project_id}:{self.dataset_id}.{self.table_id}'

Represents a BigQuery Table object.

BigQueryTable(project_id: str, dataset_id: str, table_id: str)
project_id: str
dataset_id: str
table_id: str
table_identifier: str
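
A short usage sketch with hypothetical IDs; table_identifier renders the '<project>:<dataset>.<table>' form used elsewhere in this module:

table = BigQueryTable('example-project', 'sales', 'orders')
print(table.table_identifier)  # example-project:sales.orders
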
class BigQueryRoutine:
class BigQueryRoutine:
  """Represents a BigQuery Routine object."""

  project_id: str
  dataset_id: str
  routine_id: str

  def __init__(self, project_id: str, dataset_id: str, routine_id: str):
    self.project_id = project_id
    self.dataset_id = dataset_id
    self.routine_id = routine_id

  @property
  def routine_identifier(self) -> str:
    return f'{self.project_id}:{self.dataset_id}.{self.routine_id}'

Represents a BigQuery Routine object.

BigQueryRoutine(project_id: str, dataset_id: str, routine_id: str)
project_id: str
dataset_id: str
routine_id: str
routine_identifier: str
class BigQueryJob(gcpdiag.models.Resource):
class BigQueryJob(models.Resource):
  """Represents a BigQuery Job object."""

  _job_api_resource_data: dict[str, Any]
  _information_schema_job_metadata: dict[str, Any]
  project_id: str

  def __init__(
      self,
      project_id: str,
      job_api_resource_data: dict[str, Any],
      information_schema_job_metadata: dict[str, str],
  ):
    super().__init__(project_id)
    self._job_api_resource_data = job_api_resource_data
    self._information_schema_job_metadata = (information_schema_job_metadata or
                                             {})

  @property
  def full_path(self) -> str:
    # returns 'https://content-bigquery.googleapis.com/bigquery/v2/
    # projects/<PROJECT_ID>/jobs/<JOBID>?location=<REGION>'
    return self._job_api_resource_data.get('selfLink', '')

  @property
  def id(self) -> str:
    # returns <PROJECT>:<REGION>.<JobID>
    return self._job_api_resource_data.get('id', '')

  @property
  def short_path(self) -> str:
    # returns <PROJECT>:<REGION>.<JobID>
    return self.id

  @property
  def user_email(self) -> str:
    return self._job_api_resource_data.get('user_email', '')

  @property
  def _job_configuration(self) -> dict[str, Any]:
    return self._job_api_resource_data.get('configuration', {})

  @property
  def _query(self) -> dict[str, Any]:
    return self._job_configuration.get('query', {})

  @property
  def _stats(self) -> dict[str, Any]:
    """Safely access the 'statistics' dictionary."""
    return self._job_api_resource_data.get('statistics', {})

  @property
  def _query_stats(self) -> dict[str, Any]:
    """Safely access the 'statistics.query' dictionary."""
    return self._stats.get('query', {})

  @property
  def _query_info(self) -> dict[str, Any]:
    return self._query_stats.get('queryInfo', {})

  @property
  def _status(self) -> dict[str, Any]:
    return self._job_api_resource_data.get('status', {})

  @property
  def job_type(self) -> str:
    return self._job_configuration.get('jobType', '')

  @property
  def query_sql(self) -> str:
    return self._query.get('query', '')

  @property
  def use_legacy_sql(self) -> bool:
    return self._query.get('useLegacySql', False)

  @property
  def priority(self) -> str:
    return self._query.get('priority', '')

  @property
  def edition(self) -> str:
    edition_value = self._query.get('edition')
    return str(edition_value) if edition_value else ''

  @property
  def creation_time(self) -> Optional[int]:
    time_str = self._stats.get('creationTime')
    return (int(time_str)
            if isinstance(time_str, str) and time_str.isdigit() else None)

  @property
  def start_time(self) -> Optional[int]:
    time_str = self._stats.get('startTime')
    return (int(time_str)
            if isinstance(time_str, str) and time_str.isdigit() else None)

  @property
  def end_time(self) -> Optional[int]:
    time_str = self._stats.get('endTime')
    return (int(time_str)
            if isinstance(time_str, str) and time_str.isdigit() else None)

  @property
  def total_bytes_processed(self) -> int:
    bytes_str = self._stats.get('totalBytesProcessed', '0')
    return (int(bytes_str)
            if isinstance(bytes_str, str) and bytes_str.isdigit() else 0)

  @property
  def total_bytes_billed(self) -> int:
    bytes_str = self._query_stats.get('totalBytesBilled', '0')
    return (int(bytes_str)
            if isinstance(bytes_str, str) and bytes_str.isdigit() else 0)

  @property
  def total_slot_ms(self) -> int:
    ms_str = self._stats.get('totalSlotMs', '0')
    return int(ms_str) if isinstance(ms_str, str) and ms_str.isdigit() else 0

  @property
  def cache_hit(self) -> bool:
    return self._query_stats.get('cacheHit') is True

  @property
  def quota_deferments(self) -> list[str]:
    deferments_dict = self._stats.get('quotaDeferments', {})
    if isinstance(deferments_dict, dict):
      deferment_list = deferments_dict.get('', [])
      if isinstance(deferment_list, list) and all(
          isinstance(s, str) for s in deferment_list):
        return deferment_list
    return []

  @property
  def query_plan(self) -> list[dict[str, Any]]:
    plan = self._query_stats.get('queryPlan', [])
    return plan if isinstance(plan, list) else []

  @property
  def total_partitions_processed(self) -> int:
    partitions_str = self._query_stats.get('totalPartitionsProcessed', '0')
    return (int(partitions_str) if isinstance(partitions_str, str) and
            partitions_str.isdigit() else 0)

  @property
  def referenced_tables(self) -> list[BigQueryTable]:
    tables_list = self._query_stats.get('referencedTables', [])
    referenced_tables = []
    if isinstance(tables_list, list):
      for item in tables_list:
        if isinstance(item, dict):
          project_id = item.get('projectId')
          dataset_id = item.get('datasetId')
          table_id = item.get('tableId')
          if (isinstance(project_id, str) and project_id and
              isinstance(dataset_id, str) and dataset_id and
              isinstance(table_id, str) and table_id):
            referenced_tables.append(
                BigQueryTable(project_id, dataset_id, table_id))
    return referenced_tables

  @property
  def referenced_routines(self) -> list[BigQueryRoutine]:
    routines_list = self._query_stats.get('referencedRoutines', [])
    referenced_routines = []
    if isinstance(routines_list, list):
      for item in routines_list:
        if isinstance(item, dict):
          project_id = item.get('projectId')
          dataset_id = item.get('datasetId')
          routine_id = item.get('routineId')
          if (isinstance(project_id, str) and project_id and
              isinstance(dataset_id, str) and dataset_id and
              isinstance(routine_id, str) and routine_id):
            referenced_routines.append(
                BigQueryRoutine(project_id, dataset_id, routine_id))
    return referenced_routines

  @property
  def num_affected_dml_rows(self) -> int:
    rows_str = self._query_stats.get('numDmlAffectedRows', '0')
    return (int(rows_str)
            if isinstance(rows_str, str) and rows_str.isdigit() else 0)

  @property
  def dml_stats(self) -> dict[str, int]:
    stats = self._query_stats.get('dmlStats')
    if not isinstance(stats, dict):
      return {}
    inserted_str = stats.get('insertedRowCount', '0')
    deleted_str = stats.get('deletedRowCount', '0')
    updated_str = stats.get('updatedRowCount', '0')
    return {
        'insertedRowCount':
            (int(inserted_str) if isinstance(inserted_str, str) and
             inserted_str.isdigit() else 0),
        'deletedRowCount': (int(deleted_str) if isinstance(deleted_str, str) and
                            deleted_str.isdigit() else 0),
        'updatedRowCount': (int(updated_str) if isinstance(updated_str, str) and
                            updated_str.isdigit() else 0),
    }

  @property
  def statement_type(self) -> str:
    stype = self._query_stats.get('statementType', '')
    return stype if isinstance(stype, str) else ''

  @property
  def bi_engine_statistics(self) -> dict[str, Any]:
    stats = self._query_stats.get('biEngineStatistics')
    if not isinstance(stats, dict):
      return {}
    reasons_list = stats.get('accelerationMode', {}).get('biEngineReasons', [])
    bi_engine_reasons = []
    if isinstance(reasons_list, list):
      for item in reasons_list:
        if isinstance(item, dict):
          bi_engine_reasons.append({
              'code': str(item.get('code', '')),
              'message': item.get('message', ''),
          })
    return {
        'biEngineMode': str(stats.get('biEngineMode', '')),
        'accelerationMode': str(stats.get('accelerationMode', '')),
        'biEngineReasons': bi_engine_reasons,
    }

  @property
  def vector_search_statistics(self) -> dict[str, Any]:
    stats = self._query_stats.get('vectorSearchStatistics')
    if not isinstance(stats, dict):
      return {}
    reasons_list = stats.get('indexUnusedReasons', [])
    index_unused_reasons = []
    if isinstance(reasons_list, list):
      for item in reasons_list:
        if isinstance(item, dict):
          base_table_data = item.get('baseTable')
          base_table_obj = None
          if isinstance(base_table_data, dict):
            project_id = base_table_data.get('projectId')
            dataset_id = base_table_data.get('datasetId')
            table_id = base_table_data.get('tableId')
            if (isinstance(project_id, str) and project_id and
                isinstance(dataset_id, str) and dataset_id and
                isinstance(table_id, str) and table_id):
              base_table_obj = BigQueryTable(project_id, dataset_id, table_id)
          index_unused_reasons.append({
              'code': str(item.get('code', '')),
              'message': item.get('message', ''),
              'indexName': item.get('indexName', ''),
              'baseTable': base_table_obj,
          })
    return {
        'indexUsageMode': str(stats.get('indexUsageMode', '')),
        'indexUnusedReasons': index_unused_reasons,
    }

  @property
  def performance_insights(self) -> dict[str, Any]:
    insights = self._query_stats.get('performanceInsights')
    if not isinstance(insights, dict):
      return {}
    standalone_list = insights.get('stagePerformanceStandaloneInsights', [])
    stage_performance_standalone_insights = []
    if isinstance(standalone_list, list):
      for item in standalone_list:
        if isinstance(item, dict):
          stage_performance_standalone_insights.append({
              'stageId': item.get('stageId', ''),
          })
    change_list = insights.get('stagePerformanceChangeInsights', [])
    stage_performance_change_insights = []
    if isinstance(change_list, list):
      for item in change_list:
        if isinstance(item, dict):
          stage_performance_change_insights.append({
              'stageId': item.get('stageId', ''),
          })
    avg_ms_str = insights.get('avgPreviousExecutionMs', '0')
    return {
        'avgPreviousExecutionMs':
            (int(avg_ms_str)
             if isinstance(avg_ms_str, str) and avg_ms_str.isdigit() else 0),
        'stagePerformanceStandaloneInsights':
            (stage_performance_standalone_insights),
        'stagePerformanceChangeInsights': stage_performance_change_insights,
    }

  @property
  def optimization_details(self) -> Any:
    return self._query_info.get('optimizationDetails')

  @property
  def export_data_statistics(self) -> dict[str, int]:
    stats = self._query_stats.get('exportDataStatistics')
    if not isinstance(stats, dict):
      return {}
    file_count_str = stats.get('fileCount', '0')
    row_count_str = stats.get('rowCount', '0')
    return {
        'fileCount': (int(file_count_str) if isinstance(file_count_str, str) and
                      file_count_str.isdigit() else 0),
        'rowCount': (int(row_count_str) if isinstance(row_count_str, str) and
                     row_count_str.isdigit() else 0),
    }

  @property
  def load_query_statistics(self) -> dict[str, int]:
    stats = self._query_stats.get('loadQueryStatistics')
    if not isinstance(stats, dict):
      return {}
    input_files_str = stats.get('inputFiles', '0')
    input_bytes_str = stats.get('inputFileBytes', '0')
    output_rows_str = stats.get('outputRows', '0')
    output_bytes_str = stats.get('outputBytes', '0')
    bad_records_str = stats.get('badRecords', '0')
    return {
        'inputFiles':
            (int(input_files_str) if isinstance(input_files_str, str) and
             input_files_str.isdigit() else 0),
        'inputFileBytes':
            (int(input_bytes_str) if isinstance(input_bytes_str, str) and
             input_bytes_str.isdigit() else 0),
        'outputRows':
            (int(output_rows_str) if isinstance(output_rows_str, str) and
             output_rows_str.isdigit() else 0),
        'outputBytes':
            (int(output_bytes_str) if isinstance(output_bytes_str, str) and
             output_bytes_str.isdigit() else 0),
        'badRecords':
            (int(bad_records_str) if isinstance(bad_records_str, str) and
             bad_records_str.isdigit() else 0),
    }

  @property
  def spark_statistics(self) -> dict[str, Any]:
    stats = self._query_stats.get('sparkStatistics')
    if not isinstance(stats, dict):
      return {}
    logging_info_dict = stats.get('loggingInfo', {})
    logging_info = ({
        'resourceType': logging_info_dict.get('resourceType', ''),
        'projectId': logging_info_dict.get('projectId', ''),
    } if isinstance(logging_info_dict, dict) else {})
    return {
        'endpoints': stats.get('endpoints', {}),
        'sparkJobId': stats.get('sparkJobId', ''),
        'sparkJobLocation': stats.get('sparkJobLocation', ''),
        'kmsKeyName': stats.get('kmsKeyName', ''),
        'gcsStagingBucket': stats.get('gcsStagingBucket', ''),
        'loggingInfo': logging_info,
    }

  @property
  def transferred_bytes(self) -> int:
    bytes_str = self._query_stats.get('transferredBytes', '0')
    return (int(bytes_str)
            if isinstance(bytes_str, str) and bytes_str.isdigit() else 0)

  @property
  def reservation_id(self) -> str:
    res_id = self._stats.get('reservation_id', '')
    return res_id if isinstance(res_id, str) else ''

  @property
  def reservation_admin_project_id(self) -> Optional[str]:
    if not self.reservation_id:
      return None
    try:
      parts = self.reservation_id.split('/')
      if parts[0] == 'projects' and len(parts) >= 2:
        return parts[1]
      else:
        logging.warning(
            'Could not parse project ID from reservation_id: %s',
            self.reservation_id,
        )
        return None
    except (IndexError, AttributeError):
      logging.warning(
          'Could not parse project ID from reservation_id: %s',
          self.reservation_id,
      )
      return None

  @property
  def num_child_jobs(self) -> int:
    num_str = self._stats.get('numChildJobs', '0')
    return int(num_str) if isinstance(num_str, str) and num_str.isdigit() else 0

  @property
  def parent_job_id(self) -> str:
    parent_id = self._stats.get('parentJobId', '')
    return parent_id if isinstance(parent_id, str) else ''

  @property
  def row_level_security_applied(self) -> bool:
    rls_stats = self._stats.get('RowLevelSecurityStatistics', {})
    return (rls_stats.get('rowLevelSecurityApplied') is True if isinstance(
        rls_stats, dict) else False)

  @property
  def data_masking_applied(self) -> bool:
    masking_stats = self._stats.get('dataMaskingStatistics', {})
    return (masking_stats.get('dataMaskingApplied') is True if isinstance(
        masking_stats, dict) else False)

  @property
  def session_id(self) -> str:
    session_info = self._stats.get('sessionInfo', {})
    session_id_val = (session_info.get('sessionId', '') if isinstance(
        session_info, dict) else '')
    return session_id_val if isinstance(session_id_val, str) else ''

  @property
  def final_execution_duration_ms(self) -> int:
    duration_str = self._stats.get('finalExecutionDurationMs', '0')
    return (int(duration_str)
            if isinstance(duration_str, str) and duration_str.isdigit() else 0)

  @property
  def job_state(self) -> str:
    state = self._status.get('state', '')
    return state if isinstance(state, str) else ''

  @property
  def job_error_result(self) -> dict[str, Optional[str]]:
    error_result = self._status.get('errorResult')
    if not isinstance(error_result, dict):
      return {}
    return {
        'reason': error_result.get('reason'),
        'location': error_result.get('location'),
        'debugInfo': error_result.get('debugInfo'),
        'message': error_result.get('message'),
    }

  @property
  def job_errors(self) -> list[dict[str, Optional[str]]]:
    errors_list = self._status.get('errors', [])
    errors_iterable = []
    if isinstance(errors_list, list):
      for item in errors_list:
        if isinstance(item, dict):
          errors_iterable.append({
              'reason': item.get('reason'),
              'location': item.get('location'),
              'debugInfo': item.get('debugInfo'),
              'message': item.get('message'),
          })
    return errors_iterable

  @property
  def materialized_view_statistics(self) -> dict[str, Any]:
    stats_list = self._query_stats.get('materializedViewStatistics')
    materialized_view = []
    if isinstance(stats_list, list):
      for item in stats_list:
        if isinstance(item, dict):
          table_ref_data = item.get('tableReference')
          table_ref_obj = None
          if isinstance(table_ref_data, dict):
            project_id = table_ref_data.get('projectId')
            dataset_id = table_ref_data.get('datasetId')
            table_id = table_ref_data.get('tableId')
            if (isinstance(project_id, str) and project_id and
                isinstance(dataset_id, str) and dataset_id and
                isinstance(table_id, str) and table_id):
              table_ref_obj = BigQueryTable(project_id, dataset_id, table_id)
          chosen = item.get('chosen') is True
          saved_str = item.get('estimatedBytesSaved', '0')
          estimated_bytes_saved = (int(saved_str)
                                   if isinstance(saved_str, str) and
                                   saved_str.isdigit() else 0)
          rejected_reason = str(item.get('rejectedReason', ''))
          materialized_view.append({
              'chosen': chosen,
              'estimatedBytesSaved': estimated_bytes_saved,
              'rejectedReason': rejected_reason,
              'tableReference': table_ref_obj,
          })
    return {'materializedView': materialized_view}

  @property
  def metadata_cache_statistics(self) -> dict[str, Any]:
    stats_list = self._query_stats.get('metadataCacheStatistics')
    metadata_cache = []
    if isinstance(stats_list, list):
      for item in stats_list:
        if isinstance(item, dict):
          table_ref_data = item.get('tableReference')
          table_ref_obj = None
          if isinstance(table_ref_data, dict):
            project_id = table_ref_data.get('projectId')
            dataset_id = table_ref_data.get('datasetId')
            table_id = table_ref_data.get('tableId')
            if (isinstance(project_id, str) and project_id and
                isinstance(dataset_id, str) and dataset_id and
                isinstance(table_id, str) and table_id):
              table_ref_obj = BigQueryTable(project_id, dataset_id, table_id)
          metadata_cache.append({
              'explanation': item.get('explanation', ''),
              'unusedReason': str(item.get('unusedReason', '')),
              'tableReference': table_ref_obj,
          })
    return {'tableMetadataCacheUsage': metadata_cache}

  # Properties derived from _information_schema_job_metadata
  @property
  def information_schema_user_email(self) -> str | None:
    if not self._information_schema_job_metadata:
      return C_NOT_AVAILABLE
    return self._information_schema_job_metadata.get('user_email')

  @property
  def information_schema_start_time_str(self) -> str | None:
    if not self._information_schema_job_metadata:
      return C_NOT_AVAILABLE
    return self._information_schema_job_metadata.get('start_time_str')

  @property
  def information_schema_end_time_str(self) -> str | None:
    if not self._information_schema_job_metadata:
      return C_NOT_AVAILABLE
    return self._information_schema_job_metadata.get('end_time_str')

  @property
  def information_schema_query(self) -> str | None:
    if not self._information_schema_job_metadata:
      return C_NOT_AVAILABLE
    return self._information_schema_job_metadata.get('query')

  @property
  def information_schema_total_modified_partitions(self) -> Union[int, str]:
    """The total number of partitions the job modified.

    This field is populated for LOAD and QUERY jobs.
    """
    if not self._information_schema_job_metadata:
      return C_NOT_AVAILABLE
    try:
      total_modified_partitions = self._information_schema_job_metadata[
          'total_modified_partitions']
      return total_modified_partitions
    except KeyError:
      return C_NOT_AVAILABLE

  @property
  def information_schema_resource_warning(self) -> str:
    """The warning message that appears if the resource usage during query

    processing is above the internal threshold of the system.
    """
    if not self._information_schema_job_metadata:
      return C_NOT_AVAILABLE
    try:
      resource_warning = self._information_schema_job_metadata['query_info'][
          'resource_warning']
      return resource_warning
    except KeyError:
      return C_NOT_AVAILABLE

  @property
  def information_schema_normalized_literals(self) -> str:
    """Contains the hashes of the query."""
    try:
      query_hashes = self._information_schema_job_metadata['query_info'][
          'query_hashes']['normalized_literals']
      return query_hashes
    except KeyError:
      return C_NOT_AVAILABLE

Represents a BigQuery Job object.

BigQueryJob( project_id: str, job_api_resource_data: dict[str, typing.Any], information_schema_job_metadata: dict[str, str])
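
A minimal construction sketch; the dictionary below is a hypothetical fragment of a jobs.get API response, and any fields it omits simply fall back to the property defaults:

job = BigQueryJob(
    project_id='example-project',
    job_api_resource_data={
        'id': 'example-project:US.bquxjob_1234',
        'configuration': {'jobType': 'QUERY'},
        'statistics': {'totalSlotMs': '4200'},
        'status': {'state': 'DONE'},
    },
    information_schema_job_metadata={},
)
print(job.job_type, job.total_slot_ms, job.job_state)  # QUERY 4200 DONE
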
project_id: str
@property
def project_id(self) -> str:
  """Project id (not project number)."""
  return self._project_id

Project id (not project number).

full_path: str

Returns the full path of this resource.

Example: 'projects/gcpdiag-gke-1-9b90/zones/europe-west4-a/clusters/gke1'

id: str
short_path: str

Returns the short name for this resource.

Note that it isn't clear from this name what kind of resource it is.

Example: 'gke1'

user_email: str
job_type: str
query_sql: str
use_legacy_sql: bool
priority: str
edition: str
creation_time: Optional[int]
start_time: Optional[int]
end_time: Optional[int]
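
These three timestamps are epoch values in milliseconds (the API returns them as strings, which the properties parse to int). A conversion sketch, assuming job is a fetched BigQueryJob:

from datetime import datetime, timezone

if job.start_time and job.end_time:
  started = datetime.fromtimestamp(job.start_time / 1000, tz=timezone.utc)
  ended = datetime.fromtimestamp(job.end_time / 1000, tz=timezone.utc)
  print(f'job ran for {(ended - started).total_seconds():.1f}s')
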
total_bytes_processed: int
total_bytes_billed: int
total_slot_ms: int
cache_hit: bool
quota_deferments: list[str]
query_plan: list[dict[str, typing.Any]]
total_partitions_processed: int
referenced_tables: list[BigQueryTable]
referenced_routines: list[BigQueryRoutine]
num_affected_dml_rows: int
dml_stats: dict[str, int]
statement_type: str
bi_engine_statistics: dict[str, typing.Any]
vector_search_statistics: dict[str, typing.Any]
performance_insights: dict[str, typing.Any]
optimization_details: Any
export_data_statistics: dict[str, int]
load_query_statistics: dict[str, int]
spark_statistics: dict[str, typing.Any]
transferred_bytes: int
reservation_id: str
reservation_admin_project_id: Optional[str]
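
As the reservation_admin_project_id parser in the class source implies, reservation_id is expected to start with 'projects/<PROJECT_ID>/...', so the admin project is the second path segment. A small sketch, assuming job is a fetched BigQueryJob:

if job.reservation_id:
  # e.g. reservation_id = 'projects/admin-project/locations/us/reservations/prod'
  print('reservation admin project:', job.reservation_admin_project_id)
else:
  print('no reservation recorded; the job likely ran on on-demand slots')
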
num_child_jobs: int
parent_job_id: str
row_level_security_applied: bool
data_masking_applied: bool
session_id: str
final_execution_duration_ms: int
job_state: str
job_error_result: dict[str, typing.Optional[str]]
job_errors: list[dict[str, typing.Optional[str]]]
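
A sketch of inspecting the final status with these properties; in the BigQuery API, a job in the DONE state failed when errorResult is set:

if job.job_state == 'DONE':
  error = job.job_error_result
  if error:
    print(f"job failed: {error.get('reason')}: {error.get('message')}")
    for item in job.job_errors:  # all errors encountered while running the job
      print(' -', item.get('message'))
  else:
    print('job completed successfully')
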
materialized_view_statistics: dict[str, typing.Any]
metadata_cache_statistics: dict[str, typing.Any]
information_schema_user_email: str | None
information_schema_start_time_str: str | None
information_schema_end_time_str: str | None
information_schema_query: str | None
information_schema_total_modified_partitions: Union[int, str]

The total number of partitions the job modified.

This field is populated for LOAD and QUERY jobs.

information_schema_resource_warning: str

The warning message that appears if the resource usage during query processing is above the internal threshold of the system.

information_schema_normalized_literals: str

Contains the hashes of the query.
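
Because the information_schema_* properties fall back to the C_NOT_AVAILABLE sentinel ('N/A') instead of raising when the INFORMATION_SCHEMA metadata is missing, callers should compare against the sentinel, for example:

warning = job.information_schema_resource_warning
if warning != C_NOT_AVAILABLE:
  print('resource warning:', warning)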

@caching.cached_api_call
def get_bigquery_job_api_resource_data(project_id: str, region: str, job_id: str) -> Optional[dict[str, Any]]:
@caching.cached_api_call
def get_bigquery_job_api_resource_data(
    project_id: str,
    region: str,
    job_id: str,
) -> Union[dict[str, Any], None]:
  """Fetch a specific BigQuery job's raw API resource data."""
  api = apis.get_api('bigquery', 'v2', project_id)
  query_job = api.jobs().get(projectId=project_id,
                             location=region,
                             jobId=job_id)

  try:
    resp = query_job.execute(num_retries=config.API_RETRIES)
    return resp
  except errors.HttpError as err:
    raise utils.GcpApiError(err) from err

Fetch a specific BigQuery job's raw API resource data.
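
A usage sketch with placeholder identifiers; note that, unlike the higher-level helpers below, this function raises utils.GcpApiError on HTTP failures rather than returning None:

try:
  data = get_bigquery_job_api_resource_data('example-project', 'us-central1',
                                            'bquxjob_1234')
  if data:
    print(data.get('status', {}).get('state'))
except utils.GcpApiError as err:
  print('could not fetch job:', err)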

@caching.cached_api_call
def get_information_schema_job_metadata( project_id: str, region: str, job_id: str, creation_time_milis: Optional[int] = None, skip_permission_check: bool = False) -> Optional[dict[str, Any]]:
@caching.cached_api_call
def get_information_schema_job_metadata(
    project_id: str,
    region: str,
    job_id: str,
    creation_time_milis: Optional[int] = None,
    skip_permission_check: bool = False,
) -> Optional[dict[str, Any]]:
  """Fetch metadata about a BigQuery job from the INFORMATION_SCHEMA."""
  if not apis.is_enabled(project_id, 'bigquery'):
    return None
  user_email = ''
  try:
    user_email = apis.get_user_email()
  except (RuntimeError, exceptions.DefaultCredentialsError):
    pass
  except AttributeError as err:
    if (('has no attribute' in str(err)) and
        ('with_quota_project' in str(err))):
      op.info('Running the investigation within the GCA context.')
  user = 'user:' + user_email
  if not skip_permission_check:
    try:
      policy = iam.get_project_policy(project_id)
      if (not policy.has_permission(user, 'bigquery.jobs.create')) or (
          not policy.has_permission(user, 'bigquery.jobs.listAll')):
        op.info(
            f'WARNING: Unable to run INFORMATION_SCHEMA view analysis due to missing permissions.\
            \nMake sure to grant {user_email} "bigquery.jobs.create" and "bigquery.jobs.listAll".\
            \nContinuing the investigation with the BigQuery job metadata obtained from the API.'
        )
        return None
    except utils.GcpApiError:
      op.info(
          'Attempting to query INFORMATION_SCHEMA with no knowledge of project'
          ' level permissions        \n(due to missing'
          ' resourcemanager.projects.get permission).')
  else:
    op.info(
        'Attempting to query INFORMATION_SCHEMA without checking project level permissions.'
    )
  try:
    creation_time_milis_filter = ' '
    if creation_time_milis:
      creation_time_milis_filter = (
          f'AND creation_time = TIMESTAMP_MILLIS({creation_time_milis})')
    query = f"""
    SELECT
        user_email, start_time, end_time, query
      FROM
        `{project_id}`.`region-{region}`.INFORMATION_SCHEMA.JOBS
      WHERE
        job_id = '{job_id}'
        {creation_time_milis_filter}
      LIMIT 1
    """
    results = get_query_results(
        project_id=project_id,
        query=query,
        location=region,
        timeout_sec=30,
        poll_interval_sec=2,  # Short poll interval
    )
    if not results or len(results) != 1:
      # We cannot raise an exception otherwise tests that use get_bigquery_job would fail
      # raise ValueError(f"Job {job_id} not found in INFORMATION_SCHEMA")
      return None
    return results[0]
  except errors.HttpError as err:
    logging.warning(
        'Failed to retrieve INFORMATION_SCHEMA job metadata for job %s: %s',
        job_id,
        err,
    )
    return None
  except KeyError as err:
    logging.warning(
        'Failed to parse INFORMATION_SCHEMA response for job %s: %s',
        job_id,
        err,
    )
    return None

Fetch metadata about a BigQuery job from the INFORMATION_SCHEMA.

def get_bigquery_job( project_id: str, region: str, job_id: str, skip_permission_check: bool = False) -> Optional[BigQueryJob]:
840def get_bigquery_job(
841    project_id: str,
842    region: str,
843    job_id: str,
844    skip_permission_check: bool = False) -> Union[BigQueryJob, None]:
845  """Fetch a BigQuery job, combining API and INFORMATION_SCHEMA data."""
846  try:
847    job_api_resource_data = get_bigquery_job_api_resource_data(
848        project_id, region, job_id)
849    if not job_api_resource_data:
850      return None
851  except utils.GcpApiError as err:
852    # This will be returned when permissions to fetch a job are missing.
853    if 'permission' in err.message.lower():
854      user_email = ''
855      try:
856        user_email = apis.get_user_email()
857      except (RuntimeError, AttributeError,
858              exceptions.DefaultCredentialsError) as error:
859        if (('has no attribute' in str(error)) and
860            ('with_quota_project' in str(error))):
861          op.info('Running the investigation within the GCA context.')
862      logging.debug(
863          'Could not retrieve BigQuery job %s; make sure %s has the bigquery.jobs.get'
864          ' and bigquery.jobs.create permissions.', project_id + ':' + region + '.' + job_id, user_email)
865      raise utils.GcpApiError(err)
866    # This will be returned when a job is not found.
867    elif 'not found' in err.message.lower():
868      job_id_string = project_id + ':' + region + '.' + job_id
869      logging.debug('Could not find BigQuery job %s', job_id_string)
870      return None
871    else:
872      logging.debug(
873          'Could not retrieve BigQuery job %s due to an issue calling the API.'
874          ' Please restart the investigation.',
875          project_id + ':' + region + '.' + job_id)
876      return None
877  information_schema_job_metadata = {}
878  job_creation_millis = None
879  creation_time_str = job_api_resource_data.get('statistics',
880                                                {}).get('creationTime')
881  if creation_time_str:
882    try:
883      job_creation_millis = int(creation_time_str)
884    except (ValueError, TypeError):
885      pass
886  information_schema_job_metadata = get_information_schema_job_metadata(
887      project_id, region, job_id, job_creation_millis, skip_permission_check)
888  return BigQueryJob(
889      project_id=project_id,
890      job_api_resource_data=job_api_resource_data,
891      information_schema_job_metadata=information_schema_job_metadata)

Fetch a BigQuery job, combining API and INFORMATION_SCHEMA data.
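
A short sketch of the combined lookup under the same hypothetical identifiers; the returned BigQueryJob wraps the API resource data together with whatever INFORMATION_SCHEMA metadata could be fetched:

  from gcpdiag.queries import bigquery

  job = bigquery.get_bigquery_job(
      project_id='example-project',
      region='us',
      job_id='bquxjob_1234abcd_0123456789ab',
  )
  if job is None:
    print('Job not found, or the API call failed.')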

def get_query_results( project_id: str, query: str, location: Optional[str] = None, timeout_sec: int = 30, poll_interval_sec: int = 2) -> Optional[List[dict[str, Any]]]:
 931def get_query_results(
 932    project_id: str,
 933    query: str,
 934    location: Optional[str] = None,
 935    timeout_sec: int = 30,
 936    poll_interval_sec: int = 2,
 937) -> Optional[List[dict[str, Any]]]:
 938  """Executes a BigQuery query, waits for completion, and returns the results.
 939
 940  Args:
 941      project_id: The GCP project ID where the query should run.
 942      query: The SQL query string to execute.
 943      location: The location (e.g., 'US', 'EU', 'us-central1') where the job
 944        should run. If None, BigQuery defaults might apply, often based on
 945        dataset locations if referenced.
 946      timeout_sec: Maximum time in seconds to wait for the query job to
 947        complete.
 948      poll_interval_sec: Time in seconds between polling the job status.
 949
 950  Returns:
 951      A list of dictionaries representing the result rows, or None if the
 952      query fails, times out, or the API is disabled.
 953  Raises:
 954      utils.GcpApiError: If an unrecoverable API error occurs during job
 955                         insertion, status check, or result fetching.
 956  """
 957  if not apis.is_enabled(project_id, 'bigquery'):
 958    logging.warning('BigQuery API is not enabled in project %s.', project_id)
 959    return None
 960  api = apis.get_api('bigquery', 'v2', project_id)
 961  job_id = f'gcpdiag_query_{uuid.uuid4()}'
 962  job_body = {
 963      'jobReference': {
 964          'projectId': project_id,
 965          'jobId': job_id,
 966          'location': location,  # Location can be None
 967      },
 968      'configuration': {
 969          'query': {
 970              'query': query,
 971              'useLegacySql': False,
 972              # Consider adding priority, destinationTable, etc. if needed
 973          }
 974      },
 975  }
 976  try:
 977    logging.debug(
 978        'Starting BigQuery job %s in project %s, location %s',
 979        job_id,
 980        project_id,
 981        location or 'default',
 982    )
 983    insert_request = api.jobs().insert(projectId=project_id, body=job_body)
 984    insert_response = insert_request.execute(num_retries=config.API_RETRIES)
 985    job_ref = insert_response['jobReference']
 986    actual_job_id = job_ref['jobId']
 987    actual_location = job_ref.get('location')  # Get location assigned by BQ
 988    logging.debug('Job %s created. Polling for completion...', actual_job_id)
 989    start_time = time.time()
 990    while True:
 991      # Check for timeout
 992      if time.time() - start_time > timeout_sec:
 993        logging.error(
 994            'BigQuery job %s timed out after %d seconds.',
 995            actual_job_id,
 996            timeout_sec,
 997        )
 998        return None
 999      # Get job status
1000      logging.debug('Getting job status for %s', actual_job_id)
1001      get_request = api.jobs().get(
1002          projectId=job_ref['projectId'],
1003          jobId=actual_job_id,
1004          location=actual_location,
1005      )
1006      job_status_response = get_request.execute(num_retries=config.API_RETRIES)
1007      status = job_status_response.get('status', {})
1008      logging.debug('Job status: %s', status.get('state'))
1009      if status.get('state') == 'DONE':
1010        if status.get('errorResult'):
1011          error_info = status['errorResult']
1012          error_message = error_info.get('message', '')
1013          if 'User does not have permission to query table' in error_message:
1014            op.info(  # strip the leading 'Access Denied: ' prefix
1015                error_message.removeprefix('Access Denied: ') +
1016                '\nContinuing the investigation with the job metadata obtained from the API.'
1017            )
1018          else:
1019            # The job failed for another reason; log the error details.
1020            logging.error(
1021                'BigQuery job %s failed. Reason: %s, Message: %s',
1022                actual_job_id,
1023                error_info.get('reason'),
1024                error_info.get('message'),
1025            )
1026            # Log detailed errors if available
1027            for error in status.get('errors', []):
1028              logging.error(
1029                  '  - Detail: %s (Location: %s)',
1030                  error.get('message'),
1031                  error.get('location'),
1032              )
1033          return None
1034        else:
1035          logging.debug('BigQuery job %s completed successfully.',
1036                        actual_job_id)
1037          break  # Job finished successfully
1038      elif status.get('state') in ['PENDING', 'RUNNING']:
1039        logging.debug('Job still running; sleeping before the next poll...')
1040        # Job still running, wait and poll again
1041        time.sleep(poll_interval_sec)
1042      else:
1043        # Unexpected state
1044        logging.error(
1045            'BigQuery job %s entered unexpected state: %s',
1046            actual_job_id,
1047            status.get('state', 'UNKNOWN'),
1048        )
1049        return None
1050    # Fetch results
1051    logging.debug('Fetching results for job %s...',
1052                  actual_job_id)
1053    results_request = api.jobs().getQueryResults(
1054        projectId=job_ref['projectId'],
1055        jobId=actual_job_id,
1056        location=actual_location,
1057        # Add startIndex, maxResults for pagination if needed
1058    )
1059    results_response = results_request.execute(num_retries=config.API_RETRIES)
1060    # Check if job actually completed (getQueryResults might return before DONE sometimes)
1061    if not results_response.get('jobComplete', False):
1062      logging.warning(
1063          'getQueryResults returned jobComplete=False for job %s, results might'
1064          ' be incomplete.',
1065          actual_job_id,
1066      )
1067      # Results may be partial at this point; callers should treat them as best-effort.
1068    rows = []
1069    if 'rows' in results_response and 'schema' in results_response:
1070      schema_fields = results_response['schema'].get('fields')
1071      if not schema_fields:
1072        return []
1073      for row_data in results_response['rows']:
1074        if 'f' in row_data:
1075          rows.append(_parse_row(schema_fields, row_data['f']))
1076    if results_response.get('pageToken'):
1077      logging.warning(
1078          'Query results for job %s are paginated, but pagination '
1079          'is not yet implemented.',
1080          actual_job_id,
1081      )
1082    return rows
1083  except errors.HttpError as err:
1084    logging.error('API error during BigQuery query execution for job %s: %s',
1085                  job_id, err)
1086    # Raise specific GcpApiError if needed for upstream handling
1087    raise utils.GcpApiError(err) from err
1088  except Exception as e:
1089    logging.exception(
1090        'Unexpected error during BigQuery query execution for job %s: %s',
1091        job_id,
1092        e,
1093    )
1094    # Re-raise or handle as appropriate
1095    raise

Executes a BigQuery query, waits for completion, and returns the results.

Arguments:
  • project_id: The GCP project ID where the query should run.
  • query: The SQL query string to execute.
  • location: The location (e.g., 'US', 'EU', 'us-central1') where the job should run. If None, BigQuery defaults might apply, often based on dataset locations if referenced.
  • timeout_sec: Maximum time in seconds to wait for the query job to complete.
  • poll_interval_sec: Time in seconds between polling the job status.
Returns:

A list of dictionaries representing the result rows, or None if the query fails, times out, or the API is disabled.

Raises:
  • utils.GcpApiError: If an unrecoverable API error occurs during job insertion, status check, or result fetching.
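
A brief usage sketch with a hypothetical project; each returned row is a dict keyed by column name, as produced by the schema-aware row parsing above:

  from gcpdiag.queries import bigquery

  rows = bigquery.get_query_results(
      project_id='example-project',
      query=('SELECT job_id, state'
             ' FROM `example-project`.`region-us`.INFORMATION_SCHEMA.JOBS'
             ' LIMIT 10'),
      location='us',
      timeout_sec=60,
      poll_interval_sec=2,
  )
  for row in rows or []:
    print(row.get('job_id'), row.get('state'))

Note that pagination is not implemented: if the response carries a pageToken, only the first page of rows is returned. Callers that need complete result sets could loop over jobs().getQueryResults() with its pageToken parameter, which the BigQuery v2 REST API supports.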
@caching.cached_api_call
def get_bigquery_project(project_id: str) -> gcpdiag.queries.crm.Project:
1098@caching.cached_api_call
1099def get_bigquery_project(project_id: str) -> crm.Project:
1100  """Attempts to retrieve project details for the supplied BigQuery project id or number.
1101
1102    If the project is found/accessible, it returns a Project object with the resource data.
1103    If the project cannot be retrieved, the application raises one of the exceptions below.
1104    The get_bigquery_project method avoids unnecessary printing of the error message to keep
1105    the user interface of the tool cleaner to focus on meaningful investigation results.
1106    Corresponding errors are handled gracefully downstream.
1107
1108    Args:
1109        project_id (str): The project id or number of
1110          the project (e.g., "123456789", "example-project").
1111
1112    Returns:
1113        Project: An object representing the BigQuery project's full details.
1114
1115    Raises:
1116        utils.GcpApiError: If there is an HTTP error calling the GCP API.
1117
1118    Usage:
1119        When using a project identifier from gcpdiag.models.Context:
1120
1121        project = get_bigquery_project(context.project_id)
1122
1123        With an unknown project identifier:
1124        try:
1125          project = get_bigquery_project('123456789')
1126        except utils.GcpApiError:
1127          ...  # Handle exception
1128        else:
1129          ...  # use project data
1130  """
1131  try:
1132    logging.debug('retrieving project %s ', project_id)
1133    crm_api = apis.get_api('cloudresourcemanager', 'v3', project_id)
1134    request = crm_api.projects().get(name=f'projects/{project_id}')
1135    response = request.execute(num_retries=config.API_RETRIES)
1136  except errors.HttpError as e:
1137    error = utils.GcpApiError(response=e)
1138    raise error from e
1139  else:
1140    return crm.Project(resource_data=response)

Attempts to retrieve project details for the supplied BigQuery project id or number.

If the project is found and accessible, it returns a Project object with the resource data. If the project cannot be retrieved, one of the exceptions below is raised. get_bigquery_project avoids printing the error message itself, keeping the tool's user interface clean and focused on meaningful investigation results. Corresponding errors are handled gracefully downstream.

Arguments:
  • project_id (str): The project id or number of the project (e.g., "123456789", "example-project").
Returns:

Project: An object representing the BigQuery project's full details.

Raises:
  • utils.GcpApiError: If there is an HTTP error calling the GCP API.
Usage:

When using a project identifier from gcpdiag.models.Context:

  project = get_bigquery_project(context.project_id)

With an unknown project identifier:

  try:
    project = get_bigquery_project('123456789')
  except utils.GcpApiError:
    ...  # handle the exception
  else:
    ...  # use the project data
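
Downstream callers can distinguish failure modes by inspecting GcpApiError.message, mirroring the pattern get_bigquery_job uses above; a hedged sketch with a hypothetical project number:

  from gcpdiag import utils
  from gcpdiag.queries import bigquery

  try:
    project = bigquery.get_bigquery_project('123456789')
  except utils.GcpApiError as err:
    if 'permission' in err.message.lower():
      print('Grant resourcemanager.projects.get on the project.')
    elif 'not found' in err.message.lower():
      print('Project does not exist or is not visible.')
    project = None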