Closed
Labels
- api: bigquery (Issues related to the googleapis/python-bigquery API.)
- priority: p1 (Important issue which blocks shipping the next release. Will be fixed prior to next release.)
- type: bug (Error or flaw in code with unintended results or allowing sub-optimal usage patterns.)
Description
Environment details
- OS type and version: Google Colab
- Python version (python --version): 3.11
- pip version (pip --version): N/A
- google-cloud-bigquery version (pip show google-cloud-bigquery): 3.25.0
Steps to reproduce
It's a bit difficult to reproduce, but it seems to happen most often when creating a vector index via CREATE VECTOR INDEX DDL, as generated by bigframes.bigquery.create_vector_index.
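For context, the statement generated by bigframes.bigquery.create_vector_index for the call in the sample below should look roughly like this sketch (reconstructed from the parameters used; the exact SQL bigframes emits may differ):

# Rough sketch of the generated DDL, not bigframes' exact output; assumes the
# DATASET_ID and TEXT_EMBEDDING_TABLE_ID variables defined in the sample below.
ddl = f"""
CREATE OR REPLACE VECTOR INDEX `bf_python_index`
ON `{DATASET_ID}.{TEXT_EMBEDDING_TABLE_ID}`(ml_generate_embedding_result)
OPTIONS (index_type = 'IVF', distance_type = 'COSINE')
"""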
Code example
Long Python sample, sorry
PROJECT_ID = "my-project" # @param {type:"string"}
DATASET_ID = "my_dataset" # @param {type:"string"}
CONNECTION_ID = "bqml_llm_conn" # @param {type:"string"}
# Set the project id
! gcloud config set project {PROJECT_ID}
REGION = "US" # @param {type: "string"}
## Text embedding variables
TEXT_TABLE_ID = "PATENT_TABLE_BF"
TEXT_MODEL_ID = "llm_embedding_model"
TEXT_EMBEDDING_TABLE_ID = "patent_embedding_BF_test"
TEXT_EMBEDDING_TABLE_ID2 = "patent_embedding"
TEXT_MODEL_ENDPOINT = "text-embedding-004"
TEXT_VECTOR_INDEX_ID = "patent_index_BF"
import bigframes.pandas as bf
import bigframes.ml as bf_ml
import bigframes.bigquery as bf_bq
import bigframes.ml.llm as bf_llm
from google.cloud import bigquery
from google.cloud import storage
# Construct a BigQuery client object.
client = bigquery.Client()
import pandas as pd
from IPython.display import Image, display
from PIL import Image as PILImage
import io
bf.options.bigquery.project = PROJECT_ID
bf.options.bigquery.location = REGION
bf.options.bigquery.ordering_mode = "partial"
text_model = bf_llm.TextEmbeddingGenerator()  ## which endpoint does this use?
publications = bf.read_gbq('patents-public-data.google_patents_research.publications')
keep = (publications.embedding_v1.str.len() > 0) & (publications.title.str.len() > 0) & (publications.abstract.str.len() > 0)
## apply the mask to drop rows with empty embedding, title, or abstract
publications = publications[keep][["publication_number", "title", "abstract"]].rename(columns={'abstract': 'content'})
## creating embeddings for 10000 rows only for testing; raise the number below to embed more rows
publications_subset = publications[publications["content"].str.len() > 30].peek(10000)
embedding = text_model.predict(publications_subset)[["publication_number", "title", "content", "ml_generate_embedding_result", "ml_generate_embedding_status"]]
## keep only rows where embedding generation succeeded; the status string is empty on success and holds an error message on failure
embedding = embedding[embedding["ml_generate_embedding_status"].str.len() == 0]
# store embeddings in a BQ table
embedding.to_gbq(f"{DATASET_ID}.{TEXT_EMBEDDING_TABLE_ID}", if_exists='replace')
## python code to create index
bf_bq.create_vector_index(
    table_id=f"{DATASET_ID}.{TEXT_EMBEDDING_TABLE_ID}",
    column_name="ml_generate_embedding_result",
    replace=True,
    index_name="bf_python_index",
    distance_type="cosine",
    index_type="ivf",
)
Unfortunately, even this does not reproduce it every time, but it does seem pretty reproducible on one of the projects my colleague is using.
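In case it helps, a hypothetical stress harness (not part of the original repro) that reruns the index creation from the sample above to raise the odds of hitting the race:

# Hypothetical stress loop, not part of the original repro; reuses the
# variables and the create_vector_index call from the sample above.
for attempt in range(20):
    try:
        bf_bq.create_vector_index(
            table_id=f"{DATASET_ID}.{TEXT_EMBEDDING_TABLE_ID}",
            column_name="ml_generate_embedding_result",
            replace=True,
            index_name="bf_python_index",
            distance_type="cosine",
            index_type="ivf",
        )
    except Exception as exc:
        # A NotFound here matches the stack trace below.
        print(f"attempt {attempt}: {type(exc).__name__}: {exc}")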
Stack trace
/usr/local/lib/python3.10/dist-packages/bigframes/session/_io/bigquery/__init__.py in start_query_with_client(bq_client, sql, job_config, max_results, timeout, api_name, metrics)
229
230 try:
--> 231 query_job = bq_client.query(sql, job_config=job_config, timeout=timeout)
232 except google.api_core.exceptions.Forbidden as ex:
233 if "Drive credentials" in ex.message:
/usr/local/lib/python3.10/dist-packages/google/cloud/bigquery/client.py in query(self, query, job_config, job_id, job_id_prefix, location, project, retry, timeout, job_retry, api_method)
3490 )
3491 elif api_method == enums.QueryApiMethod.INSERT:
-> 3492 return _job_helpers.query_jobs_insert(
3493 self,
3494 query,
/usr/local/lib/python3.10/dist-packages/google/cloud/bigquery/_job_helpers.py in query_jobs_insert(client, query, job_config, job_id, job_id_prefix, location, project, retry, timeout, job_retry)
157 return query_job
158
--> 159 future = do_query()
160 # The future might be in a failed state now, but if it's
161 # unrecoverable, we'll find out when we ask for it's result, at which
/usr/local/lib/python3.10/dist-packages/google/cloud/bigquery/_job_helpers.py in do_query()
143
144 try:
--> 145 query_job = client.get_job(
146 job_id,
147 project=project,
/usr/local/lib/python3.10/dist-packages/google/cloud/bigquery/client.py in get_job(self, job_id, project, location, retry, timeout)
2199 span_attributes = {"path": path, "job_id": job_id, "location": location}
2200
-> 2201 resource = self._call_api(
2202 retry,
2203 span_name="BigQuery.getJob",
/usr/local/lib/python3.10/dist-packages/google/cloud/bigquery/client.py in _call_api(self, retry, span_name, span_attributes, job_ref, headers, **kwargs)
831 name=span_name, attributes=span_attributes, client=self, job_ref=job_ref
832 ):
--> 833 return call()
834
835 return call()
/usr/local/lib/python3.10/dist-packages/google/api_core/retry/retry_unary.py in retry_wrapped_func(*args, **kwargs)
291 self._initial, self._maximum, multiplier=self._multiplier
292 )
--> 293 return retry_target(
294 target,
295 self._predicate,
/usr/local/lib/python3.10/dist-packages/google/api_core/retry/retry_unary.py in retry_target(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)
151 except Exception as exc:
152 # defer to shared logic for handling errors
--> 153 _retry_error_helper(
154 exc,
155 deadline,
/usr/local/lib/python3.10/dist-packages/google/api_core/retry/retry_base.py in _retry_error_helper(exc, deadline, next_sleep, error_list, predicate_fn, on_error_fn, exc_factory_fn, original_timeout)
210 original_timeout,
211 )
--> 212 raise final_exc from source_exc
213 if on_error_fn is not None:
214 on_error_fn(exc)
/usr/local/lib/python3.10/dist-packages/google/api_core/retry/retry_unary.py in retry_target(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)
142 for sleep in sleep_generator:
143 try:
--> 144 result = target()
145 if inspect.isawaitable(result):
146 warnings.warn(_ASYNC_RETRY_WARNING)
/usr/local/lib/python3.10/dist-packages/google/cloud/_http/__init__.py in api_request(self, method, path, query_params, data, content_type, headers, api_base_url, api_version, expect_json, _target_object, timeout, extra_api_info)
492
493 if not 200 <= response.status_code < 300:
--> 494 raise exceptions.from_http_response(response)
495
496 if expect_json and response.content:
NotFound: 404 GET https://bigquery.googleapis.com/bigquery/v2/projects/my-project/jobs/c540a00e-05ed-4caf-b76c-b23d16b9617f?projection=full&location=US&prettyPrint=false: Not found: Job my-project:US.c540a00e-05ed-4caf-b76c-b23d16b9617f
Note that it is getting a 404 here:
query_job = client.get_job(
This comes after a Conflict exception, meaning the job was created successfully, the retried insert failed because the job already exists, but then fetching that already-existing job failed with a 404. Something funky is going on with the networking for this case to happen, I think, but it should be possible to simulate in unit tests, at least.
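A rough sketch of how that sequence might be simulated in a unit test by stubbing the HTTP connection (the test name and structure here are hypothetical, and internals may differ between versions; as of 3.25.0 the NotFound appears to escape to the caller):

from unittest import mock

import pytest
from google.api_core import exceptions
from google.auth.credentials import AnonymousCredentials
from google.cloud import bigquery


def test_insert_conflict_then_get_job_404():
    client = bigquery.Client(project="my-project", credentials=AnonymousCredentials())

    # Simulate the observed sequence at the HTTP layer: the retried
    # jobs.insert raises Conflict (the job actually exists server-side),
    # then the recovery jobs.get for that same job raises NotFound.
    client._connection = mock.Mock(spec=["api_request"])
    client._connection.api_request.side_effect = [
        exceptions.Conflict("Already Exists: Job my-project:US.abc123"),
        exceptions.NotFound("Not found: Job my-project:US.abc123"),
    ]

    # As reported, the NotFound currently escapes to the caller instead
    # of being handled as part of the Conflict recovery.
    with pytest.raises(exceptions.NotFound):
        client.query("SELECT 1")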