Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions bigframes/_config/bigquery_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@

import google.api_core.exceptions
import google.auth.credentials
import jellyfish

import bigframes.constants
import bigframes.enums
Expand All @@ -37,6 +36,7 @@


def _get_validated_location(value: Optional[str]) -> Optional[str]:
import bigframes._tools.strings

if value is None or value in bigframes.constants.ALL_BIGQUERY_LOCATIONS:
return value
Expand All @@ -53,7 +53,7 @@ def _get_validated_location(value: Optional[str]) -> Optional[str]:

possibility = min(
bigframes.constants.ALL_BIGQUERY_LOCATIONS,
key=lambda item: jellyfish.levenshtein_distance(location, item),
key=lambda item: bigframes._tools.strings.levenshtein_distance(location, item),
)
# There are many layers before we get to (possibly) the user's code:
# -> bpd.options.bigquery.location = "us-central-1"
Expand Down
19 changes: 19 additions & 0 deletions bigframes/_tools/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""_tools is a collection of helper functions with minimal dependencies.

Please keep the dependencies used in this subpackage to a minimum to avoid the
risk of circular dependencies.
"""
66 changes: 66 additions & 0 deletions bigframes/_tools/strings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Helper methods for processing strings with minimal dependencies.

Please keep the dependencies used in this subpackage to a minimum to avoid the
risk of circular dependencies.
"""

import numpy


def levenshtein_distance(left: str, right: str) -> int:
    """Compute the Levenshtein (edit) distance between two strings.

    This is the minimum number of single-character substitutions,
    insertions, and deletions needed to transform ``left`` into
    ``right``. See:
    https://en.wikipedia.org/wiki/Levenshtein_distance

    Args:
        left: the source string.
        right: the target string.

    Returns:
        The edit distance as a plain ``int``.
    """
    # TODO(tswast): accelerate with numba (if available) if we end up using this
    # function in contexts other than when raising an exception or there are too
    # many values to compare even in that context.

    # Two-row dynamic programming. previous_row[j] is the edit distance from
    # the first i characters of ``left`` to the first j characters of
    # ``right``. Plain int lists are used (rather than float numpy arrays) so
    # the result is an exact ``int``, matching the annotated return type.
    previous_row = list(range(len(right) + 1))
    current_row = [0] * (len(right) + 1)

    for left_index, left_char in enumerate(left):
        # Distance from the first (left_index + 1) chars of ``left`` to an
        # empty prefix of ``right`` is to delete them all.
        current_row[0] = left_index + 1

        for right_index, right_char in enumerate(right):
            deletion_cost = previous_row[right_index + 1] + 1
            insertion_cost = current_row[right_index] + 1
            # Matching characters carry the diagonal value forward unchanged.
            substitution_cost = previous_row[right_index] + (
                0 if left_char == right_char else 1
            )

            current_row[right_index + 1] = min(
                deletion_cost, insertion_cost, substitution_cost
            )

        previous_row, current_row = current_row, previous_row

    return previous_row[len(right)]
5 changes: 3 additions & 2 deletions bigframes/core/groupby/dataframe_group_by.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

import bigframes_vendored.constants as constants
import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby
import jellyfish
import pandas as pd

from bigframes import session
Expand Down Expand Up @@ -87,6 +86,8 @@ def __getitem__(
typing.Sequence[blocks.Label],
],
):
import bigframes._tools.strings

if utils.is_list_like(key):
keys = list(key)
else:
Expand All @@ -101,7 +102,7 @@ def __getitem__(
possible_key.append(
min(
self._block.column_labels,
key=lambda item: jellyfish.damerau_levenshtein_distance(
key=lambda item: bigframes._tools.strings.levenshtein_distance(
bad_key, item
),
)
Expand Down
10 changes: 7 additions & 3 deletions bigframes/session/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
import google.cloud.bigquery_storage_v1
import google.cloud.functions_v2
import google.cloud.resourcemanager_v3
import jellyfish
import pandas
import pandas_gbq.schema.pandas_to_bigquery # type: ignore

Expand Down Expand Up @@ -296,6 +295,7 @@ def read_gbq_table(
filters: third_party_pandas_gbq.FiltersType = (),
enable_snapshot: bool = True,
) -> dataframe.DataFrame:
import bigframes._tools.strings
import bigframes.dataframe as dataframe

# ---------------------------------
Expand Down Expand Up @@ -336,7 +336,9 @@ def read_gbq_table(
if key not in table_column_names:
possibility = min(
table_column_names,
key=lambda item: jellyfish.levenshtein_distance(key, item),
key=lambda item: bigframes._tools.strings.levenshtein_distance(
key, item
),
)
raise ValueError(
f"Column '{key}' of `columns` not found in this table. Did you mean '{possibility}'?"
Expand All @@ -354,7 +356,9 @@ def read_gbq_table(
if key not in table_column_names:
possibility = min(
table_column_names,
key=lambda item: jellyfish.levenshtein_distance(key, item),
key=lambda item: bigframes._tools.strings.levenshtein_distance(
key, item
),
)
raise ValueError(
f"Column '{key}' of `index_col` not found in this table. Did you mean '{possibility}'?"
Expand Down
2 changes: 0 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,6 @@
"google-cloud-iam >=2.12.1",
"google-cloud-resource-manager >=1.10.3",
"google-cloud-storage >=2.0.0",
# Upper bound due to no windows build for 1.1.2
"jellyfish >=0.8.9,<1.1.2",
"numpy >=1.24.0",
"pandas >=1.5.3",
"pandas-gbq >=0.26.1",
Expand Down
1 change: 0 additions & 1 deletion testing/constraints-3.9.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ google-cloud-bigquery-connection==1.12.0
google-cloud-iam==2.12.1
google-cloud-resource-manager==1.10.3
google-cloud-storage==2.0.0
jellyfish==0.8.9
numpy==1.24.0
pandas==1.5.3
pandas-gbq==0.26.1
Expand Down
19 changes: 19 additions & 0 deletions tests/unit/_tools/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for helper methods for processing Python objects with minimal dependencies.

Please keep the dependencies used in this subpackage to a minimum to avoid the
risk of circular dependencies.
"""
149 changes: 149 additions & 0 deletions tests/unit/_tools/test_strings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for helper methods for processing strings with minimal dependencies.

Please keep the dependencies used in this subpackage to a minimum to avoid the
risk of circular dependencies.
"""

import base64
import random
import sys
import uuid

import pytest

from bigframes._tools import strings

# To stress test some unicode comparisons.
# https://stackoverflow.com/a/39682429/101923
ALL_UNICODE_CHARS = "".join(chr(i) for i in range(32, 0x110000) if chr(i).isprintable())
# Randomized string fixtures shared by the fuzz-style tests below. These are
# generated once at import time, so every test in a given run sees the same
# values. Covers ASCII-heavy (uuid, hex, base64) and arbitrary-unicode inputs.
RANDOM_STRINGS = (
    pytest.param(str(uuid.uuid4()), id="uuid4"),
    pytest.param(hex(random.randint(0, sys.maxsize)), id="hex"),
    pytest.param(
        base64.b64encode(
            "".join(random.choice(ALL_UNICODE_CHARS) for _ in range(100)).encode(
                "utf-8"
            )
        ).decode("utf-8"),
        id="base64",
    ),
    pytest.param(
        "".join(random.choice(ALL_UNICODE_CHARS) for _ in range(8)), id="unicode8"
    ),
    pytest.param(
        "".join(random.choice(ALL_UNICODE_CHARS) for _ in range(64)), id="unicode64"
    ),
)


def random_char_not_equal(avoid: str):
    """Pick a random printable character guaranteed to differ from ``avoid``."""
    while True:
        candidate = random.choice(ALL_UNICODE_CHARS)
        if candidate != avoid:
            return candidate


def random_deletion(original: str):
    """Return ``original`` with one randomly chosen character removed."""
    index = random.randrange(len(original))
    prefix, suffix = original[:index], original[index + 1 :]
    return prefix + suffix


def random_insertion(original: str):
    """Return ``original`` with one random character inserted after a random index."""
    index = random.randrange(len(original))
    extra_char = random.choice(ALL_UNICODE_CHARS)
    return "".join((original[: index + 1], extra_char, original[index + 1 :]))


@pytest.mark.parametrize(
    ("left", "right", "expected"),
    (
        ("", "", 0),
        ("abc", "abc", 0),
        # Deletions
        ("abcxyz", "abc", 3),
        ("xyzabc", "abc", 3),
        ("AXYZBC", "ABC", 3),
        ("AXYZBC", "XYZ", 3),
        # Insertions
        ("abc", "abcxyz", 3),
        ("abc", "xyzabc", 3),
        # Substitutions
        ("abc", "aBc", 1),
        ("abcxyz", "aBcXyZ", 3),
        # Combinations
        ("abcdefxyz", "abcExyzα", 4),
    ),
)
def test_levenshtein_distance(left: str, right: str, expected: int):
    """Known-answer cases covering each edit type and mixes of them."""
    actual = strings.levenshtein_distance(left, right)
    assert actual == expected


@pytest.mark.parametrize(("random_string",), RANDOM_STRINGS)
def test_levenshtein_distance_equal_strings(random_string: str):
    """Mini fuzz test: identical strings are always at distance zero."""
    distance = strings.levenshtein_distance(random_string, random_string)
    assert distance == 0


@pytest.mark.parametrize(("random_string",), RANDOM_STRINGS)
def test_levenshtein_distance_random_deletion(random_string: str):
    """Mini fuzz test: deleting k characters gives edit distance exactly k."""

    num_deleted = random.randrange(1, min(10, len(random_string)))
    assert 1 <= num_deleted < len(random_string)

    shrunk = random_string
    for _ in range(num_deleted):
        shrunk = random_deletion(shrunk)

    assert shrunk != random_string
    assert len(random_string) - len(shrunk) == num_deleted
    assert strings.levenshtein_distance(random_string, shrunk) == num_deleted


@pytest.mark.parametrize(("random_string",), RANDOM_STRINGS)
def test_levenshtein_distance_random_insertion(random_string: str):
    """Mini fuzz test: inserting k characters gives edit distance exactly k."""

    num_inserted = random.randrange(1, min(10, len(random_string)))
    assert 1 <= num_inserted < len(random_string)

    grown = random_string
    for _ in range(num_inserted):
        grown = random_insertion(grown)

    assert grown != random_string
    assert len(grown) - len(random_string) == num_inserted
    assert strings.levenshtein_distance(random_string, grown) == num_inserted


@pytest.mark.parametrize(("random_string",), RANDOM_STRINGS)
def test_levenshtein_distance_random_substitution(random_string: str):
    """Mini fuzz test: replacing one character gives edit distance one.

    Note: we don't do multiple substitutions here to avoid accidentally
    substituting the same character twice.
    """
    index = random.randrange(len(random_string))
    replacement = random_char_not_equal(random_string[index])
    substituted = "".join(
        (random_string[:index], replacement, random_string[index + 1 :])
    )
    assert substituted != random_string
    assert len(substituted) == len(random_string)
    assert strings.levenshtein_distance(random_string, substituted) == 1