Skip to content
Open
52 changes: 42 additions & 10 deletions python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import warnings

from .utils import (
add_jitter,
is_l2_normalized,
groundtruth_neighbors_filename,
memmap_bin_file,
offset_neighbor_indices,
Expand Down Expand Up @@ -111,6 +113,25 @@ def choose_random_queries(dataset, n_queries):
return dataset[query_idx, :]


def choose_random_queries_with_jitter(dataset, n_queries, seed=12345):
    """Build jittered query vectors from randomly chosen dataset rows.

    ``n_queries`` distinct rows are drawn (without replacement) from
    ``dataset`` using a generator seeded with ``seed``, copied to float32,
    and handed to ``add_jitter`` together with the same generator.
    Re-normalization is requested only when ``is_l2_normalized`` reports
    that the drawn rows are already unit-norm.

    NOTE(review): the rows are always converted to float32 here, whatever
    the dtype of ``dataset`` is -- confirm that callers store the result
    under a matching dtype/suffix.
    """
    import numpy as _np

    print("Choosing random vectors from dataset and jittering with noise")
    generator = _np.random.default_rng(seed)

    # Draw distinct row numbers, then put them in ascending order so a
    # memmap-backed dataset is read front-to-back instead of randomly.
    drawn = generator.choice(dataset.shape[0], size=n_queries, replace=False)
    row_ids = _np.sort(drawn)

    base_queries = dataset[row_ids, :].astype(_np.float32, copy=True)

    # Decide on re-normalization from the rows we actually picked.
    should_renormalize = is_l2_normalized(base_queries)
    return add_jitter(base_queries, generator, should_renormalize)

Comment thread
jinsolp marked this conversation as resolved.

def cpu_search(dataset, queries, k, metric="squeclidean"):
"""
Find the k nearest neighbors for each query point in the dataset using the
Expand Down Expand Up @@ -235,18 +256,22 @@ def main():
"The input and output files are in big-ann-benchmark's binary format.",
epilog="""Example usage
# With existing query file
python -m cuvs_bench.generate_groundtruth --dataset /dataset/base.\
fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin
python -m cuvs_bench.generate_groundtruth /dataset/base.fbin \
--output=groundtruth_dir --queries=/dataset/query.public.10K.fbin

# With randomly generated queries
python -m cuvs_bench.generate_groundtruth --dataset /dataset/base.\
fbin --output=groundtruth_dir --queries=random --n_queries=10000
python -m cuvs_bench.generate_groundtruth /dataset/base.fbin \
--output=groundtruth_dir --queries=random --n_queries=10000

# Using only a subset of the dataset. Define queries by randomly
# selecting vectors from the (subset of the) dataset.
python -m cuvs_bench.generate_groundtruth --dataset /dataset/base.\
fbin --nrows=2000000 --cols=128 --output=groundtruth_dir \
python -m cuvs_bench.generate_groundtruth /dataset/base.fbin \
--rows=2000000 --cols=128 --output=groundtruth_dir \
--queries=random-choice --n_queries=10000

# Jittered queries (following the logic of cuvs_bench.synthesize_dataset)
python -m cuvs_bench.generate_groundtruth /dataset/base.fbin \
--output=groundtruth_dir --queries=random-jitter --n_queries=10000
""",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
Expand All @@ -256,9 +281,12 @@ def main():
"--queries",
type=str,
default="random",
help="Queries file name, or one of 'random-choice' or 'random' "
"(default). 'random-choice': select n_queries vectors from the input "
"dataset. 'random': generate n_queries as uniform random numbers.",
help="Queries file name, or one of 'random-choice', 'random-jitter', "
"or 'random' (default). 'random-choice': select n_queries vectors "
"from the input dataset. 'random-jitter': same as 'random-choice', "
"but add std-relative Gaussian noise to each query and re-normalize "
"if the dataset rows are unit-norm. 'random': generate n_queries "
"as uniform random numbers.",
)
parser.add_argument(
"--output",
Expand Down Expand Up @@ -341,7 +369,7 @@ def main():
if len(args.output) > 0:
os.makedirs(args.output, exist_ok=True)

if args.queries == "random" or args.queries == "random-choice":
if args.queries in {"random", "random-choice", "random-jitter"}:
if args.n_queries is None:
raise RuntimeError(
"n_queries must be given to generate random queries"
Expand All @@ -352,6 +380,10 @@ def main():
)
elif args.queries == "random-choice":
queries = choose_random_queries(dataset, args.n_queries)
elif args.queries == "random-jitter":
queries = choose_random_queries_with_jitter(
dataset, args.n_queries
)

queries_filename = os.path.join(
args.output, "queries" + suffix_from_dtype(dtype)
Expand Down
37 changes: 37 additions & 0 deletions python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,43 @@
from cuvs_bench._bin_format import read_bin_header, write_bin_header


def is_l2_normalized(
    data,
    sample_size: int = 10_000,
    tol: float = 1e-2,
    seed: int = 0,
) -> bool:
    """Cheaply check whether ``data`` rows are L2-unit-norm.

    Inspects up to ``sample_size`` rows chosen uniformly at random (without
    replacement) and returns ``True`` iff every inspected row has
    ``|‖x‖ - 1| < tol``. Norms are computed in float32.

    Parameters
    ----------
    data
        2-D array-like of row vectors supporting ``len()`` and NumPy-style
        indexing (e.g. ``np.ndarray`` or ``np.memmap``).
    sample_size
        Maximum number of rows to inspect; must be at least 1.
    tol
        Allowed absolute deviation of a row norm from 1.
    seed
        Seed for the row-sampling generator, so the verdict is reproducible.

    Returns
    -------
    bool
        ``False`` for empty ``data``; otherwise whether all inspected rows
        are unit-norm within ``tol``.

    Raises
    ------
    ValueError
        If ``sample_size`` is less than 1. Inspecting zero rows would make
        the check vacuously true.
    """
    if sample_size < 1:
        raise ValueError(
            f"sample_size must be a positive integer, got {sample_size}"
        )
    n = len(data)
    if n == 0:
        return False

    if sample_size >= n:
        # Every row is inspected anyway: skip the sampling step and the
        # extra fancy-indexing copy it would cause.
        rows = data
    else:
        rng = np.random.default_rng(seed)
        # Ascending order keeps reads sequential for memmap-backed data;
        # the verdict does not depend on the order rows are visited in.
        idx = np.sort(rng.choice(n, size=sample_size, replace=False))
        rows = data[idx]

    norms = np.linalg.norm(np.asarray(rows, dtype=np.float32), axis=1)
    return bool(np.all(np.abs(norms - 1.0) < tol))


def add_jitter(
    queries: np.ndarray,
    rng: np.random.Generator,
    normalize: bool,
) -> np.ndarray:
    """Perturb query vectors with zero-mean Gaussian noise.

    The noise standard deviation is one tenth of the standard deviation
    taken over all elements of ``queries``; samples are drawn from ``rng``.
    When ``normalize`` is true, each perturbed row is divided by its L2
    norm, with the norm clamped below at ``1e-8`` so an all-zero row does
    not divide by zero. The input array is not modified, and the result is
    always float32.
    """
    sigma = 0.1 * float(np.std(queries))
    noise = rng.normal(0, sigma, queries.shape).astype(np.float32)
    jittered = queries + noise

    if not normalize:
        return jittered.astype(np.float32)

    # Rescale every row to unit length, guarding against zero-length rows.
    row_norms = np.linalg.norm(jittered, axis=1, keepdims=True)
    safe_norms = np.maximum(row_norms, 1e-8)
    return (jittered / safe_norms).astype(np.float32)


def dtype_from_filename(filename):
ext = os.path.splitext(filename)[1]
if ext == ".fbin":
Expand Down
Loading
Loading