NVIDIA · jinsolp · May 19, 2026 · May 26, 2026 · May 26, 2026 · May 26, 2026
@@ -10,6 +10,8 @@
 import warnings
 
 from .utils import (
+    add_jitter,
+    is_l2_normalized,
     groundtruth_neighbors_filename,
     memmap_bin_file,
     offset_neighbor_indices,
@@ -111,6 +113,25 @@ def choose_random_queries(dataset, n_queries):
     return dataset[query_idx, :]
 
 
+def choose_random_queries_with_jitter(dataset, n_queries, seed=12345):
+    """Sample rows from ``dataset`` and perturb them into query vectors.
+
+    ``n_queries`` distinct rows are drawn without replacement, copied to
+    float32 and passed to ``add_jitter``, which adds Gaussian noise at
+    scale ``0.1 * std(sample)``. The sample is re-normalized to unit L2
+    norm iff ``is_l2_normalized`` reports that the *sampled* rows (not the
+    whole dataset) are already unit-norm.
+
+    Parameters
+    ----------
+    dataset : array-like of shape (n_rows, n_cols)
+        Source vectors; must expose ``shape`` and support fancy row
+        indexing (``dataset[idx, :]``).
+    n_queries : int
+        Number of distinct rows to draw.
+    seed : int
+        Seed of the generator used for both row selection and noise.
+
+    Returns
+    -------
+    numpy.ndarray
+        The jittered queries as returned by ``add_jitter``.
+
+    Raises
+    ------
+    ValueError
+        If ``n_queries`` exceeds the number of rows in ``dataset``.
+    """
+    import numpy as _np
+
+    n_rows = dataset.shape[0]
+    if n_queries > n_rows:
+        # Sampling is without replacement; fail with a clear message
+        # instead of the generic error raised inside ``rng.choice``.
+        raise ValueError(
+            f"Cannot choose {n_queries} distinct queries from a dataset "
+            f"with only {n_rows} rows"
+        )
+
+    print("Choosing random vectors from dataset and jittering with noise")
+    rng = _np.random.default_rng(seed)
+    # Sort indices so the memmap read is sequential rather than random-access.
+    query_idx = _np.sort(rng.choice(n_rows, size=n_queries, replace=False))
+    # NOTE(review): the sample is always converted to float32, whatever the
+    # dataset dtype is -- confirm the caller stores it under a matching
+    # file suffix when the dataset is int8/uint8.
+    sampled = dataset[query_idx, :].astype(_np.float32, copy=True)
+
+    # Decide on re-normalization from the clean sample, before adding noise.
+    normalize = is_l2_normalized(sampled)
+
+    return add_jitter(sampled, rng, normalize)
+
+
 def cpu_search(dataset, queries, k, metric="squeclidean"):
     """
     Find the k nearest neighbors for each query point in the dataset using the
@@ -235,18 +256,22 @@ def main():
         "The input and output files are in big-ann-benchmark's binary format.",
         epilog="""Example usage
     # With existing query file
-    python -m cuvs_bench.generate_groundtruth --dataset /dataset/base.\
-fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin
+    python -m cuvs_bench.generate_groundtruth /dataset/base.fbin \
+--output=groundtruth_dir --queries=/dataset/query.public.10K.fbin
 
     # With randomly generated queries
-    python -m cuvs_bench.generate_groundtruth --dataset /dataset/base.\
-fbin --output=groundtruth_dir --queries=random --n_queries=10000
+    python -m cuvs_bench.generate_groundtruth /dataset/base.fbin \
+--output=groundtruth_dir --queries=random --n_queries=10000
 
     # Using only a subset of the dataset. Define queries by randomly
     # selecting vectors from the (subset of the) dataset.
-    python -m cuvs_bench.generate_groundtruth --dataset /dataset/base.\
-fbin --nrows=2000000 --cols=128 --output=groundtruth_dir \
+    python -m cuvs_bench.generate_groundtruth /dataset/base.fbin \
+--rows=2000000 --cols=128 --output=groundtruth_dir \
 --queries=random-choice --n_queries=10000
+
+    # Jittered queries (following the logic of cuvs_bench.synthesize_dataset)
+    python -m cuvs_bench.generate_groundtruth /dataset/base.fbin \
+--output=groundtruth_dir --queries=random-jitter --n_queries=10000
     """,
         formatter_class=argparse.RawDescriptionHelpFormatter,
     )
@@ -256,9 +281,12 @@ def main():
         "--queries",
         type=str,
         default="random",
-        help="Queries file name, or one of 'random-choice' or 'random' "
-        "(default). 'random-choice': select n_queries vectors from the input "
-        "dataset. 'random': generate n_queries as uniform random numbers.",
+        help="Queries file name, or one of 'random-choice', 'random-jitter', "
+        "or 'random' (default). 'random-choice': select n_queries vectors "
+        "from the input dataset. 'random-jitter': same as 'random-choice', "
+        "but add std-relative Gaussian noise to each query and re-normalize "
+        "if the dataset rows are unit-norm. 'random': generate n_queries "
+        "as uniform random numbers.",
     )
     parser.add_argument(
         "--output",
@@ -341,7 +369,7 @@ def main():
     if len(args.output) > 0:
         os.makedirs(args.output, exist_ok=True)
 
-    if args.queries == "random" or args.queries == "random-choice":
+    if args.queries in {"random", "random-choice", "random-jitter"}:
         if args.n_queries is None:
             raise RuntimeError(
                 "n_queries must be given to generate random queries"
@@ -352,6 +380,10 @@ def main():
             )
         elif args.queries == "random-choice":
             queries = choose_random_queries(dataset, args.n_queries)
+        elif args.queries == "random-jitter":
+            queries = choose_random_queries_with_jitter(
+                dataset, args.n_queries
+            )
 
         queries_filename = os.path.join(
             args.output, "queries" + suffix_from_dtype(dtype)

@@ -10,6 +10,43 @@
 from cuvs_bench._bin_format import read_bin_header, write_bin_header
 
 
+def is_l2_normalized(
+    data,
+    sample_size: int = 10_000,
+    tol: float = 1e-2,
+    seed: int = 0,
+) -> bool:
+    """Cheaply check whether the rows of ``data`` have unit L2 norm.
+
+    At most ``sample_size`` rows are inspected. When ``data`` has more rows
+    than that, a uniform random subset (without replacement) is used;
+    otherwise every row is checked.
+
+    Parameters
+    ----------
+    data : array-like of shape (n_rows, n_cols)
+        Must support ``len()`` and fancy row indexing.
+    sample_size : int
+        Upper bound on the number of rows inspected.
+    tol : float
+        Allowed absolute deviation of a row norm from 1.
+    seed : int
+        Seed of the generator used to pick the rows.
+
+    Returns
+    -------
+    bool
+        ``True`` iff every inspected row satisfies
+        ``abs(norm(x) - 1) < tol``. ``False`` when there is nothing to
+        inspect (empty ``data`` or non-positive ``sample_size``).
+    """
+    n = len(data)
+    take = min(sample_size, n)
+    if take <= 0:
+        # Nothing to inspect: never vacuously report "normalized".
+        return False
+    if take == n:
+        # Every row is checked anyway, so skip the permutation and the
+        # copy that fancy indexing would make.
+        rows = data
+    else:
+        rng = np.random.default_rng(seed)
+        # Sorted indices keep the reads sequential if ``data`` is
+        # file-backed (e.g. a memmap); the verdict is order-independent.
+        rows = data[np.sort(rng.choice(n, size=take, replace=False))]
+    norms = np.linalg.norm(np.asarray(rows, dtype=np.float32), axis=1)
+    return bool(np.all(np.abs(norms - 1.0) < tol))
+
+
+def add_jitter(
+    queries: np.ndarray,
+    rng: np.random.Generator,
+    normalize: bool,
+    noise_factor: float = 0.1,
+) -> np.ndarray:
+    """Add zero-mean Gaussian jitter to query vectors.
+
+    The noise standard deviation is ``noise_factor`` times the standard
+    deviation taken over all elements of ``queries``. The input array is
+    not modified.
+
+    Parameters
+    ----------
+    queries : numpy.ndarray of shape (n_queries, n_cols)
+        Vectors to perturb.
+    rng : numpy.random.Generator
+        Source of the Gaussian noise.
+    normalize : bool
+        If true, rescale every jittered row to unit L2 norm.
+    noise_factor : float
+        Noise scale relative to ``std(queries)``; must be non-negative,
+        otherwise ``rng.normal`` raises ``ValueError``.
+
+    Returns
+    -------
+    numpy.ndarray
+        A new float32 array with the same shape as ``queries``.
+    """
+    if queries.size == 0:
+        # ``np.std`` of an empty array is NaN and emits a RuntimeWarning;
+        # there is nothing to perturb, so return early.
+        return queries.astype(np.float32)
+    noise_scale = float(np.std(queries)) * noise_factor
+    noise = rng.normal(0, noise_scale, queries.shape).astype(np.float32)
+    jittered = queries + noise
+    if normalize:
+        norms = np.linalg.norm(jittered, axis=1, keepdims=True)
+        # Floor the norm so an all-zero row does not divide by zero.
+        jittered = jittered / np.maximum(norms, 1e-8)
+    return jittered.astype(np.float32)
+
+
 def dtype_from_filename(filename):
     ext = os.path.splitext(filename)[1]
     if ext == ".fbin":