diff --git a/benchmarks/vector-search-bench/scripts/plot-turboquant-distortion.py b/benchmarks/vector-search-bench/scripts/plot-turboquant-distortion.py new file mode 100644 index 00000000000..4844d7ed53d --- /dev/null +++ b/benchmarks/vector-search-bench/scripts/plot-turboquant-distortion.py @@ -0,0 +1,549 @@ +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "matplotlib", +# ] +# /// + +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors + +"""Sweep bits-vs-distortion for TurboQuant and plot the curves. + +Calls `vector-search-bench distortion` for each (dataset, bits) combination, parses the +table from stdout, and plots reconstruction MSE and pairwise cosine-error curves with +mean/median/max shown on a log-scaled y-axis. + +Each `--dataset` value may optionally pin a train layout with a colon, e.g. +`--dataset cohere-small-100k:single`, for datasets that host more than one layout. + +Usage: + uv run benchmarks/vector-search-bench/scripts/plot-turboquant-distortion.py \\ + --dataset sift-small-500k + uv run benchmarks/vector-search-bench/scripts/plot-turboquant-distortion.py \\ + --dataset sift-small-500k --dataset glove-small-100k --samples 8192 + uv run benchmarks/vector-search-bench/scripts/plot-turboquant-distortion.py \\ + --dataset cohere-small-100k:single --bits 1 2 3 4 5 6 7 8 \\ + --output /tmp/distortion.png +""" + +import argparse +import math +import re +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path + +import matplotlib.pyplot as plt +from matplotlib.lines import Line2D +from matplotlib.ticker import MaxNLocator + +REPO_ROOT = Path(__file__).resolve().parents[3] +DEFAULT_BINARY = REPO_ROOT / "target" / "release" / "vector-search-bench" + +METRIC_NAMES = [ + "reconstruction MSE mean", + "reconstruction MSE median", + "reconstruction MSE max", + "decoded cosine err mean", + "decoded cosine err median", + "decoded cosine err max", +] + + +@dataclass(frozen=True) +class DatasetTarget: + """One dataset to sweep, with the layout the bench should use for it.""" + + name: str + layout: str | None # `None` means let the bench auto-pick. + + +@dataclass +class Run: + target: DatasetTarget + dim: int + bits: int + values: dict[str, float] + + @property + def dataset(self) -> str: + return self.target.name + + +DIM_RE = re.compile(r"dim=(\d+)") + + +def parse_dataset_arg(spec: str, default_layout: str | None) -> DatasetTarget: + """Split a `name[:layout]` CLI value. `default_layout` fills in when no `:` is given.""" + if ":" in spec: + name, layout = spec.split(":", 1) + return DatasetTarget(name=name, layout=layout or None) + return DatasetTarget(name=spec, layout=default_layout) + + +def parse_dim(stdout: str) -> int: + """Pull `dim=N` out of the `## ...` header line.""" + match = DIM_RE.search(stdout) + if not match: + raise RuntimeError(f"could not find dim=N in header:\n{stdout}") + return int(match.group(1)) + + +def parse_table(stdout: str) -> dict[str, float]: + """Pull `metric -> value` rows out of the tabled stdout.""" + row_re = re.compile(r"│\s*(.+?)\s*│\s*([^│]+?)\s*│") + values: dict[str, float] = {} + for line in stdout.splitlines(): + match = row_re.match(line) + if not match: + continue + metric, value = match.group(1).strip(), match.group(2).strip() + if metric in METRIC_NAMES: + values[metric] = float(value) + missing = [m for m in METRIC_NAMES if m not in values] + if missing: + raise RuntimeError(f"could not parse metrics {missing} from:\n{stdout}") + return values + + +def run_one( + binary: Path, + target: DatasetTarget, + bits: int, + samples: int, + seed: int, + rounds: int, +) -> Run: + cmd = [ + str(binary), + "distortion", + "--dataset", + target.name, + "--bits", + str(bits), + "--samples", + str(samples), + "--seed", + str(seed), + "--rounds", + str(rounds), + ] + if target.layout: + cmd.extend(["--layout", target.layout]) + layout_tag = f" layout={target.layout}" if target.layout else "" + print(f" running {target.name}{layout_tag} @ bits={bits} ...", file=sys.stderr) + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + return Run( + target=target, + dim=parse_dim(result.stdout), + bits=bits, + values=parse_table(result.stdout), + ) + + +def mse_bound_stage1(bits: int) -> float: + """Paper's normalized-MSE upper bound for TurboQuant_mse (Stage 1). + + From the Stage 1 theorem (`main.tex`, line 272): for a unit-norm vector `x` quantized + to `b` bits per coordinate, `E[||x - x'||^2] <= (sqrt(3)*pi/2) / 4^b`. Vortex ships + this stage, so the bound applies to the `reconstruction MSE mean` curve directly. + """ + return (math.sqrt(3.0) * math.pi / 2.0) / (4.0**bits) + + +def mse_bound_stage2(bits: int) -> float: + """Paper's normalized-MSE upper bound for TurboQuant_prod (Stage 2 = MSE + QJL). + + Stage 2 spends 1 bit per coordinate on the QJL residual correction, so the residual + MSE is the Stage-1 MSE evaluated at `b-1` bits, i.e. 4x looser than Stage 1. + """ + return (math.sqrt(3.0) * math.pi / 2.0) / (4.0 ** (bits - 1)) + + +def compression_ratio(bits: int, dim: int) -> float: + """Theoretical TurboQuant compression ratio vs f32 storage. + + Per the `vortex_tensor::encodings::turboquant` module docs, each vector is stored + as `padded_dim * bits / 8` bytes of quantized codes plus one f32 stored norm + (4 bytes), where `padded_dim` is the next power of two at least `dim`. The ratio is + nonlinear in `bits` because of POT padding and the per-vector norm overhead. + """ + padded_dim = 1 << (dim - 1).bit_length() if dim > 1 else 1 + per_vector_bytes = padded_dim * bits / 8.0 + 4.0 + original_bytes = dim * 4.0 + return original_bytes / per_vector_bytes + + +def cosine_bound(bits: int, dim: int) -> float: + """Paper's Stage-2 inner-product bound, rendered as an absolute-error envelope. + + From the Stage 2 theorem (`main.tex`, line 288): for unit y and an `x` quantized via + TurboQuant_prod (Stage 2, MSE + QJL residual), `E[| - |^2] <= + sqrt(3)*pi^2/d * 4^(-b)`. Taking sqrt gives an upper envelope on the RMS error per + bit width, and by Jensen also on the mean abs error. + + Caveat: Vortex currently implements only Stage 1 (no QJL residual correction). The + Stage 1 inner-product error is biased and can sit *above* this Stage-2 envelope. + """ + return math.pi * (3.0**0.25) / math.sqrt(dim) / (2.0**bits) + + +DATASET_PALETTE = [ + "#1f77b4", # blue + "#d62728", # red + "#2ca02c", # green + "#9467bd", # purple + "#ff7f0e", # orange + "#17becf", # teal + "#e377c2", # pink + "#8c564b", # brown + "#7f7f7f", # grey + "#bcbd22", # olive +] + +STAT_STYLES = [ + # (metric_suffix, label, linestyle, linewidth, marker) + ("mean", "mean", "-", 2.4, "o"), + ("max", "max", ":", 1.4, None), +] + + +def plot(runs: list[Run], args: argparse.Namespace) -> None: + by_dataset: dict[str, list[Run]] = {} + for r in runs: + by_dataset.setdefault(r.dataset, []).append(r) + for ds_runs in by_dataset.values(): + ds_runs.sort(key=lambda r: r.bits) + + plt.rcParams.update( + { + "font.size": 11, + "axes.titlesize": 13, + "axes.titleweight": "semibold", + "axes.labelsize": 11, + "axes.spines.top": False, + "axes.spines.right": False, + "axes.grid": True, + "grid.alpha": 0.25, + "grid.linewidth": 0.6, + "legend.frameon": False, + } + ) + + fig, axes = plt.subplots(1, 3, figsize=(20, 6.5), constrained_layout=True) + fig.suptitle( + f"TurboQuant distortion vs bits per coordinate" + f" (samples={args.samples:,}, seed={args.seed}, rounds={args.rounds})", + fontsize=14, + fontweight="semibold", + ) + + dataset_colors = {ds: DATASET_PALETTE[i % len(DATASET_PALETTE)] for i, ds in enumerate(by_dataset)} + dataset_dims = {ds: ds_runs[0].dim for ds, ds_runs in by_dataset.items()} + + plot_panel( + axes[0], + by_dataset, + dataset_colors, + metric_prefix="reconstruction MSE", + title="Reconstruction MSE (per vector, normalized)", + ylabel=r"$\|x - x^\prime\|^2 / \|x\|^2$", + ) + bits_axis = sorted({r.bits for r in runs}) + axes[0].plot( + bits_axis, + [mse_bound_stage1(b) for b in bits_axis], + color="#222222", + linestyle=(0, (4, 2, 1, 2)), + linewidth=1.6, + zorder=0, + ) + + plot_panel( + axes[1], + by_dataset, + dataset_colors, + metric_prefix="decoded cosine err", + title=r"Pairwise cosine error $|\cos(x_i, x_j) - \cos(x_i^\prime, x_j^\prime)|$", + ylabel="absolute error", + ) + for dataset, ds_runs in by_dataset.items(): + color = dataset_colors[dataset] + d = ds_runs[0].dim + bits = sorted({r.bits for r in ds_runs}) + axes[1].plot( + bits, + [cosine_bound(b, d) for b in bits], + color=color, + linestyle=(0, (4, 2, 1, 2)), + linewidth=1.2, + alpha=0.6, + zorder=0, + ) + + plot_compression_panel(axes[2], by_dataset, dataset_colors) + + add_legends(fig, axes, dataset_colors, dataset_dims) + fig.text( + 0.5, + -0.015, + "Cosine bound is the paper's Stage-2 (TurboQuant_prod, MSE + QJL residual) " + "envelope; Vortex currently ships Stage 1 only, so empirical curves may sit " + "above it. Compression ratio is theoretical " + "(padded_dim * bits / 8 + 4 bytes per vector), excludes per-shard centroid " + "tables and file metadata.", + ha="center", + fontsize=9, + color="#555555", + wrap=True, + ) + + if args.output: + fig.savefig(args.output, dpi=140, bbox_inches="tight") + print(f"saved {args.output}", file=sys.stderr) + else: + plt.show() + + +def plot_panel( + ax, + by_dataset: dict[str, list[Run]], + dataset_colors: dict[str, str], + metric_prefix: str, + title: str, + ylabel: str, +) -> None: + for dataset, ds_runs in by_dataset.items(): + color = dataset_colors[dataset] + bits = [r.bits for r in ds_runs] + for stat_key, _label, linestyle, linewidth, marker in STAT_STYLES: + metric = f"{metric_prefix} {stat_key}" + ys = [r.values[metric] for r in ds_runs] + ax.plot( + bits, + ys, + color=color, + linestyle=linestyle, + linewidth=linewidth, + marker=marker, + markersize=6, + markerfacecolor=color, + markeredgecolor="white", + markeredgewidth=0.8, + alpha=0.95 if marker else 0.75, + ) + ax.set_yscale("log") + ax.set_xlabel("bits per coordinate") + ax.set_ylabel(ylabel) + ax.set_title(title) + ax.xaxis.set_major_locator(MaxNLocator(integer=True)) + ax.grid(True, which="major", linewidth=0.7, alpha=0.45) + ax.grid(True, which="minor", linewidth=0.4, alpha=0.22) + ax.minorticks_on() + + +def plot_compression_panel( + ax, + by_dataset: dict[str, list[Run]], + dataset_colors: dict[str, str], +) -> None: + bits_axis = sorted({r.bits for runs in by_dataset.values() for r in runs}) + for dataset, ds_runs in by_dataset.items(): + color = dataset_colors[dataset] + d = ds_runs[0].dim + padded = 1 << (d - 1).bit_length() if d > 1 else 1 + suffix = f" (padded {padded})" if padded != d else " (no padding)" + ax.plot( + bits_axis, + [compression_ratio(b, d) for b in bits_axis], + color=color, + linestyle="-", + linewidth=2.4, + marker="o", + markersize=6, + markerfacecolor=color, + markeredgecolor="white", + markeredgewidth=0.8, + label=f"{dataset}{suffix}", + ) + ax.set_xlabel("bits per coordinate") + ax.set_ylabel(r"ratio vs f32 (= $4d \,/\, (\mathrm{padded}\!\cdot\! b/8 + 4)$)") + ax.set_title("Compression ratio (theoretical)") + ax.xaxis.set_major_locator(MaxNLocator(integer=True)) + ax.grid(True, which="major", linewidth=0.7, alpha=0.45) + ax.grid(True, which="minor", linewidth=0.4, alpha=0.22) + ax.minorticks_on() + ax.legend( + title="dataset", + loc="upper right", + fontsize=9, + title_fontsize=10, + ) + + +def add_legends( + fig, + axes, + dataset_colors: dict[str, str], + dataset_dims: dict[str, int], +) -> None: + dataset_handles = [ + Line2D( + [], + [], + color=color, + linewidth=2.4, + marker="o", + markersize=6, + markerfacecolor=color, + markeredgecolor="white", + markeredgewidth=0.8, + label=f"{dataset} (d = {dataset_dims[dataset]})", + ) + for dataset, color in dataset_colors.items() + ] + stat_handles = [ + Line2D( + [], + [], + color="#333333", + linestyle=linestyle, + linewidth=linewidth, + marker=marker, + markersize=6 if marker else 0, + markerfacecolor="#333333", + markeredgecolor="white", + markeredgewidth=0.8, + label=label, + ) + for _, label, linestyle, linewidth, marker in STAT_STYLES + ] + mse_bound_handle_s1 = Line2D( + [], + [], + color="#222222", + linestyle=(0, (4, 2, 1, 2)), + linewidth=1.6, + label=r"paper bound: $D_{\mathrm{mse}} \leq \frac{\sqrt{3}\,\pi}{2}\, 4^{-b}$", + ) + cosine_bound_handle = Line2D( + [], + [], + color="#444444", + linestyle=(0, (4, 2, 1, 2)), + linewidth=1.2, + alpha=0.6, + label=( + r"paper Stage-2 bound: " + r"$\sqrt{D_{\mathrm{prod}}} \leq \frac{\pi\,3^{1/4}}{\sqrt{d}}\, 2^{-b}$" + ), + ) + + axes[0].legend( + handles=dataset_handles + [mse_bound_handle_s1], + title="dataset / bound", + loc="upper right", + fontsize=10, + title_fontsize=10, + ) + axes[1].legend( + handles=stat_handles + [cosine_bound_handle], + title="statistic / bound", + loc="upper right", + fontsize=10, + title_fontsize=10, + ) + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument( + "--dataset", + action="append", + required=True, + help=( + "Dataset to sweep (repeat to compare multiple). Optionally suffix " + "`:layout` to pin a specific train layout for that dataset, e.g. " + "`--dataset cohere-small-100k:single`. If omitted, the bench picks " + "the dataset's only layout, or errors if there are several." + ), + ) + parser.add_argument( + "--layout", + default=None, + help=("Default train layout applied to any `--dataset` entry that doesn't pin its own with `:layout`."), + ) + parser.add_argument("--samples", type=int, default=65536) + parser.add_argument( + "--bits", + type=int, + nargs="+", + default=[1, 2, 3, 4, 5, 6, 7, 8], + help="Bit widths to sweep (default: 1..=8).", + ) + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--rounds", type=int, default=3) + parser.add_argument( + "--binary", + type=Path, + default=DEFAULT_BINARY, + help=f"Path to vector-search-bench (default: {DEFAULT_BINARY}).", + ) + parser.add_argument( + "--output", + type=Path, + default=None, + help="If set, save the chart to this path instead of opening a window.", + ) + args = parser.parse_args() + + print("building vector-search-bench (release) ...", file=sys.stderr) + subprocess.run( + ["cargo", "build", "-p", "vector-search-bench", "--release"], + cwd=REPO_ROOT, + check=True, + ) + + if not args.binary.exists(): + sys.exit(f"binary not found at {args.binary} after build") + + targets = [parse_dataset_arg(spec, args.layout) for spec in args.dataset] + + runs: list[Run] = [] + for target in targets: + layout_tag = f" (layout={target.layout})" if target.layout else "" + print( + f"sweeping {target.name}{layout_tag} over bits {args.bits} ...", + file=sys.stderr, + ) + for bits in args.bits: + runs.append( + run_one( + args.binary, + target, + bits, + args.samples, + args.seed, + args.rounds, + ) + ) + + print_summary(runs) + plot(runs, args) + + +def print_summary(runs: list[Run]) -> None: + print() + print("Summary (one row per (dataset, bits)):") + header = ["dataset", "dim", "bits"] + METRIC_NAMES + widths = [max(len(h), 14) for h in header] + print(" " + " ".join(h.ljust(w) for h, w in zip(header, widths))) + for r in runs: + cells = [r.dataset, str(r.dim), str(r.bits)] + [f"{r.values[m]:.3e}" for m in METRIC_NAMES] + print(" " + " ".join(c.ljust(w) for c, w in zip(cells, widths))) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/vector-search-bench/src/distortion.rs b/benchmarks/vector-search-bench/src/distortion.rs new file mode 100644 index 00000000000..b1e7717389f --- /dev/null +++ b/benchmarks/vector-search-bench/src/distortion.rs @@ -0,0 +1,310 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! TurboQuant distortion measurement on real vector datasets. +//! +//! Reports per-vector normalized reconstruction error (`||x - x'||^2 / ||x||^2`) and pairwise +//! cosine-similarity error (`|cos(x_i, x_j) - cos(x'_i, x'_j)|`) after a full encode and decode +//! roundtrip through the [`vortex_tensor::encodings::turboquant`] scheme. This is the same +//! TurboQuant implementation the search subcommand stores on disk via +//! [`BtrBlocksCompressorBuilder::with_turboquant`](vortex_btrblocks::BtrBlocksCompressorBuilder). + +use std::io::Write; + +use anyhow::Context; +use anyhow::Result; +use anyhow::bail; +use tabled::settings::Style; +use vortex::array::ArrayRef; +use vortex::array::ExecutionCtx; +use vortex::array::IntoArray; +use vortex::array::VortexSessionExecute; +use vortex::array::arrays::ExtensionArray; +use vortex::array::arrays::FixedSizeListArray; +use vortex::array::arrays::PrimitiveArray; +use vortex::array::arrays::ScalarFnArray; +use vortex::array::arrays::Struct; +use vortex::array::arrays::StructArray; +use vortex::array::arrays::extension::ExtensionArrayExt; +use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; +use vortex::array::arrays::struct_::StructArrayExt; +use vortex_bench::conversions::parquet_to_vortex_chunks; +use vortex_bench::vector_dataset; +use vortex_bench::vector_dataset::TrainLayout; +use vortex_bench::vector_dataset::VectorDataset; +use vortex_tensor::encodings::turboquant::TurboQuantConfig; +use vortex_tensor::encodings::turboquant::turboquant_encode; +use vortex_tensor::scalar_fns::cosine_similarity::CosineSimilarity; + +use crate::SESSION; +use crate::ingest::transform_chunk; + +/// Inputs to a distortion run. +#[derive(Debug, Clone)] +pub struct DistortionConfig { + /// Dataset to load vectors from. + pub dataset: VectorDataset, + /// Train-split layout (used to locate the local parquet shards). + pub layout: TrainLayout, + /// Bits per quantized coordinate. + pub bits: u8, + /// Seed for the SORF rotation. + pub seed: u64, + /// Number of sign-diagonal plus Walsh-Hadamard rounds in the SORF transform. + pub rounds: u8, + /// Number of base vectors to sample from the first train shard. + pub samples: usize, +} + +/// Mean, median, and max of a sample of distortion measurements. +#[derive(Debug, Clone)] +pub struct DistortionStats { + /// Arithmetic mean. + pub mean: f32, + /// Median (mid element after a partial sort). + pub median: f32, + /// Maximum. + pub max: f32, +} + +/// Per-dataset distortion report ready to render as markdown. +#[derive(Debug, Clone)] +pub struct DistortionReport { + /// Dataset the vectors came from. + pub dataset: VectorDataset, + /// Train-split layout used to locate the shard. + pub layout: TrainLayout, + /// Vector dimensionality. + pub dim: u32, + /// Bits per quantized coordinate. + pub bits: u8, + /// Seed for the SORF rotation. + pub seed: u64, + /// Number of SORF rounds. + pub rounds: u8, + /// Number of base vectors sampled. + pub samples: usize, + /// Per-vector normalized squared L2 reconstruction error. + pub reconstruction: DistortionStats, + /// Pairwise cosine-similarity error after decoding both sides. + pub decoded_cosine: DistortionStats, +} + +/// Compute reconstruction error and cosine-similarity error for a TurboQuant roundtrip. +pub async fn run_distortion(config: &DistortionConfig) -> Result { + let dataset = config.dataset; + let layout = config.layout; + + let paths = vector_dataset::download(dataset, layout) + .await + .with_context(|| format!("download {}", dataset.name()))?; + let train_path = paths + .train_files + .first() + .with_context(|| format!("dataset {} has no train shards", dataset.name()))? + .clone(); + + let mut ctx = SESSION.create_execution_ctx(); + + let chunked = parquet_to_vortex_chunks(train_path).await?; + let struct_array: StructArray = chunked.into_array().execute(&mut ctx)?; + let transformed = transform_chunk(struct_array.into_array(), &mut ctx)?; + let emb_full = transformed + .as_opt::() + .with_context(|| { + format!( + "transform_chunk did not return a Struct, got {}", + transformed.dtype() + ) + })? + .unmasked_field_by_name("emb") + .context("transformed chunk missing `emb` field")? + .clone(); + + let n = config.samples.min(emb_full.len()); + if n < 2 { + bail!( + "distortion: need at least 2 sampled vectors for cosine pairs, got {n} (dataset {})", + dataset.name(), + ); + } + let emb = emb_full.slice(0..n)?; + + let original = extract_flat_f32(&emb, &mut ctx)?; + let dim = pairs_per_row(&original, n)?; + + let tq_config = TurboQuantConfig { + bit_width: config.bits, + seed: config.seed, + num_rounds: config.rounds, + }; + let encoded = turboquant_encode(emb.clone(), &tq_config, &mut ctx)?; + let decoded_ext: ExtensionArray = encoded.execute(&mut ctx)?; + let decoded = decoded_ext.into_array(); + let decoded_flat = extract_flat_f32(&decoded, &mut ctx)?; + + let reconstruction = stats(&reconstruction_errors(&original, &decoded_flat, dim, n)); + + let half = n / 2; + let true_cosines = compute_cosines(emb.slice(0..half)?, emb.slice(half..2 * half)?, &mut ctx)?; + let decoded_cosines = compute_cosines( + decoded.slice(0..half)?, + decoded.slice(half..2 * half)?, + &mut ctx, + )?; + let decoded_cosine = stats(&abs_diff(&true_cosines, &decoded_cosines)); + + Ok(DistortionReport { + dataset, + layout, + dim: u32::try_from(dim).context("dim must fit in u32")?, + bits: config.bits, + seed: config.seed, + rounds: config.rounds, + samples: n, + reconstruction, + decoded_cosine, + }) +} + +/// Extract a flat `f32` slice from a `Vector` extension array. +fn extract_flat_f32(array: &ArrayRef, ctx: &mut ExecutionCtx) -> Result> { + let ext: ExtensionArray = array.clone().execute(ctx)?; + let fsl: FixedSizeListArray = ext.storage_array().clone().execute(ctx)?; + let elements: PrimitiveArray = fsl.elements().clone().execute(ctx)?; + Ok(elements.as_slice::().to_vec()) +} + +/// Compute one cosine per row over two equal-length tensor-like arrays. +fn compute_cosines(lhs: ArrayRef, rhs: ArrayRef, ctx: &mut ExecutionCtx) -> Result> { + let len = lhs.len(); + let sfn: ScalarFnArray = CosineSimilarity::try_new_array(lhs, rhs, len)?; + let prim: PrimitiveArray = sfn.into_array().execute(ctx)?; + Ok(prim.as_slice::().to_vec()) +} + +fn pairs_per_row(flat: &[f32], num_rows: usize) -> Result { + if num_rows == 0 { + bail!("distortion: cannot derive dim from zero rows"); + } + if !flat.len().is_multiple_of(num_rows) { + bail!( + "distortion: flat element count {} not divisible by row count {num_rows}", + flat.len(), + ); + } + Ok(flat.len() / num_rows) +} + +/// Per-vector normalized reconstruction MSE. Rows whose original squared norm is below `1e-10` +/// are dropped because their normalized error is numerically undefined. +fn reconstruction_errors( + original: &[f32], + reconstructed: &[f32], + dim: usize, + num_rows: usize, +) -> Vec { + let mut out = Vec::with_capacity(num_rows); + for row in 0..num_rows { + let start = row * dim; + let end = start + dim; + let orig = &original[start..end]; + let recon = &reconstructed[start..end]; + let norm_sq: f32 = orig.iter().map(|&v| v * v).sum(); + if norm_sq < 1e-10 { + continue; + } + let err_sq: f32 = orig + .iter() + .zip(recon.iter()) + .map(|(&a, &b)| (a - b) * (a - b)) + .sum(); + out.push(err_sq / norm_sq); + } + out +} + +fn abs_diff(lhs: &[f32], rhs: &[f32]) -> Vec { + lhs.iter() + .zip(rhs.iter()) + .map(|(&a, &b)| (a - b).abs()) + .collect() +} + +fn stats(samples: &[f32]) -> DistortionStats { + if samples.is_empty() { + return DistortionStats { + mean: f32::NAN, + median: f32::NAN, + max: f32::NAN, + }; + } + + let sum: f64 = samples.iter().map(|&v| f64::from(v)).sum(); + #[expect( + clippy::cast_possible_truncation, + reason = "casting an f64 mean back to f32 is intentional and matches the input precision" + )] + let mean = (sum / samples.len() as f64) as f32; + + let mut sorted = samples.to_vec(); + sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); + let mid = sorted.len() / 2; + let median = if sorted.len() % 2 == 1 { + sorted[mid] + } else { + 0.5 * (sorted[mid - 1] + sorted[mid]) + }; + + let max = samples.iter().copied().fold(f32::NEG_INFINITY, f32::max); + + DistortionStats { mean, median, max } +} + +impl DistortionReport { + /// Render the report as a markdown header line followed by a tabled table. + pub fn render(&self, writer: &mut dyn Write) -> Result<()> { + writeln!( + writer, + "## {} | dim={} | layout={} | bits={} | samples={} | seed={} | rounds={}", + self.dataset.name(), + self.dim, + self.layout.label(), + self.bits, + self.samples, + self.seed, + self.rounds, + )?; + + let rows: &[(&str, f32)] = &[ + ("reconstruction MSE mean", self.reconstruction.mean), + ("reconstruction MSE median", self.reconstruction.median), + ("reconstruction MSE max", self.reconstruction.max), + ("decoded cosine err mean", self.decoded_cosine.mean), + ("decoded cosine err median", self.decoded_cosine.median), + ("decoded cosine err max", self.decoded_cosine.max), + ]; + + let mut builder = tabled::builder::Builder::new(); + builder.push_record(["metric", "value"]); + for &(metric, value) in rows { + builder.push_record([metric.to_owned(), format_metric(value)]); + } + let mut table = builder.build(); + table.with(Style::modern()); + writeln!(writer, "{table}")?; + Ok(()) + } +} + +fn format_metric(value: f32) -> String { + if value.is_nan() { + "nan".to_owned() + } else if value == 0.0 { + "0".to_owned() + } else if value.abs() < 1e-3 || value.abs() >= 1e4 { + format!("{value:.3e}") + } else { + format!("{value:.6}") + } +} diff --git a/benchmarks/vector-search-bench/src/lib.rs b/benchmarks/vector-search-bench/src/lib.rs index 643cbb5bd0d..76b24390d09 100644 --- a/benchmarks/vector-search-bench/src/lib.rs +++ b/benchmarks/vector-search-bench/src/lib.rs @@ -5,6 +5,7 @@ pub mod compression; pub mod display; +pub mod distortion; pub mod expression; pub mod ingest; pub mod prepare; @@ -13,9 +14,12 @@ pub mod scan; use std::sync::LazyLock; +use anyhow::Result; use vortex::VortexSessionDefault; use vortex::io::session::RuntimeSessionExt; use vortex::session::VortexSession; +use vortex_bench::vector_dataset::TrainLayout; +use vortex_bench::vector_dataset::VectorDataset; pub static SESSION: LazyLock = LazyLock::new(|| { // SAFETY: called from inside the LazyLock initializer, before any other access to @@ -26,3 +30,38 @@ pub static SESSION: LazyLock = LazyLock::new(|| { vortex_tensor::initialize(&session); session }); + +/// Resolve a dataset's [`TrainLayout`]. +/// +/// Every benchmark has different sets of possible dataset layouts available. The user **must** +/// provide one if there are multiple layouts. But if a dataset only has 1 layout, we can choose +/// that for them as the default. +pub fn resolve_layout( + dataset: VectorDataset, + requested: Option, +) -> Result { + let layouts = dataset.layouts(); + + match requested { + Some(layout) => { + dataset.validate_layout(layout)?; + Ok(layout) + } + None => { + if layouts.len() == 1 { + Ok(layouts[0].layout()) + } else { + let allowed = layouts + .iter() + .map(|s| s.layout().label()) + .collect::>() + .join(", "); + anyhow::bail!( + "dataset {} hosts multiple layouts ([{}]): pass --layout to pick one", + dataset.name(), + allowed, + ); + } + } + } +} diff --git a/benchmarks/vector-search-bench/src/main.rs b/benchmarks/vector-search-bench/src/main.rs index 440de142bef..307c050d221 100644 --- a/benchmarks/vector-search-bench/src/main.rs +++ b/benchmarks/vector-search-bench/src/main.rs @@ -1,15 +1,20 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! `vector-search-bench` — on-disk cosine-similarity scan benchmark. +//! `vector-search-bench` benchmarks for cosine-similarity scan and TurboQuant distortion. //! //! ```sh -//! cargo run -p vector-search-bench --release -- \ +//! cargo run -p vector-search-bench --release -- search \ //! --dataset cohere-large-10m \ //! --layout partitioned \ //! --flavors vortex-uncompressed,vortex-turboquant \ //! --iterations 3 \ //! --threshold 0.8 +//! +//! cargo run -p vector-search-bench --release -- distortion \ +//! --dataset sift-small-500k \ +//! --bits 4 \ +//! --samples 4096 //! ``` use std::path::PathBuf; @@ -17,13 +22,17 @@ use std::path::PathBuf; use anyhow::Context; use anyhow::Result; use clap::Parser; +use clap::Subcommand; use vector_search_bench::compression::ALL_VECTOR_FLAVORS; use vector_search_bench::compression::VectorFlavor; use vector_search_bench::display::DatasetReport; use vector_search_bench::display::render; +use vector_search_bench::distortion::DistortionConfig; +use vector_search_bench::distortion::run_distortion; use vector_search_bench::prepare::CompressedVortexDataset; use vector_search_bench::prepare::prepare_all; use vector_search_bench::query::get_random_query_vector; +use vector_search_bench::resolve_layout; use vector_search_bench::scan::ScanConfig; use vector_search_bench::scan::ScanTiming; use vector_search_bench::scan::run_search_scan; @@ -35,7 +44,21 @@ use vortex_bench::vector_dataset::VectorDataset; #[derive(Parser, Debug)] #[command(version, about, long_about = None)] -struct Args { +struct Cli { + #[command(subcommand)] + command: Command, +} + +#[derive(Subcommand, Debug)] +enum Command { + /// On-disk cosine-similarity scan latency benchmark. + Search(SearchArgs), + /// TurboQuant distortion measurement: reconstruction error and cosine error. + Distortion(DistortionArgs), +} + +#[derive(Parser, Debug)] +struct SearchArgs { /// Dataset to benchmark. Single dataset per CLI invocation by design — large datasets /// are intentionally babysat one at a time. #[arg(long, value_enum)] @@ -86,9 +109,55 @@ struct Args { tracing: bool, } +#[derive(Parser, Debug)] +struct DistortionArgs { + /// Dataset to load vectors from. One dataset per invocation. + #[arg(long, value_enum)] + dataset: VectorDataset, + + /// Train-split layout. Required when the dataset publishes more than one layout. + #[arg(long, value_enum)] + layout: Option, + + /// Bits per quantized coordinate. + #[arg(long, default_value_t = 4)] + bits: u8, + + /// Seed for the SORF rotation. + #[arg(long, default_value_t = 42)] + seed: u64, + + /// Number of sign-diagonal plus Walsh-Hadamard rounds in the SORF transform. + #[arg(long, default_value_t = 3)] + rounds: u8, + + /// Number of base vectors to sample from the first train shard (first N rows). + #[arg(long, default_value_t = 65536)] + samples: usize, + + /// Optional path to write the rendered table to instead of stdout. + #[arg(long)] + output_path: Option, + + /// Emit verbose tracing. + #[arg(short, long)] + verbose: bool, + + /// Enable perfetto tracing output. + #[arg(long)] + tracing: bool, +} + #[tokio::main] async fn main() -> Result<()> { - let args = Args::parse(); + let cli = Cli::parse(); + match cli.command { + Command::Search(args) => run_search(args).await, + Command::Distortion(args) => run_distortion_cmd(args).await, + } +} + +async fn run_search(args: SearchArgs) -> Result<()> { setup_logging_and_tracing(args.verbose, args.tracing)?; let dataset = args.dataset; @@ -105,12 +174,10 @@ async fn main() -> Result<()> { anyhow::bail!("no flavors selected, please pass at least one to --flavors"); } - // Load the source embeddings parquet files. let datasets_paths = vector_dataset::download(dataset, layout) .await .with_context(|| format!("download {}", dataset.name()))?; - // Load all vortex files needed, compressing new ones if needed. let prepared = prepare_all(dataset, layout, &datasets_paths, &args.flavors).await?; let query_vector = get_random_query_vector( @@ -131,14 +198,12 @@ async fn main() -> Result<()> { threshold: args.threshold, }; - // Run all scans and record how long each takes. let mut scan_timings: Vec = Vec::with_capacity(prepared.len()); for prep in &prepared { let timing = run_search_scan(prep, &query_vector.query, &scan_config).await?; scan_timings.push(timing); } - // Collect the benchmark results. let pairs: Vec<(VectorFlavor, &CompressedVortexDataset, &ScanTiming)> = prepared .iter() .zip(scan_timings.iter()) @@ -149,8 +214,6 @@ async fn main() -> Result<()> { vortex_results: &pairs, }; - // Emit v3 JSONL if requested. The records carry the per-scan dimensions that - // ScanTiming itself does not (dataset, layout, threshold). if let Some(path) = args.gh_json_v3.as_ref() { let records: Vec = scan_timings .iter() @@ -179,7 +242,6 @@ async fn main() -> Result<()> { v3::write_jsonl_to_path(path, &records)?; } - // Print the results. if let Some(path) = args.output_path { let mut file = std::fs::File::create(&path).with_context(|| format!("create {}", path.display()))?; @@ -193,32 +255,30 @@ async fn main() -> Result<()> { Ok(()) } -/// Every benchmark has different sets of possible dataset layouts available. The user **must** -/// provide one if there are multiple layouts. But if a dataset only has 1 layout, we can choose -/// that for them as the default. -fn resolve_layout(dataset: VectorDataset, requested: Option) -> Result { - let layouts = dataset.layouts(); - - match requested { - Some(layout) => { - dataset.validate_layout(layout)?; - Ok(layout) - } - None => { - if layouts.len() == 1 { - Ok(layouts[0].layout()) - } else { - let allowed = layouts - .iter() - .map(|s| s.layout().label()) - .collect::>() - .join(", "); - anyhow::bail!( - "dataset {} hosts multiple layouts ([{}]): pass --layout to pick one", - dataset.name(), - allowed, - ); - } - } +async fn run_distortion_cmd(args: DistortionArgs) -> Result<()> { + setup_logging_and_tracing(args.verbose, args.tracing)?; + + let layout = resolve_layout(args.dataset, args.layout)?; + let config = DistortionConfig { + dataset: args.dataset, + layout, + bits: args.bits, + seed: args.seed, + rounds: args.rounds, + samples: args.samples, + }; + + let report = run_distortion(&config).await?; + + if let Some(path) = args.output_path { + let mut file = + std::fs::File::create(&path).with_context(|| format!("create {}", path.display()))?; + report.render(&mut file)?; + } else { + let stdout = std::io::stdout(); + let mut handle = stdout.lock(); + report.render(&mut handle)?; } + + Ok(()) }