From 9b0c314207be2dca303dc0a9a4c3a46d1bae5177 Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Thu, 11 Jun 2026 13:36:23 -0700 Subject: [PATCH 1/3] Fix micro benchmark build: add vendor and root include paths build.rs was missing -I../../vendor and -I../../, so sqlite3ext.h could not be found when cc compiled sqlite-vec.c. --- benchmarks/micro/build.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/micro/build.rs b/benchmarks/micro/build.rs index 003a6e3..1aab68b 100644 --- a/benchmarks/micro/build.rs +++ b/benchmarks/micro/build.rs @@ -1,5 +1,7 @@ fn main() { cc::Build::new() .file("../../sqlite-vec.c") + .include("../../vendor") + .include("../../") .compile("sqlite_vec0"); } From b1c8b216c108dedb10eb0c79665a11b19e4b7de4 Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Thu, 11 Jun 2026 14:05:24 -0700 Subject: [PATCH 2/3] Fix AVX/AVX2 SIMD dispatch: use target attributes and runtime CPU detection - Add __attribute__((target("avx,avx2"))) to l2_sqr_float_avx and __attribute__((target("avx2"))) to distance_hamming_avx2, confining AVX2 codegen to those two functions only. - Add __builtin_cpu_supports("avx2") runtime guards (cached in a static local) at both dispatch sites so pre-AVX2 CPUs take the scalar path instead of SIGILL. - Drop -mavx -mavx2 from the Makefile entirely: those flags were applied file-wide, allowing the compiler to emit AVX2 instructions in unrelated functions (vec_eachOpen, vec0Open, etc.). - Simplify the Linux SIMD block: the /proc/cpuinfo grep is no longer needed now that the runtime check handles AVX2 availability, so the define is enabled unconditionally for Linux x86_64 (matching the Darwin x86_64 behaviour). Fixes #302. 
--- Makefile | 8 +++----- sqlite-vec.c | 10 ++++++++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 175ab16..84b72f2 100644 --- a/Makefile +++ b/Makefile @@ -37,16 +37,14 @@ endif ifndef OMIT_SIMD ifeq ($(shell uname -sm),Darwin x86_64) - CFLAGS += -mavx -mavx2 -DSQLITE_VEC_ENABLE_AVX + CFLAGS += -DSQLITE_VEC_ENABLE_AVX endif ifeq ($(shell uname -sm),Darwin arm64) CFLAGS += -mcpu=apple-m1 -DSQLITE_VEC_ENABLE_NEON endif - ifeq ($(shell uname -s),Linux) + ifeq ($(shell uname -sm),Linux x86_64) ifeq ($(findstring android,$(CC)),) - ifneq ($(filter avx,$(shell grep -o 'avx[^ ]*' /proc/cpuinfo 2>/dev/null | head -1)),) - CFLAGS += -mavx -mavx2 -DSQLITE_VEC_ENABLE_AVX - endif + CFLAGS += -DSQLITE_VEC_ENABLE_AVX endif endif endif diff --git a/sqlite-vec.c b/sqlite-vec.c index 7af3b6a..fd1ad9a 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -86,6 +86,7 @@ enum VectorElementType { #define PORTABLE_ALIGN32 __attribute__((aligned(32))) #define PORTABLE_ALIGN64 __attribute__((aligned(64))) +__attribute__((target("avx,avx2"))) static f32 l2_sqr_float_avx(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { f32 *pVect1 = (f32 *)pVect1v; @@ -417,7 +418,9 @@ static f32 distance_l2_sqr_float(const void *a, const void *b, const void *d) { } #endif #ifdef SQLITE_VEC_ENABLE_AVX - if (((*(const size_t *)d) % 16 == 0)) { + static int has_avx2 = -1; + if (has_avx2 < 0) has_avx2 = __builtin_cpu_supports("avx2"); + if (has_avx2 && ((*(const size_t *)d) % 16 == 0)) { return l2_sqr_float_avx(a, b, d); } #endif @@ -713,6 +716,7 @@ static f32 distance_hamming_neon(const u8 *a, const u8 *b, size_t n_bytes) { * AVX2 Hamming distance using VPSHUFB-based popcount. * Processes 32 bytes (256 bits) per iteration. 
*/ +__attribute__((target("avx2"))) static f32 distance_hamming_avx2(const u8 *a, const u8 *b, size_t n_bytes) { const u8 *pEnd = a + n_bytes; @@ -815,7 +819,9 @@ static f32 distance_hamming(const void *a, const void *b, const void *d) { } #endif #ifdef SQLITE_VEC_ENABLE_AVX - if (n_bytes >= 32) { + static int has_avx2 = -1; + if (has_avx2 < 0) has_avx2 = __builtin_cpu_supports("avx2"); + if (has_avx2 && n_bytes >= 32) { return distance_hamming_avx2((const u8 *)a, (const u8 *)b, n_bytes); } #endif From 280eba01ff5ff92185f3fec25cb8cc9e224acaab Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Thu, 11 Jun 2026 14:05:38 -0700 Subject: [PATCH 3/3] Add simd_dispatch benchmark for AVX2 distance dispatch paths Measures two things: - distance/l2_float_d1536: one vec_distance_l2() call per iteration, direct proxy for the AVX2 l2_sqr_float_avx kernel with no KNN overhead. - knn/n5000_d1536: end-to-end KNN query over 5 000 x 1536-dim vectors, setup paid once outside b.iter, three page sizes. Designed to capture before/after numbers for SIMD dispatch changes. n=5000 keeps peak memory well under 100 MB (two DBs alive at once in the original my_benchmark caused OOM at n=1M). 
--- benchmarks/micro/Cargo.toml | 4 + benchmarks/micro/benches/simd_dispatch.rs | 102 ++++++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 benchmarks/micro/benches/simd_dispatch.rs diff --git a/benchmarks/micro/Cargo.toml b/benchmarks/micro/Cargo.toml index ffc1f09..40633f6 100644 --- a/benchmarks/micro/Cargo.toml +++ b/benchmarks/micro/Cargo.toml @@ -17,3 +17,7 @@ cc = "1.0.99" [[bench]] name = "my_benchmark" harness = false + +[[bench]] +name = "simd_dispatch" +harness = false diff --git a/benchmarks/micro/benches/simd_dispatch.rs b/benchmarks/micro/benches/simd_dispatch.rs new file mode 100644 index 0000000..64d67c2 --- /dev/null +++ b/benchmarks/micro/benches/simd_dispatch.rs @@ -0,0 +1,102 @@ +// Benchmarks for the SIMD distance-dispatch paths (issue #302). +// +// Two benches: +// distance/l2_float_d1536 — calls vec_distance_l2() directly via SQL scalar +// function; one distance computation per iteration, +// no KNN planner overhead. Tightest proxy for the +// AVX2 l2_sqr_float_avx kernel. +// knn/n5000_d1536 — end-to-end KNN query over 5 000 vectors at d=1536. +// Setup is paid once outside b.iter; each iteration +// is a single query that exercises the distance +// dispatch loop ~5 000 times. +// +// Run: +// cargo bench --bench simd_dispatch +// +// To capture a baseline before the SIMD dispatch fix: +// cargo bench --bench simd_dispatch 2>&1 | tee /tmp/bench-before-simd.txt +// After applying the fix, run again; Criterion will print a regression/improvement +// line for each bench. 
+ +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; +use micro::init_vec; +use rand::Rng; +use rusqlite::Connection; +use zerocopy::AsBytes; + +fn random_vector(n: usize) -> Vec<f32> { + let mut rng = rand::thread_rng(); + (0..n).map(|_| rng.gen()).collect() +} + +fn setup_knn_db(d: usize, n: usize) -> Connection { + let mut db = Connection::open_in_memory().unwrap(); + db.execute( + format!("create virtual table v using vec0(a float[{d}])").as_str(), + [], + ) + .unwrap(); + let tx = db.transaction().unwrap(); + for _ in 0..n { + let vec = random_vector(d); + tx.execute("insert into v(a) values (?)", [vec.as_bytes()]) + .unwrap(); + } + tx.commit().unwrap(); + db +} + +fn bench_distance_l2(c: &mut Criterion) { + init_vec(); + let db = Connection::open_in_memory().unwrap(); + let a = random_vector(1536); + let b = random_vector(1536); + let mut stmt = db.prepare("select vec_distance_l2(?, ?)").unwrap(); + + let mut group = c.benchmark_group("distance"); + group.bench_function("l2_float_d1536", |bench| { + bench.iter(|| { + let _: f64 = stmt + .query_row(rusqlite::params![a.as_bytes(), b.as_bytes()], |r| r.get(0)) + .unwrap(); + }); + }); + group.finish(); +} + +fn bench_knn(c: &mut Criterion) { + init_vec(); + let d = 1536; + let n = 5_000; + let k = 10; + + let mut group = c.benchmark_group("knn"); + for page_size in [4096usize, 8192, 16384] { + let db = setup_knn_db(d, n); + let query = random_vector(d); + let mut stmt = db + .prepare("select rowid, distance from v where a match ? 
order by distance limit ?") + .unwrap(); + + group.bench_with_input( + BenchmarkId::new(format!("n{n}_d{d}"), page_size), + &page_size, + |b, _| { + b.iter(|| { + let results: Vec<(i64, f64)> = stmt + .query_map(rusqlite::params![query.as_bytes(), k], |r| { + Ok((r.get(0).unwrap(), r.get(1).unwrap())) + }) + .unwrap() + .collect::<Result<Vec<_>, _>>() + .unwrap(); + assert_eq!(results.len(), k); + }); + }, + ); + } + group.finish(); +} + +criterion_group!(benches, bench_distance_l2, bench_knn); +criterion_main!(benches);