# Patch summary (preamble text; everything from the first "diff --git" header on is the original patch, unchanged).
#
# Purpose: stop compiling the whole translation unit with -mavx/-mavx2 and instead
# enable AVX2 per function, selecting the AVX2 kernels at run time.
#
# Makefile
#   * Darwin x86_64: CFLAGS no longer gets -mavx -mavx2; only -DSQLITE_VEC_ENABLE_AVX remains.
#   * Linux: the platform test changes from `uname -s` == Linux to `uname -sm` == "Linux x86_64".
#     The /proc/cpuinfo grep for avx is removed; -DSQLITE_VEC_ENABLE_AVX is now added
#     whenever $(CC) does not contain "android".
# benchmarks/micro/Cargo.toml
#   * Registers a second [[bench]] target, simd_dispatch, with harness = false.
# benchmarks/micro/benches/simd_dispatch.rs (new file)
#   * bench_distance_l2: prepares `select vec_distance_l2(?, ?)` once and runs it per
#     iteration on two random 1536-element vectors.
#   * bench_knn: builds an in-memory vec0 table of 5 000 random 1536-element vectors
#     outside b.iter, then runs one `match ... order by distance limit ?` query per
#     iteration and asserts that k rows come back.
#   * 1536 is a multiple of 16, so the `% 16 == 0` guard in distance_l2_sqr_float holds
#     for these vectors.
# benchmarks/micro/build.rs
#   * Adds ../../vendor and ../../ as include directories for the cc build of sqlite-vec.c.
# sqlite-vec.c
#   * l2_sqr_float_avx gains __attribute__((target("avx,avx2"))) and
#     distance_hamming_avx2 gains __attribute__((target("avx2"))).
#   * distance_l2_sqr_float and distance_hamming each add a function-local
#     `static int has_avx2 = -1`, filled on first use from __builtin_cpu_supports("avx2"),
#     and only call the AVX2 kernel when it is non-zero.
#
# NOTE(review): this copy of the patch appears to have lost its original line breaks and
#   indentation, so the hunk line counts in the @@ headers cannot be checked here; the
#   Makefile hunk declares 16/14 lines but only 15/13 are distinguishable. Re-export from
#   the commit before applying.
# NOTE(review): in simd_dispatch.rs, `-> Vec {` and `.collect::, _>>()` look like generic
#   arguments stripped in transit (presumably `Vec<f32>` and
#   `.collect::<Result<Vec<_>, _>>()`); confirm against the original commit.
# NOTE(review): bench_knn loops over page_size in [4096, 8192, 16384], but the value is used
#   only in the BenchmarkId and as an input the closure ignores (`|b, _|`); setup_knn_db(d, n)
#   does not receive it. The three variants therefore look like the same configuration;
#   confirm whether a page-size setting was meant to be applied.
# NOTE(review): has_avx2 is read and written with no synchronisation visible in these hunks.
#   Every writer would store the same value, but confirm this is acceptable if the distance
#   functions can be entered concurrently.
diff --git a/Makefile b/Makefile index 175ab16..84b72f2 100644 --- a/Makefile +++ b/Makefile @@ -37,16 +37,14 @@ endif ifndef OMIT_SIMD ifeq ($(shell uname -sm),Darwin x86_64) - CFLAGS += -mavx -mavx2 -DSQLITE_VEC_ENABLE_AVX + CFLAGS += -DSQLITE_VEC_ENABLE_AVX endif ifeq ($(shell uname -sm),Darwin arm64) CFLAGS += -mcpu=apple-m1 -DSQLITE_VEC_ENABLE_NEON endif - ifeq ($(shell uname -s),Linux) + ifeq ($(shell uname -sm),Linux x86_64) ifeq ($(findstring android,$(CC)),) - ifneq ($(filter avx,$(shell grep -o 'avx[^ ]*' /proc/cpuinfo 2>/dev/null | head -1)),) - CFLAGS += -mavx -mavx2 -DSQLITE_VEC_ENABLE_AVX - endif + CFLAGS += -DSQLITE_VEC_ENABLE_AVX endif endif endif diff --git a/benchmarks/micro/Cargo.toml b/benchmarks/micro/Cargo.toml index ffc1f09..40633f6 100644 --- a/benchmarks/micro/Cargo.toml +++ b/benchmarks/micro/Cargo.toml @@ -17,3 +17,7 @@ cc = "1.0.99" [[bench]] name = "my_benchmark" harness = false + +[[bench]] +name = "simd_dispatch" +harness = false diff --git a/benchmarks/micro/benches/simd_dispatch.rs b/benchmarks/micro/benches/simd_dispatch.rs new file mode 100644 index 0000000..64d67c2 --- /dev/null +++ b/benchmarks/micro/benches/simd_dispatch.rs @@ -0,0 +1,102 @@ +// Benchmarks for the SIMD distance-dispatch paths (issue #302). +// +// Two benches: +// distance/l2_float_d1536 — calls vec_distance_l2() directly via SQL scalar +// function; one distance computation per iteration, +// no KNN planner overhead. Tightest proxy for the +// AVX2 l2_sqr_float_avx kernel. +// knn/n5000_d1536 — end-to-end KNN query over 5 000 vectors at d=1536. +// Setup is paid once outside b.iter; each iteration +// is a single query that exercises the distance +// dispatch loop ~5 000 times. 
+// +// Run: +// cargo bench --bench simd_dispatch +// +// To capture a baseline before the SIMD dispatch fix: +// cargo bench --bench simd_dispatch 2>&1 | tee /tmp/bench-before-simd.txt +// After applying the fix, run again; Criterion will print a regression/improvement +// line for each bench. + +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; +use micro::init_vec; +use rand::Rng; +use rusqlite::Connection; +use zerocopy::AsBytes; + +fn random_vector(n: usize) -> Vec { + let mut rng = rand::thread_rng(); + (0..n).map(|_| rng.gen()).collect() +} + +fn setup_knn_db(d: usize, n: usize) -> Connection { + let mut db = Connection::open_in_memory().unwrap(); + db.execute( + format!("create virtual table v using vec0(a float[{d}])").as_str(), + [], + ) + .unwrap(); + let tx = db.transaction().unwrap(); + for _ in 0..n { + let vec = random_vector(d); + tx.execute("insert into v(a) values (?)", [vec.as_bytes()]) + .unwrap(); + } + tx.commit().unwrap(); + db +} + +fn bench_distance_l2(c: &mut Criterion) { + init_vec(); + let db = Connection::open_in_memory().unwrap(); + let a = random_vector(1536); + let b = random_vector(1536); + let mut stmt = db.prepare("select vec_distance_l2(?, ?)").unwrap(); + + let mut group = c.benchmark_group("distance"); + group.bench_function("l2_float_d1536", |bench| { + bench.iter(|| { + let _: f64 = stmt + .query_row(rusqlite::params![a.as_bytes(), b.as_bytes()], |r| r.get(0)) + .unwrap(); + }); + }); + group.finish(); +} + +fn bench_knn(c: &mut Criterion) { + init_vec(); + let d = 1536; + let n = 5_000; + let k = 10; + + let mut group = c.benchmark_group("knn"); + for page_size in [4096usize, 8192, 16384] { + let db = setup_knn_db(d, n); + let query = random_vector(d); + let mut stmt = db + .prepare("select rowid, distance from v where a match ? 
order by distance limit ?") + .unwrap(); + + group.bench_with_input( + BenchmarkId::new(format!("n{n}_d{d}"), page_size), + &page_size, + |b, _| { + b.iter(|| { + let results: Vec<(i64, f64)> = stmt + .query_map(rusqlite::params![query.as_bytes(), k], |r| { + Ok((r.get(0).unwrap(), r.get(1).unwrap())) + }) + .unwrap() + .collect::, _>>() + .unwrap(); + assert_eq!(results.len(), k); + }); + }, + ); + } + group.finish(); +} + +criterion_group!(benches, bench_distance_l2, bench_knn); +criterion_main!(benches); diff --git a/benchmarks/micro/build.rs b/benchmarks/micro/build.rs index 003a6e3..1aab68b 100644 --- a/benchmarks/micro/build.rs +++ b/benchmarks/micro/build.rs @@ -1,5 +1,7 @@ fn main() { cc::Build::new() .file("../../sqlite-vec.c") + .include("../../vendor") + .include("../../") .compile("sqlite_vec0"); } diff --git a/sqlite-vec.c b/sqlite-vec.c index 7af3b6a..fd1ad9a 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -86,6 +86,7 @@ enum VectorElementType { #define PORTABLE_ALIGN32 __attribute__((aligned(32))) #define PORTABLE_ALIGN64 __attribute__((aligned(64))) +__attribute__((target("avx,avx2"))) static f32 l2_sqr_float_avx(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { f32 *pVect1 = (f32 *)pVect1v; @@ -417,7 +418,9 @@ static f32 distance_l2_sqr_float(const void *a, const void *b, const void *d) { } #endif #ifdef SQLITE_VEC_ENABLE_AVX - if (((*(const size_t *)d) % 16 == 0)) { + static int has_avx2 = -1; + if (has_avx2 < 0) has_avx2 = __builtin_cpu_supports("avx2"); + if (has_avx2 && ((*(const size_t *)d) % 16 == 0)) { return l2_sqr_float_avx(a, b, d); } #endif @@ -713,6 +716,7 @@ static f32 distance_hamming_neon(const u8 *a, const u8 *b, size_t n_bytes) { * AVX2 Hamming distance using VPSHUFB-based popcount. * Processes 32 bytes (256 bits) per iteration. 
*/ +__attribute__((target("avx2"))) static f32 distance_hamming_avx2(const u8 *a, const u8 *b, size_t n_bytes) { const u8 *pEnd = a + n_bytes; @@ -815,7 +819,9 @@ static f32 distance_hamming(const void *a, const void *b, const void *d) { } #endif #ifdef SQLITE_VEC_ENABLE_AVX - if (n_bytes >= 32) { + static int has_avx2 = -1; + if (has_avx2 < 0) has_avx2 = __builtin_cpu_supports("avx2"); + if (has_avx2 && n_bytes >= 32) { return distance_hamming_avx2((const u8 *)a, (const u8 *)b, n_bytes); } #endif