Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,14 @@ endif

ifndef OMIT_SIMD
ifeq ($(shell uname -sm),Darwin x86_64)
CFLAGS += -mavx -mavx2 -DSQLITE_VEC_ENABLE_AVX
CFLAGS += -DSQLITE_VEC_ENABLE_AVX
endif
ifeq ($(shell uname -sm),Darwin arm64)
CFLAGS += -mcpu=apple-m1 -DSQLITE_VEC_ENABLE_NEON
endif
ifeq ($(shell uname -s),Linux)
ifeq ($(shell uname -sm),Linux x86_64)
ifeq ($(findstring android,$(CC)),)
ifneq ($(filter avx,$(shell grep -o 'avx[^ ]*' /proc/cpuinfo 2>/dev/null | head -1)),)
CFLAGS += -mavx -mavx2 -DSQLITE_VEC_ENABLE_AVX
endif
CFLAGS += -DSQLITE_VEC_ENABLE_AVX
endif
endif
endif
Expand Down
4 changes: 4 additions & 0 deletions benchmarks/micro/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,7 @@ cc = "1.0.99"
[[bench]]
name = "my_benchmark"
harness = false

[[bench]]
name = "simd_dispatch"
harness = false
102 changes: 102 additions & 0 deletions benchmarks/micro/benches/simd_dispatch.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
// Benchmarks for the SIMD distance-dispatch paths (issue #302).
//
// Two benches:
// distance/l2_float_d1536 — calls vec_distance_l2() directly via SQL scalar
// function; one distance computation per iteration,
// no KNN planner overhead. Tightest proxy for the
// AVX2 l2_sqr_float_avx kernel.
// knn/n5000_d1536 — end-to-end KNN query over 5 000 vectors at d=1536.
// Setup is paid once outside b.iter; each iteration
// is a single query that exercises the distance
// dispatch loop ~5 000 times.
//
// Run:
// cargo bench --bench simd_dispatch
//
// To capture a baseline before the SIMD dispatch fix:
// cargo bench --bench simd_dispatch 2>&1 | tee /tmp/bench-before-simd.txt
// After applying the fix, run again; Criterion will print a regression/improvement
// line for each bench.

use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
use micro::init_vec;
use rand::Rng;
use rusqlite::Connection;
use zerocopy::AsBytes;

/// Builds a vector of `n` pseudo-random `f32` values drawn from the
/// thread-local RNG.
fn random_vector(n: usize) -> Vec<f32> {
    let mut rng = rand::thread_rng();
    // One RNG draw per element, in order.
    std::iter::repeat_with(|| rng.gen::<f32>()).take(n).collect()
}

/// Creates an in-memory database containing a `vec0` virtual table `v` with a
/// single `float[d]` column `a`, filled with `n` random vectors.
///
/// All inserts run inside one transaction, which is committed before the
/// connection is returned.
///
/// # Panics
///
/// Panics if opening the database, creating the table, any insert, or the
/// commit fails.
fn setup_knn_db(d: usize, n: usize) -> Connection {
    let mut conn = Connection::open_in_memory().unwrap();

    let create_sql = format!("create virtual table v using vec0(a float[{d}])");
    conn.execute(create_sql.as_str(), []).unwrap();

    {
        // Scoped so the transaction's borrow of `conn` ends before we return it.
        let txn = conn.transaction().unwrap();
        (0..n).for_each(|_| {
            let row = random_vector(d);
            txn.execute("insert into v(a) values (?)", [row.as_bytes()])
                .unwrap();
        });
        txn.commit().unwrap();
    }

    conn
}

/// Benchmarks a single `vec_distance_l2(?, ?)` SQL call on two random
/// 1536-dimensional float vectors, registered as `distance/l2_float_d1536`.
///
/// The statement is prepared once; each iteration binds both vectors as blobs
/// and reads the resulting distance back as an `f64`.
fn bench_distance_l2(c: &mut Criterion) {
    const DIM: usize = 1536;

    // NOTE(review): presumably registers the sqlite-vec extension so that
    // `vec_distance_l2` resolves — confirm against `micro::init_vec`.
    init_vec();

    let conn = Connection::open_in_memory().unwrap();
    let lhs = random_vector(DIM);
    let rhs = random_vector(DIM);
    let mut distance_stmt = conn.prepare("select vec_distance_l2(?, ?)").unwrap();

    let mut group = c.benchmark_group("distance");
    group.bench_function("l2_float_d1536", |bencher| {
        bencher.iter(|| {
            let args = rusqlite::params![lhs.as_bytes(), rhs.as_bytes()];
            // The value is read only to force the query to run to completion.
            let _distance: f64 = distance_stmt
                .query_row(args, |row| row.get(0))
                .unwrap();
        });
    });
    group.finish();
}

/// Benchmarks an end-to-end KNN query (`k = 10`) over 5 000 random
/// 1536-dimensional vectors, registered as `knn/n5000_d1536`.
///
/// The table is populated and the statement prepared once, outside the timed
/// closure, so each iteration measures exactly one query.
///
/// A single benchmark is registered on purpose: nothing in this file applies a
/// page size to the database, so parameterising the benchmark ID by page size
/// would only rebuild the same 5 000-row table several times and time
/// identical work under misleading labels.
///
/// # Panics
///
/// Panics if setup or the query fails, or if the query does not return
/// exactly `k` rows.
fn bench_knn(c: &mut Criterion) {
    // NOTE(review): presumably registers the sqlite-vec extension so that the
    // `vec0` module is available — confirm against `micro::init_vec`.
    init_vec();
    let d = 1536;
    let n = 5_000;
    let k = 10;

    // Setup is paid once, outside `b.iter`.
    let db = setup_knn_db(d, n);
    let query = random_vector(d);
    let mut stmt = db
        .prepare("select rowid, distance from v where a match ? order by distance limit ?")
        .unwrap();

    let mut group = c.benchmark_group("knn");
    // `from_parameter` yields the ID `knn/n{n}_d{d}` with no extra suffix.
    group.bench_function(BenchmarkId::from_parameter(format!("n{n}_d{d}")), |b| {
        b.iter(|| {
            let results: Vec<(i64, f64)> = stmt
                .query_map(rusqlite::params![query.as_bytes(), k], |r| {
                    // Propagate column errors; the `unwrap` below surfaces them.
                    Ok((r.get(0)?, r.get(1)?))
                })
                .unwrap()
                .collect::<Result<Vec<_>, _>>()
                .unwrap();
            // Guards against silently timing an empty or truncated result set.
            assert_eq!(results.len(), k);
        });
    });
    group.finish();
}

criterion_group!(benches, bench_distance_l2, bench_knn);
criterion_main!(benches);
2 changes: 2 additions & 0 deletions benchmarks/micro/build.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
/// Build script: compiles `../../sqlite-vec.c` into the static library
/// `sqlite_vec0`, with `../../vendor` and `../../` on the include path.
fn main() {
    // The C source lives outside this package, so Cargo's default change
    // detection never sees edits to it. Without this directive, a benchmark
    // run after changing sqlite-vec.c could silently reuse a stale build.
    println!("cargo:rerun-if-changed=../../sqlite-vec.c");

    cc::Build::new()
        .file("../../sqlite-vec.c")
        .include("../../vendor")
        .include("../../")
        .compile("sqlite_vec0");
}
10 changes: 8 additions & 2 deletions sqlite-vec.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ enum VectorElementType {
#define PORTABLE_ALIGN32 __attribute__((aligned(32)))
#define PORTABLE_ALIGN64 __attribute__((aligned(64)))

__attribute__((target("avx,avx2")))
static f32 l2_sqr_float_avx(const void *pVect1v, const void *pVect2v,
const void *qty_ptr) {
f32 *pVect1 = (f32 *)pVect1v;
Expand Down Expand Up @@ -417,7 +418,9 @@ static f32 distance_l2_sqr_float(const void *a, const void *b, const void *d) {
}
#endif
#ifdef SQLITE_VEC_ENABLE_AVX
if (((*(const size_t *)d) % 16 == 0)) {
static int has_avx2 = -1;
if (has_avx2 < 0) has_avx2 = __builtin_cpu_supports("avx2");
if (has_avx2 && ((*(const size_t *)d) % 16 == 0)) {
return l2_sqr_float_avx(a, b, d);
}
#endif
Expand Down Expand Up @@ -713,6 +716,7 @@ static f32 distance_hamming_neon(const u8 *a, const u8 *b, size_t n_bytes) {
* AVX2 Hamming distance using VPSHUFB-based popcount.
* Processes 32 bytes (256 bits) per iteration.
*/
__attribute__((target("avx2")))
static f32 distance_hamming_avx2(const u8 *a, const u8 *b, size_t n_bytes) {
const u8 *pEnd = a + n_bytes;

Expand Down Expand Up @@ -815,7 +819,9 @@ static f32 distance_hamming(const void *a, const void *b, const void *d) {
}
#endif
#ifdef SQLITE_VEC_ENABLE_AVX
if (n_bytes >= 32) {
static int has_avx2 = -1;
if (has_avx2 < 0) has_avx2 = __builtin_cpu_supports("avx2");
if (has_avx2 && n_bytes >= 32) {
return distance_hamming_avx2((const u8 *)a, (const u8 *)b, n_bytes);
}
#endif
Expand Down