Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions backends/webgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -194,4 +194,5 @@ if(EXECUTORCH_BUILD_WEBGPU_TEST)
target_compile_options(webgpu_op_test_util_test PRIVATE -fexceptions)
set_property(TARGET webgpu_op_test_util_test PROPERTY CXX_STANDARD 17)
endif()
add_webgpu_native_test(webgpu_index_test test/native/test_index.cpp)
endif()
12 changes: 11 additions & 1 deletion backends/webgpu/scripts/test_webgpu_native_ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ DISPATCH_ORDER_DIR="/tmp/dispatch_order"
DISPATCH_ORDER_OK=1
UPDATE_CACHE_DIR="/tmp/update_cache"
UPDATE_CACHE_OK=1
INDEX_DIR="/tmp/index"
INDEX_OK=1
EMBEDDING_MODEL="/tmp/webgpu_embedding_q4gsw.pte"
EMBEDDING_INDICES="/tmp/webgpu_embedding_q4gsw_indices.bin"
EMBEDDING_GOLDEN="/tmp/webgpu_embedding_q4gsw_golden.bin"
Expand Down Expand Up @@ -104,6 +106,11 @@ export_update_cache_replay('${UPDATE_CACHE_DIR}')
export_update_cache_negative('${UPDATE_CACHE_DIR}')
" || { echo "WARN: update_cache export failed; skipping update_cache native test"; UPDATE_CACHE_OK=0; }

# Non-fatal: a failed index export only clears INDEX_OK, so the index native
# test further down is skipped instead of failing the whole script.
$PYTHON_EXECUTABLE -c "
from executorch.backends.webgpu.test.ops.index.test_index import export_all_index_models
export_all_index_models('${INDEX_DIR}')
" || { echo "WARN: index export failed; skipping index native test"; INDEX_OK=0; }

# Non-fatal: a failed sdpa export makes the required 4k/8k configs hard-fail in
# webgpu_native_test below (precise per-config error), so don't exit/mask here.
$PYTHON_EXECUTABLE -c "
Expand Down Expand Up @@ -136,7 +143,7 @@ cmake \
"${EXECUTORCH_ROOT}"

# ── Build + run every native test target that exists in this tree ────────────
TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test)
TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test webgpu_index_test)
BIN_DIR="${BUILD_DIR}/backends/webgpu"

# Which targets are defined depends on which diffs are landed (native_test +
Expand Down Expand Up @@ -201,6 +208,9 @@ fi
if [[ "${DISPATCH_ORDER_OK}" == "1" && -x "${BIN_DIR}/webgpu_dispatch_order_test" ]]; then
"${BIN_DIR}/webgpu_dispatch_order_test" "${DISPATCH_ORDER_DIR}"
fi
# Run the index test only when its export succeeded (INDEX_OK) and the binary
# exists and is executable; the export directory is passed as the first argument.
if [[ "${INDEX_OK}" == "1" && -x "${BIN_DIR}/webgpu_index_test" ]]; then
  "${BIN_DIR}/webgpu_index_test" "${INDEX_DIR}"
fi
[[ -x "${BIN_DIR}/webgpu_scratch_buffer_test" ]] && "${BIN_DIR}/webgpu_scratch_buffer_test"

echo "=== WebGPU native tests on Dawn: all run targets passed ==="
Expand Down
13 changes: 13 additions & 0 deletions backends/webgpu/test/TARGETS
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,19 @@ python_unittest(
],
)

# Python unit test for the index export in ops/index/test_index.py. That file
# lowers through VulkanPartitioner, hence the vulkan partitioner/preprocess deps.
python_unittest(
    name = "test_index",
    srcs = [
        "ops/index/test_index.py",
    ],
    deps = [
        "//caffe2:torch",
        "//executorch/backends/vulkan/partitioner:vulkan_partitioner",
        "//executorch/backends/vulkan:vulkan_preprocess",
        "//executorch/exir:lib",
    ],
)

runtime.python_library(
name = "tester",
srcs = ["tester.py"],
Expand Down
174 changes: 174 additions & 0 deletions backends/webgpu/test/native/test_index.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/webgpu/runtime/WebGPUDevice.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <string>
#include <vector>

using namespace executorch::backends::webgpu;
using namespace executorch::extension;
using namespace executorch::runtime;

namespace {

// Base names of the cases to run. run_case() appends ".pte", ".self.bin",
// ".idx.bin" and ".golden.bin" to "<dir>/<name>". The names mirror
// test_index.py CONFIGS (which writes each config with an "index_" prefix), so
// this list has to be updated by hand whenever a config is added or renamed.
constexpr const char* kIndexCases[] = {
    "index_n16_m5",
    "index_n8_rev",
    "index_n32_m3",
    "index_n4_rep",
};

// Reads a raw binary file as a packed array of T; the bytes are copied
// straight into the vector, so they are interpreted in T's in-memory layout.
//
// Returns an empty vector when the file cannot be opened, its size cannot be
// determined, it holds less than one whole element, or the read comes up
// short. Trailing bytes that do not form a whole element are ignored.
template <typename T>
std::vector<T> read_pod_bin(const std::string& path) {
  // Open at the end so tellg() yields the file size.
  std::ifstream f(path, std::ios::binary | std::ios::ate);
  if (!f) {
    return {};
  }
  // tellg() reports failure as -1; casting that to size_t would request an
  // enormous allocation, so reject it before doing any size arithmetic.
  const std::streamoff end = static_cast<std::streamoff>(f.tellg());
  if (end < 0) {
    return {};
  }
  const size_t count = static_cast<size_t>(end) / sizeof(T);
  if (count == 0) {
    return {};
  }
  const size_t bytes = count * sizeof(T);
  f.seekg(0);
  std::vector<T> data(count);
  f.read(
      reinterpret_cast<char*>(data.data()),
      static_cast<std::streamsize>(bytes));
  // A short read would leave zero-filled elements that look like real data.
  if (!f || static_cast<size_t>(f.gcount()) != bytes) {
    return {};
  }
  return data;
}

// Reads `path` as packed 32-bit floats; empty result means "could not read".
std::vector<float> read_f32_bin(const std::string& path) {
  return read_pod_bin<float>(path);
}

// Reads `path` as packed 32-bit signed integers; empty result means "could not
// read".
std::vector<int32_t> read_i32_bin(const std::string& path) {
  return read_pod_bin<int32_t>(path);
}

// Runs one exported index case end to end and checks it against its golden.
//
// Reads "<dir>/<name>.self.bin" (fp32), "<dir>/<name>.idx.bin" (int32) and
// "<dir>/<name>.golden.bin" (fp32), loads "<dir>/<name>.pte", calls forward
// with (self, index) and compares the single output tensor with the golden.
//
// Returns true only when the output is 1-D with one element per index, has as
// many elements as the golden file, every element error is finite, and both
// the max absolute and max relative error are within 1e-3. Every failure path
// prints a "FAIL: ..." line and returns false.
bool run_case(const std::string& dir, const char* name) {
  printf("\n--- Test: %s ---\n", name);
  const std::string base = dir + "/" + name;
  std::vector<float> self_data = read_f32_bin(base + ".self.bin");
  std::vector<int32_t> idx32 = read_i32_bin(base + ".idx.bin");
  std::vector<float> golden = read_f32_bin(base + ".golden.bin");
  // The readers signal any failure with an empty vector.
  if (self_data.empty() || idx32.empty() || golden.empty()) {
    printf("FAIL: could not read self/idx/golden for %s\n", name);
    return false;
  }

  Module module(base + ".pte");
  if (module.load_forward() != Error::Ok) {
    printf("FAIL: could not load %s.pte\n", name);
    return false;
  }

  const int32_t n = static_cast<int32_t>(self_data.size());
  const int32_t m = static_cast<int32_t>(idx32.size());
  auto x = make_tensor_ptr({n}, std::vector<float>(self_data));
  // int64 at the program boundary; copy_inputs narrows to the int32 buffer.
  // The widened vector is built in place so it is not copied a second time.
  auto idx =
      make_tensor_ptr({m}, std::vector<int64_t>(idx32.begin(), idx32.end()));

  auto result = module.forward({EValue(x), EValue(idx)});
  if (!result.ok()) {
    printf(
        "FAIL: forward failed (error %d)\n", static_cast<int>(result.error()));
    return false;
  }

  const auto& outputs = result.get();
  // index.Tensor has exactly one output of shape [num_indices]; fail loud else.
  if (outputs.size() != 1 || !outputs[0].isTensor()) {
    printf("FAIL: expected exactly one tensor output\n");
    return false;
  }
  const auto& out_tensor = outputs[0].toTensor();
  if (out_tensor.dim() != 1 || out_tensor.size(0) != m) {
    printf(
        "FAIL: output shape mismatch (dim %d size0 %d, expected [%d])\n",
        static_cast<int>(out_tensor.dim()),
        static_cast<int>(out_tensor.dim() == 1 ? out_tensor.size(0) : -1),
        m);
    return false;
  }
  if (static_cast<size_t>(out_tensor.numel()) != golden.size()) {
    printf(
        "FAIL: output numel %zu != golden %zu\n",
        static_cast<size_t>(out_tensor.numel()),
        golden.size());
    return false;
  }
  // NOTE(review): assumes the output dtype is fp32; it is not checked here --
  // TODO confirm against the exported program.
  const float* out_data = out_tensor.const_data_ptr<float>();

  float max_abs_err = 0.0f;
  float max_rel_err = 0.0f;
  size_t non_finite = 0;
  for (size_t i = 0; i < golden.size(); i++) {
    const float abs_err = std::abs(out_data[i] - golden[i]);
    // std::max(a, NaN) returns a, so a NaN element would never raise the
    // running maxima and the case would pass; count such elements explicitly.
    if (!std::isfinite(abs_err)) {
      non_finite++;
      continue;
    }
    max_abs_err = std::max(max_abs_err, abs_err);
    // Floor the denominator so a zero golden value does not divide by zero.
    const float denom = std::max(std::abs(golden[i]), 1e-6f);
    max_rel_err = std::max(max_rel_err, abs_err / denom);
  }
  printf(
      "Max abs error: %e Max rel error: %e (%zu elements)\n",
      max_abs_err,
      max_rel_err,
      golden.size());
  if (non_finite != 0) {
    printf(
        "FAIL: %s has %zu element(s) with a non-finite error\n",
        name,
        non_finite);
    return false;
  }
  if (max_abs_err > 1e-3f || max_rel_err > 1e-3f) {
    printf("FAIL: %s exceeds tolerance 1e-3\n", name);
    return false;
  }
  printf("PASS: %s\n", name);
  return true;
}

} // namespace

// Runs every case in kIndexCases against a native WebGPU device.
//
// Case directory, lowest to highest precedence: "/tmp/index", argv[1], the
// WEBGPU_INDEX_DIR environment variable.
// NOTE(review): the environment variable overrides an explicit argv[1] --
// confirm that this precedence is intended.
//
// Exit code: 0 when all cases pass, and also 0 (after printing "SKIP: ...")
// when no WebGPU context can be created; 1 when any case fails.
int main(int argc, char** argv) {
  std::string case_dir = (argc > 1) ? argv[1] : "/tmp/index";
  const char* const env_dir = std::getenv("WEBGPU_INDEX_DIR");
  if (env_dir != nullptr) {
    case_dir = env_dir;
  }

  WebGPUContext ctx;
  try {
    ctx = create_webgpu_context();
  } catch (const std::exception& e) {
    // No usable device: report it and exit successfully rather than failing.
    printf("SKIP: %s\n", e.what());
    return 0;
  }
  set_default_webgpu_context(&ctx);
  printf("WebGPU device acquired (native); case dir: %s\n", case_dir.c_str());

  // Every case is run even after a failure so all results get printed.
  bool all_passed = true;
  for (const char* case_name : kIndexCases) {
    if (!run_case(case_dir, case_name)) {
      all_passed = false;
    }
  }

  set_default_webgpu_context(nullptr);
  destroy_webgpu_context(ctx);

  if (all_passed) {
    printf("\nAll index tests passed\n");
    return 0;
  }
  return 1;
}
Empty file.
106 changes: 106 additions & 0 deletions backends/webgpu/test/ops/index/test_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""`aten.index.Tensor` export + goldens for the WebGPU backend.

Exports the 1D-self advanced-index form `self[idx]` through VulkanPartitioner --
the only delegated index.Tensor (the 2D mask/freqs gathers are CPU fallbacks; see
op_registry.py:1427). It is a flat gather out[i]=self[index[i]]; the int64 index
serializes as int32 (downcast_64_bit). Distinct self values + reorder/repeat
indices make a wrong-gather bug visible. Each config writes `index_<name>.pte`,
`index_<name>.self.bin` (fp32 self), `index_<name>.idx.bin` (int32 index), and
`index_<name>.golden.bin`; the native `test_index.cpp` loads them by the case
names listed in its `kIndexCases`, which must be kept in sync with CONFIGS.
"""

import os
import unittest

import torch

from executorch.backends.vulkan import VulkanPartitioner
from executorch.exir import to_edge_transform_and_lower

# name -> (self_len, index_values)
# `name` becomes the file stem `index_<name>` in export_all_index_models;
# `self_len` is the length of the 1D fp32 self tensor and `index_values` the
# gather indices into it. The cases cover repeated indices (n16_m5), a full
# reversal (n8_rev), first/last elements (n32_m3), and more indices than self
# elements (n4_rep).
CONFIGS = {
    "n16_m5": (16, [0, 15, 7, 7, 2]),
    "n8_rev": (8, [7, 6, 5, 4, 3, 2, 1, 0]),
    "n32_m3": (32, [31, 0, 16]),
    "n4_rep": (4, [2, 2, 2, 2, 0, 1]),
}


class IndexModule(torch.nn.Module):
    """Minimal module whose forward is the tensor-index gather `x[idx]`."""

    def forward(self, x: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
        # Keep the plain subscript form: the tests in this file look for the
        # `index.Tensor` op that this expression is expected to export as.
        return x[idx]


def _inputs(self_len, index_values):
# Distinct self values so a wrong-index gather is visible.
x = torch.arange(self_len, dtype=torch.float32) * 3.0 + 0.5
idx = torch.tensor(index_values, dtype=torch.int64)
return x, idx


def _lower(x, idx):
    """Export IndexModule on (x, idx) and lower it with VulkanPartitioner.

    Returns whatever `to_edge_transform_and_lower` returns for the export.
    """
    module = IndexModule().eval()
    exported = torch.export.export(module, (x, idx))
    partitioners = [VulkanPartitioner()]
    return to_edge_transform_and_lower(exported, partitioner=partitioners)


def _export(x, idx):
    """Lower IndexModule on (x, idx) and convert the result with to_executorch()."""
    lowered = _lower(x, idx)
    return lowered.to_executorch()


def _delegated(et) -> bool:
return any(
d.id == "VulkanBackend"
for plan in et.executorch_program.execution_plan
for d in plan.delegates
)


def _op_delegated(edge, op_substr: str) -> bool:
# op must be absorbed into the delegate, not left as a top-level CPU-fallback node.
gm = edge.exported_program().graph_module
return all(op_substr not in str(getattr(n, "target", "")) for n in gm.graph.nodes)


class TestIndex(unittest.TestCase):
    """Export and eager checks for every entry in CONFIGS.

    Each config runs inside its own subTest, so one failing config is reported
    by name and does not stop the remaining configs from being checked.
    """

    def test_export_delegates(self) -> None:
        """Every config must lower to a VulkanBackend delegate that owns the op."""
        for name, (n, iv) in CONFIGS.items():
            with self.subTest(config=name):
                edge = _lower(*_inputs(n, iv))
                et = edge.to_executorch()
                self.assertTrue(
                    _delegated(et),
                    f"Expected a VulkanBackend delegate (index {name})",
                )
                self.assertTrue(
                    _op_delegated(edge, "index.Tensor"),
                    f"index.Tensor not delegated (fell back to CPU) for {name}",
                )

    def test_golden_matches_eager(self) -> None:
        """Eager IndexModule output must equal an element-by-element gather."""
        for name, (n, iv) in CONFIGS.items():
            with self.subTest(config=name):
                x, idx = _inputs(n, iv)
                # Build the reference one scalar at a time from the Python
                # index list. Comparing against `x[idx]` would re-run the exact
                # expression under test and could never fail.
                expected = torch.tensor([x[i].item() for i in iv], dtype=x.dtype)
                actual = IndexModule()(x, idx)
                self.assertEqual(tuple(actual.shape), (len(iv),))
                # A gather copies values, so the match must be exact.
                torch.testing.assert_close(actual, expected, rtol=0, atol=0)


def export_all_index_models(out_dir: str) -> None:
    """Export every CONFIGS entry into `out_dir`, creating it if needed.

    For each config this writes `index_<name>.pte` (the exported program
    buffer) plus three raw little-endian files: `.self.bin` (float32 self),
    `.idx.bin` (index narrowed to int32) and `.golden.bin` (float32 `x[idx]`).
    """
    os.makedirs(out_dir, exist_ok=True)
    for case_name, (self_len, index_values) in CONFIGS.items():
        self_tensor, index_tensor = _inputs(self_len, index_values)
        # The golden is the eager gather, taken before the export runs.
        gathered = self_tensor[index_tensor].contiguous().detach()
        golden = gathered.numpy().astype("<f4")
        program = _export(self_tensor, index_tensor)

        prefix = os.path.join(out_dir, f"index_{case_name}")
        with open(prefix + ".pte", "wb") as pte_file:
            pte_file.write(program.buffer)

        # Written in the same order as listed: self, index, golden.
        raw_files = (
            (".self.bin", self_tensor.numpy().astype("<f4")),
            (".idx.bin", index_tensor.numpy().astype("<i4")),
            (".golden.bin", golden),
        )
        for suffix, array in raw_files:
            array.tofile(prefix + suffix)
        print(
            f"Exported {prefix}.pte; self {self_len} -> golden {golden.size} floats"
        )


# Running this file directly runs the unit tests; exporting the models is a
# separate step done by calling export_all_index_models().
if __name__ == "__main__":
    unittest.main()
Loading