diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index cbe36dade70..c3d9387942a 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -194,4 +194,5 @@ if(EXECUTORCH_BUILD_WEBGPU_TEST) target_compile_options(webgpu_op_test_util_test PRIVATE -fexceptions) set_property(TARGET webgpu_op_test_util_test PROPERTY CXX_STANDARD 17) endif() + add_webgpu_native_test(webgpu_index_test test/native/test_index.cpp) endif() diff --git a/backends/webgpu/scripts/test_webgpu_native_ci.sh b/backends/webgpu/scripts/test_webgpu_native_ci.sh index b2eb56a505e..38195535732 100644 --- a/backends/webgpu/scripts/test_webgpu_native_ci.sh +++ b/backends/webgpu/scripts/test_webgpu_native_ci.sh @@ -45,6 +45,8 @@ DISPATCH_ORDER_DIR="/tmp/dispatch_order" DISPATCH_ORDER_OK=1 UPDATE_CACHE_DIR="/tmp/update_cache" UPDATE_CACHE_OK=1 +INDEX_DIR="/tmp/index" +INDEX_OK=1 EMBEDDING_MODEL="/tmp/webgpu_embedding_q4gsw.pte" EMBEDDING_INDICES="/tmp/webgpu_embedding_q4gsw_indices.bin" EMBEDDING_GOLDEN="/tmp/webgpu_embedding_q4gsw_golden.bin" @@ -104,6 +106,11 @@ export_update_cache_replay('${UPDATE_CACHE_DIR}') export_update_cache_negative('${UPDATE_CACHE_DIR}') " || { echo "WARN: update_cache export failed; skipping update_cache native test"; UPDATE_CACHE_OK=0; } +$PYTHON_EXECUTABLE -c " +from executorch.backends.webgpu.test.ops.index.test_index import export_all_index_models +export_all_index_models('${INDEX_DIR}') +" || { echo "WARN: index export failed; skipping index native test"; INDEX_OK=0; } + # Non-fatal: a failed sdpa export makes the required 4k/8k configs hard-fail in # webgpu_native_test below (precise per-config error), so don't exit/mask here. 
$PYTHON_EXECUTABLE -c " @@ -136,7 +143,7 @@ cmake \ "${EXECUTORCH_ROOT}" # ── Build + run every native test target that exists in this tree ──────────── -TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test) +TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test webgpu_index_test) BIN_DIR="${BUILD_DIR}/backends/webgpu" # Which targets are defined depends on which diffs are landed (native_test + @@ -201,6 +208,9 @@ fi if [[ "${DISPATCH_ORDER_OK}" == "1" && -x "${BIN_DIR}/webgpu_dispatch_order_test" ]]; then "${BIN_DIR}/webgpu_dispatch_order_test" "${DISPATCH_ORDER_DIR}" fi +if [[ "${INDEX_OK}" == "1" && -x "${BIN_DIR}/webgpu_index_test" ]]; then + "${BIN_DIR}/webgpu_index_test" "${INDEX_DIR}" +fi [[ -x "${BIN_DIR}/webgpu_scratch_buffer_test" ]] && "${BIN_DIR}/webgpu_scratch_buffer_test" echo "=== WebGPU native tests on Dawn: all run targets passed ===" diff --git a/backends/webgpu/test/TARGETS b/backends/webgpu/test/TARGETS index 6bf76a525e9..d4f7046f0bd 100644 --- a/backends/webgpu/test/TARGETS +++ b/backends/webgpu/test/TARGETS @@ -17,6 +17,19 @@ python_unittest( ], ) +python_unittest( + name = "test_index", + srcs = [ + "ops/index/test_index.py", + ], + deps = [ + "//caffe2:torch", + "//executorch/backends/vulkan/partitioner:vulkan_partitioner", + "//executorch/backends/vulkan:vulkan_preprocess", + "//executorch/exir:lib", + ], +) + runtime.python_library( name = "tester", srcs = ["tester.py"], diff --git a/backends/webgpu/test/native/test_index.cpp b/backends/webgpu/test/native/test_index.cpp new file mode 100644 index 00000000000..aed24c0a796 --- /dev/null +++ b/backends/webgpu/test/native/test_index.cpp @@ -0,0 +1,174 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace executorch::backends::webgpu; +using namespace executorch::extension; +using namespace executorch::runtime; + +namespace { + +// Names mirror test_index.py CONFIGS (self/idx/golden bins written per case). +constexpr const char* kIndexCases[] = { + "index_n16_m5", + "index_n8_rev", + "index_n32_m3", + "index_n4_rep", +}; + +std::vector read_f32_bin(const std::string& path) { + std::ifstream f(path, std::ios::binary | std::ios::ate); + if (!f) { + return {}; + } + const size_t bytes = + static_cast(f.tellg()) / sizeof(float) * sizeof(float); + f.seekg(0); + std::vector data(bytes / sizeof(float)); + f.read( + reinterpret_cast(data.data()), + static_cast(bytes)); + return data; +} + +std::vector read_i32_bin(const std::string& path) { + std::ifstream f(path, std::ios::binary | std::ios::ate); + if (!f) { + return {}; + } + const size_t bytes = + static_cast(f.tellg()) / sizeof(int32_t) * sizeof(int32_t); + f.seekg(0); + std::vector data(bytes / sizeof(int32_t)); + f.read( + reinterpret_cast(data.data()), + static_cast(bytes)); + return data; +} + +bool run_case(const std::string& dir, const char* name) { + printf("\n--- Test: %s ---\n", name); + const std::string base = dir + "/" + name; + std::vector self_data = read_f32_bin(base + ".self.bin"); + std::vector idx32 = read_i32_bin(base + ".idx.bin"); + std::vector golden = read_f32_bin(base + ".golden.bin"); + if (self_data.empty() || idx32.empty() || golden.empty()) { + printf("FAIL: could not read self/idx/golden for %s\n", name); + return false; + } + + Module module(base + ".pte"); + if (module.load_forward() != Error::Ok) { + printf("FAIL: could not load %s.pte\n", name); + return false; + } + + const int32_t n = static_cast(self_data.size()); + const int32_t m = static_cast(idx32.size()); + auto x = make_tensor_ptr({n}, std::vector(self_data)); + // int64 at the program boundary; 
copy_inputs narrows to the int32 buffer. + std::vector idx64(idx32.begin(), idx32.end()); + auto idx = make_tensor_ptr({m}, std::vector(idx64)); + + auto result = module.forward({EValue(x), EValue(idx)}); + if (!result.ok()) { + printf("FAIL: forward failed (error %d)\n", (int)result.error()); + return false; + } + + const auto& outputs = result.get(); + // index.Tensor has exactly one output of shape [num_indices]; fail loud else. + if (outputs.size() != 1 || !outputs[0].isTensor()) { + printf("FAIL: expected exactly one tensor output\n"); + return false; + } + const auto& out_tensor = outputs[0].toTensor(); + if (out_tensor.dim() != 1 || out_tensor.size(0) != m) { + printf( + "FAIL: output shape mismatch (dim %d size0 %d, expected [%d])\n", + (int)out_tensor.dim(), + (int)(out_tensor.dim() == 1 ? out_tensor.size(0) : -1), + m); + return false; + } + if (static_cast(out_tensor.numel()) != golden.size()) { + printf( + "FAIL: output numel %zu != golden %zu\n", + (size_t)out_tensor.numel(), + golden.size()); + return false; + } + const float* out_data = out_tensor.const_data_ptr(); + + float max_abs_err = 0.0f; + float max_rel_err = 0.0f; + for (size_t i = 0; i < golden.size(); i++) { + const float abs_err = std::abs(out_data[i] - golden[i]); + max_abs_err = std::max(max_abs_err, abs_err); + const float denom = std::max(std::abs(golden[i]), 1e-6f); + max_rel_err = std::max(max_rel_err, abs_err / denom); + } + printf( + "Max abs error: %e Max rel error: %e (%zu elements)\n", + max_abs_err, + max_rel_err, + golden.size()); + if (max_abs_err > 1e-3f || max_rel_err > 1e-3f) { + printf("FAIL: %s exceeds tolerance 1e-3\n", name); + return false; + } + printf("PASS: %s\n", name); + return true; +} + +} // namespace + +int main(int argc, char** argv) { + std::string dir = "/tmp/index"; + if (argc > 1) { + dir = argv[1]; + } + if (const char* env = std::getenv("WEBGPU_INDEX_DIR")) { + dir = env; + } + + WebGPUContext ctx; + try { + ctx = create_webgpu_context(); + } catch 
(const std::exception& e) { + printf("SKIP: %s\n", e.what()); + return 0; + } + set_default_webgpu_context(&ctx); + printf("WebGPU device acquired (native); case dir: %s\n", dir.c_str()); + + bool ok = true; + for (const char* name : kIndexCases) { + ok = run_case(dir, name) && ok; + } + + set_default_webgpu_context(nullptr); + destroy_webgpu_context(ctx); + + if (!ok) { + return 1; + } + printf("\nAll index tests passed\n"); + return 0; +} diff --git a/backends/webgpu/test/ops/index/__init__.py b/backends/webgpu/test/ops/index/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/backends/webgpu/test/ops/index/test_index.py b/backends/webgpu/test/ops/index/test_index.py new file mode 100644 index 00000000000..6e008c5bd6f --- /dev/null +++ b/backends/webgpu/test/ops/index/test_index.py @@ -0,0 +1,106 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""`aten.index.Tensor` export + goldens for the WebGPU backend. + +Exports the 1D-self advanced-index form `self[idx]` through VulkanPartitioner -- +the only delegated index.Tensor (the 2D mask/freqs gathers are CPU fallbacks; see +op_registry.py:1427). It is a flat gather out[i]=self[index[i]]; the int64 index +serializes as int32 (downcast_64_bit). Distinct self values + reorder/repeat +indices make a wrong-gather bug visible. Each config writes `index_<name>.pte`, +`index_<name>.self.bin` (fp32 self), `index_<name>.idx.bin` (int32 index), and +`index_<name>.golden.bin` so the native `test_index` self-discovers them. 
+""" + +import os +import unittest + +import torch + +from executorch.backends.vulkan import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower + +# name -> (self_len, index_values) +CONFIGS = { + "n16_m5": (16, [0, 15, 7, 7, 2]), + "n8_rev": (8, [7, 6, 5, 4, 3, 2, 1, 0]), + "n32_m3": (32, [31, 0, 16]), + "n4_rep": (4, [2, 2, 2, 2, 0, 1]), +} + + +class IndexModule(torch.nn.Module): + def forward(self, x: torch.Tensor, idx: torch.Tensor) -> torch.Tensor: + return x[idx] + + +def _inputs(self_len, index_values): + # Distinct self values so a wrong-index gather is visible. + x = torch.arange(self_len, dtype=torch.float32) * 3.0 + 0.5 + idx = torch.tensor(index_values, dtype=torch.int64) + return x, idx + + +def _lower(x, idx): + ep = torch.export.export(IndexModule().eval(), (x, idx)) + return to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()]) + + +def _export(x, idx): + return _lower(x, idx).to_executorch() + + +def _delegated(et) -> bool: + return any( + d.id == "VulkanBackend" + for plan in et.executorch_program.execution_plan + for d in plan.delegates + ) + + +def _op_delegated(edge, op_substr: str) -> bool: + # op must be absorbed into the delegate, not left as a top-level CPU-fallback node. 
+ gm = edge.exported_program().graph_module + return all(op_substr not in str(getattr(n, "target", "")) for n in gm.graph.nodes) + + +class TestIndex(unittest.TestCase): + def test_export_delegates(self) -> None: + for name, (n, iv) in CONFIGS.items(): + edge = _lower(*_inputs(n, iv)) + et = edge.to_executorch() + self.assertTrue( + _delegated(et), f"Expected a VulkanBackend delegate (index {name})" + ) + self.assertTrue( + _op_delegated(edge, "index.Tensor"), + f"index.Tensor not delegated (fell back to CPU) for {name}", + ) + + def test_golden_matches_eager(self) -> None: + for _, (n, iv) in CONFIGS.items(): + x, idx = _inputs(n, iv) + torch.testing.assert_close(IndexModule()(x, idx), x[idx]) + + +def export_all_index_models(out_dir: str) -> None: + """Write index_<name>.pte + .self/.idx/.golden.bin for every config.""" + os.makedirs(out_dir, exist_ok=True) + for name, (n, iv) in CONFIGS.items(): + x, idx = _inputs(n, iv) + golden = x[idx].contiguous().detach().numpy().astype(" golden {golden.size} floats") + + +if __name__ == "__main__": + unittest.main()