diff --git a/crates/rustc_codegen_spirv/src/abi.rs b/crates/rustc_codegen_spirv/src/abi.rs index f1453a8c5f..da589937f4 100644 --- a/crates/rustc_codegen_spirv/src/abi.rs +++ b/crates/rustc_codegen_spirv/src/abi.rs @@ -3,6 +3,7 @@ use crate::attr::{AggregatedSpirvAttributes, IntrinsicType}; use crate::codegen_cx::CodegenCx; +use crate::maybe_pqp_cg_ssa::traits::ConstCodegenMethods as _; use crate::spirv_type::SpirvType; use itertools::Itertools; use rspirv::spirv::{Dim, ImageFormat, StorageClass, Word}; @@ -885,6 +886,48 @@ fn trans_intrinsic_type<'tcx>( args: GenericArgsRef<'tcx>, intrinsic_type_attr: IntrinsicType, ) -> Result { + trait FromScalarInt: Sized { + fn from_scalar_int(n: ScalarInt) -> Option; + } + + impl FromScalarInt for u32 { + fn from_scalar_int(n: ScalarInt) -> Option { + Some(n.try_to_bits(Size::from_bits(32)).ok()?.try_into().unwrap()) + } + } + + impl FromScalarInt for Dim { + fn from_scalar_int(n: ScalarInt) -> Option { + Dim::from_u32(u32::from_scalar_int(n)?) + } + } + + impl FromScalarInt for ImageFormat { + fn from_scalar_int(n: ScalarInt) -> Option { + ImageFormat::from_u32(u32::from_scalar_int(n)?) 
+ } + } + + fn const_int_value<'tcx, P: FromScalarInt>( + cx: &CodegenCx<'tcx>, + const_: Const<'tcx>, + ) -> Result { + let ty::Value { + ty: const_ty, + valtree: const_val, + } = const_.to_value(); + assert!(const_ty.is_integral()); + const_val + .try_to_scalar() + .and_then(|scalar| scalar.try_to_scalar_int().ok()) + .and_then(P::from_scalar_int) + .ok_or_else(|| { + cx.tcx + .dcx() + .err(format!("invalid value for const generic: {const_}")) + }) + } + match intrinsic_type_attr { IntrinsicType::GenericImageType => { // see SpirvType::sizeof @@ -948,48 +991,6 @@ fn trans_intrinsic_type<'tcx>( // let image_format: spirv::ImageFormat = // type_from_variant_discriminant(cx, args.const_at(6)); - trait FromScalarInt: Sized { - fn from_scalar_int(n: ScalarInt) -> Option; - } - - impl FromScalarInt for u32 { - fn from_scalar_int(n: ScalarInt) -> Option { - Some(n.try_to_bits(Size::from_bits(32)).ok()?.try_into().unwrap()) - } - } - - impl FromScalarInt for Dim { - fn from_scalar_int(n: ScalarInt) -> Option { - Dim::from_u32(u32::from_scalar_int(n)?) - } - } - - impl FromScalarInt for ImageFormat { - fn from_scalar_int(n: ScalarInt) -> Option { - ImageFormat::from_u32(u32::from_scalar_int(n)?) 
- } - } - - fn const_int_value<'tcx, P: FromScalarInt>( - cx: &CodegenCx<'tcx>, - const_: Const<'tcx>, - ) -> Result { - let ty::Value { - ty: const_ty, - valtree: const_val, - } = const_.to_value(); - assert!(const_ty.is_integral()); - const_val - .try_to_scalar() - .and_then(|scalar| scalar.try_to_scalar_int().ok()) - .and_then(P::from_scalar_int) - .ok_or_else(|| { - cx.tcx - .dcx() - .err(format!("invalid value for Image const generic: {const_}")) - }) - } - let dim = const_int_value(cx, args.const_at(1))?; let depth = const_int_value(cx, args.const_at(2))?; let arrayed = const_int_value(cx, args.const_at(3))?; @@ -1019,6 +1020,37 @@ fn trans_intrinsic_type<'tcx>( Ok(SpirvType::AccelerationStructureKhr.def(span, cx)) } IntrinsicType::RayQueryKhr => Ok(SpirvType::RayQueryKhr.def(span, cx)), + IntrinsicType::CooperativeMatrixKhr => { + if ty.size != Size::from_bytes(4) { + return Err(cx.tcx.dcx().err("cooperative_matrix type must have size 4")); + } + + // Generic arg 0: component type T + let component_type = cx.layout_of(args.type_at(0)).spirv_type(span, cx); + // Const generic 1: USE (MatrixA=0, MatrixB=1, MatrixAccumulator=2) + let usage = cx + .const_u32(const_int_value(cx, args.const_at(1))?) + .def_cx(cx); + // Const generic 2: ROWS + let rows = cx + .const_u32(const_int_value(cx, args.const_at(2))?) + .def_cx(cx); + // Const generic 3: COLS + let columns = cx + .const_u32(const_int_value(cx, args.const_at(3))?) 
+ .def_cx(cx); + // Scope: Subgroup = 3 + let scope = cx.const_u32(3).def_cx(cx); + + Ok(SpirvType::CooperativeMatrixKhr { + component_type, + usage, + rows, + columns, + scope, + } + .def(span, cx)) + } IntrinsicType::SampledImage => { // see SpirvType::sizeof if ty.size != Size::from_bytes(4) { diff --git a/crates/rustc_codegen_spirv/src/attr.rs b/crates/rustc_codegen_spirv/src/attr.rs index a8ff7df0a1..ae1651120b 100644 --- a/crates/rustc_codegen_spirv/src/attr.rs +++ b/crates/rustc_codegen_spirv/src/attr.rs @@ -69,6 +69,7 @@ pub enum IntrinsicType { TypedBuffer, Matrix, Vector, + CooperativeMatrixKhr, } #[derive(Copy, Clone, Debug, PartialEq, Eq)] diff --git a/crates/rustc_codegen_spirv/src/builder/builder_methods.rs b/crates/rustc_codegen_spirv/src/builder/builder_methods.rs index 16fa2bca7a..4b76ca050f 100644 --- a/crates/rustc_codegen_spirv/src/builder/builder_methods.rs +++ b/crates/rustc_codegen_spirv/src/builder/builder_methods.rs @@ -454,6 +454,9 @@ impl<'a, 'tcx> Builder<'a, 'tcx> { self.fatal("cannot memset acceleration structure") } SpirvType::RayQueryKhr => self.fatal("cannot memset ray query"), + SpirvType::CooperativeMatrixKhr { .. } => { + self.fatal("cannot memset cooperative matrix") + } } } @@ -511,6 +514,9 @@ impl<'a, 'tcx> Builder<'a, 'tcx> { self.fatal("cannot memset acceleration structure") } SpirvType::RayQueryKhr => self.fatal("cannot memset ray query"), + SpirvType::CooperativeMatrixKhr { .. } => { + self.fatal("cannot memset cooperative matrix") + } } } diff --git a/crates/rustc_codegen_spirv/src/codegen_cx/constant.rs b/crates/rustc_codegen_spirv/src/codegen_cx/constant.rs index 5597d38e3e..2932f3e423 100644 --- a/crates/rustc_codegen_spirv/src/codegen_cx/constant.rs +++ b/crates/rustc_codegen_spirv/src/codegen_cx/constant.rs @@ -623,7 +623,8 @@ impl<'tcx> CodegenCx<'tcx> { | SpirvType::SampledImage { .. } | SpirvType::InterfaceBlock { .. 
} | SpirvType::AccelerationStructureKhr - | SpirvType::RayQueryKhr => { + | SpirvType::RayQueryKhr + | SpirvType::CooperativeMatrixKhr { .. } => { let result = self.undef(ty); self.zombie_no_span( result.def_cx(self), diff --git a/crates/rustc_codegen_spirv/src/codegen_cx/type_.rs b/crates/rustc_codegen_spirv/src/codegen_cx/type_.rs index 94c64625ce..c2ef34ee28 100644 --- a/crates/rustc_codegen_spirv/src/codegen_cx/type_.rs +++ b/crates/rustc_codegen_spirv/src/codegen_cx/type_.rs @@ -209,6 +209,7 @@ impl BaseTypeCodegenMethods for CodegenCx<'_> { | SpirvType::SampledImage { .. } | SpirvType::AccelerationStructureKhr | SpirvType::RayQueryKhr + | SpirvType::CooperativeMatrixKhr { .. } => TypeKind::Token, } } diff --git a/crates/rustc_codegen_spirv/src/spirv_type.rs b/crates/rustc_codegen_spirv/src/spirv_type.rs index e5ab302dc1..feedf0778f 100644 --- a/crates/rustc_codegen_spirv/src/spirv_type.rs +++ b/crates/rustc_codegen_spirv/src/spirv_type.rs @@ -90,6 +90,13 @@ pub enum SpirvType<'tcx> { AccelerationStructureKhr, RayQueryKhr, + CooperativeMatrixKhr { + component_type: Word, + usage: Word, + rows: Word, + columns: Word, + scope: Word, + }, } impl SpirvType<'_> { @@ -193,6 +200,20 @@ impl SpirvType<'_> { cx.emit_global().type_acceleration_structure_khr_id(id) } Self::RayQueryKhr => cx.emit_global().type_ray_query_khr_id(id), + Self::CooperativeMatrixKhr { + component_type, + scope, + rows, + columns, + usage: use_, + } => cx.emit_global().type_cooperative_matrix_khr_id( + id, + component_type, + scope, + rows, + columns, + use_, + ), Self::SampledImage { image_type } => { cx.emit_global().type_sampled_image_id(id, image_type) } @@ -339,6 +360,7 @@ impl SpirvType<'_> { Self::Image { .. } | Self::AccelerationStructureKhr | Self::RayQueryKhr + | Self::CooperativeMatrixKhr { .. } | Self::Sampler | Self::SampledImage { .. } | Self::InterfaceBlock { .. } => Size::from_bytes(4), @@ -361,6 +383,7 @@ impl SpirvType<'_> { Self::Image { .. 
} | Self::AccelerationStructureKhr | Self::RayQueryKhr + | Self::CooperativeMatrixKhr { .. } | Self::Sampler | Self::SampledImage { .. } | Self::InterfaceBlock { .. } => Align::from_bytes(4).unwrap(), @@ -389,7 +412,10 @@ impl SpirvType<'_> { Self::InterfaceBlock { .. } | Self::RayQueryKhr | Self::SampledImage { .. } => None, // Descriptor types - Self::Image { .. } | Self::AccelerationStructureKhr | Self::Sampler => None, + Self::Image { .. } + | Self::AccelerationStructureKhr + | Self::CooperativeMatrixKhr { .. } + | Self::Sampler => None, // Primitive types ty => ty.sizeof(cx), @@ -455,6 +481,19 @@ impl SpirvType<'_> { SpirvType::InterfaceBlock { inner_type } => SpirvType::InterfaceBlock { inner_type }, SpirvType::AccelerationStructureKhr => SpirvType::AccelerationStructureKhr, SpirvType::RayQueryKhr => SpirvType::RayQueryKhr, + SpirvType::CooperativeMatrixKhr { + component_type, + scope, + rows, + columns, + usage: use_, + } => SpirvType::CooperativeMatrixKhr { + component_type, + scope, + rows, + columns, + usage: use_, + }, // Only these variants have any slices to arena-allocate. 
SpirvType::Adt { @@ -644,6 +683,20 @@ impl fmt::Debug for SpirvTypePrinter<'_, '_> { .finish(), SpirvType::AccelerationStructureKhr => f.debug_struct("AccelerationStructure").finish(), SpirvType::RayQueryKhr => f.debug_struct("RayQuery").finish(), + SpirvType::CooperativeMatrixKhr { + component_type, + scope, + rows, + columns, + usage: use_, + } => f + .debug_struct("CooperativeMatrix") + .field("component_type", &self.cx.debug_type(component_type)) + .field("scope", &self.cx.debug_type(scope)) + .field("rows", &self.cx.debug_type(rows)) + .field("columns", &self.cx.debug_type(columns)) + .field("use_", &self.cx.debug_type(use_)) + .finish(), }; { let mut debug_stack = DEBUG_STACK.lock().unwrap(); @@ -797,6 +850,7 @@ impl SpirvTypePrinter<'_, '_> { } SpirvType::AccelerationStructureKhr => f.write_str("AccelerationStructureKhr"), SpirvType::RayQueryKhr => f.write_str("RayQuery"), + SpirvType::CooperativeMatrixKhr { .. } => f.write_str("CooperativeMatrixKhr"), } } } diff --git a/crates/rustc_codegen_spirv/src/spirv_type_constraints.rs b/crates/rustc_codegen_spirv/src/spirv_type_constraints.rs index acefb8d3df..12972aeff0 100644 --- a/crates/rustc_codegen_spirv/src/spirv_type_constraints.rs +++ b/crates/rustc_codegen_spirv/src/spirv_type_constraints.rs @@ -1086,10 +1086,10 @@ pub fn instruction_signatures(op: Op) -> Option<&'static [InstSig<'static>]> { } // SPV_KHR_cooperative_matrix Op::TypeCooperativeMatrixKHR - | Op::CooperativeMatrixLoadKHR + | Op::CooperativeMatrixLengthKHR | Op::CooperativeMatrixStoreKHR - | Op::CooperativeMatrixMulAddKHR - | Op::CooperativeMatrixLengthKHR => reserved!(SPV_KHR_cooperative_matrix), + | Op::CooperativeMatrixLoadKHR => {} + Op::CooperativeMatrixMulAddKHR => sig! 
{ (_, _, T) -> T }, // SPV_QCOM_image_processing Op::ImageSampleWeightedQCOM | Op::ImageBoxFilterQCOM diff --git a/crates/rustc_codegen_spirv/src/symbols.rs b/crates/rustc_codegen_spirv/src/symbols.rs index 371505feac..bae57f976c 100644 --- a/crates/rustc_codegen_spirv/src/symbols.rs +++ b/crates/rustc_codegen_spirv/src/symbols.rs @@ -356,6 +356,10 @@ impl Symbols { "ray_query", SpirvAttribute::IntrinsicType(IntrinsicType::RayQueryKhr), ), + ( + "cooperative_matrix", + SpirvAttribute::IntrinsicType(IntrinsicType::CooperativeMatrixKhr), + ), ("block", SpirvAttribute::Block), ("flat", SpirvAttribute::Flat), ("invariant", SpirvAttribute::Invariant), diff --git a/crates/spirv-std/src/cooperative_matrix.rs b/crates/spirv-std/src/cooperative_matrix.rs new file mode 100644 index 0000000000..ee574cf006 --- /dev/null +++ b/crates/spirv-std/src/cooperative_matrix.rs @@ -0,0 +1,209 @@ +//! Cooperative matrix types and operations (`SPV_KHR_cooperative_matrix`). +//! +//! Requires the `CooperativeMatrixKHR` capability and `SPV_KHR_cooperative_matrix` extension: +//! ```text +//! -C target-feature=+CooperativeMatrixKHR,+ext:SPV_KHR_cooperative_matrix +//! ``` +//! +//! See the [SPV_KHR_cooperative_matrix specification] for full details. +//! +//! [SPV_KHR_cooperative_matrix specification]: https://github.khronos.org/SPIRV-Registry/extensions/KHR/SPV_KHR_cooperative_matrix.html +#[cfg(target_arch = "spirv")] +use core::arch::asm; +use core::marker::PhantomData; +use core::mem::MaybeUninit; + +/// Matrix role in a cooperative multiply-accumulate operation (`D = A × B + C`). +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[repr(u32)] +pub enum MatrixUse { + /// Input operand A. + MatrixA = 0, + /// Input operand B. + MatrixB = 1, + /// Accumulator / result. + MatrixAccumulator = 2, +} + +/// Memory layout for cooperative matrix load/store operations. +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[repr(u32)] +pub enum MatrixLayout { + /// Rows are stored contiguously. 
+ RowMajor = 0, + /// Columns are stored contiguously. + ColumnMajor = 1, +} + +/// Matrix role: input operand A in D = A × B + C. +pub const MATRIX_A: u32 = MatrixUse::MatrixA as u32; +/// Matrix role: input operand B in D = A × B + C. +pub const MATRIX_B: u32 = MatrixUse::MatrixB as u32; +/// Matrix role: accumulator / result in D = A × B + C. +pub const MATRIX_ACCUMULATOR: u32 = MatrixUse::MatrixAccumulator as u32; + +/// Memory layout: rows are stored contiguously. +pub const ROW_MAJOR: MatrixLayout = MatrixLayout::RowMajor; +/// Memory layout: columns are stored contiguously. +pub const COLUMN_MAJOR: MatrixLayout = MatrixLayout::ColumnMajor; + +/// A cooperative matrix distributed across the subgroup. +/// +/// Each invocation holds a fragment of the full `ROWS × COLS` matrix. +/// The hardware maps elements to invocations automatically. +/// +/// # Type parameters +/// - `T`: element type (`f32`, `f64`, `i32`, `u32`, `i8`, `u8`, etc.) +/// - `USE`: matrix role — one of [`MatrixUse::MatrixA`], [`MatrixUse::MatrixB`], [`MatrixUse::MatrixAccumulator`] cast to `u32` +/// - `ROWS`: number of rows +/// - `COLS`: number of columns +/// +/// # Capability +/// Requires `CooperativeMatrixKHR` + `SPV_KHR_cooperative_matrix`. +#[spirv(cooperative_matrix)] +#[derive(Copy, Clone)] +#[repr(C)] +pub struct CooperativeMatrix { + // HACK: keeps the Rust layout non-ZST so #[spirv(cooperative_matrix)] can + // special-case it before it gets elided. + _anti_zst_padding: MaybeUninit, + _phantom: PhantomData, +} + +impl CooperativeMatrix { + /// Load a cooperative matrix through a pointer. + /// + /// `slice` must point into an array. `layout` specifies whether the matrix + /// is stored in row-major ([`MatrixLayout::RowMajor`]) or column-major + /// ([`MatrixLayout::ColumnMajor`]) order. `stride` is the number of elements + /// between the start of consecutive rows (row-major) or columns (column-major). + /// + /// The scope is always `Subgroup`. 
+ /// + /// # Safety + /// - `slice` must point into an array and be valid for all element accesses + /// implied by the matrix dimensions, layout, and stride. + /// - All operands must be dynamically uniform within every instance of the + /// subgroup scope. + #[spirv_std_macros::gpu_only] + #[doc(alias = "OpCooperativeMatrixLoadKHR")] + #[inline] + pub unsafe fn load(slice: &[T], layout: MatrixLayout, stride: u32) -> Self { + unsafe { + let mut result = MaybeUninit::::uninit(); + let layout_u32 = layout as u32; + let ptr = slice.as_ptr(); + asm!( + "%u32 = OpTypeInt 32 0", + "%layout = OpLoad %u32 {layout}", + "%stride = OpLoad %u32 {stride}", + // Use typeof* to get the cooperative matrix type from the output pointer. + "%result = OpCooperativeMatrixLoadKHR typeof* {out} {ptr} %layout %stride", + "OpStore {out} %result", + ptr = in(reg) ptr, + layout = in(reg) &layout_u32, + stride = in(reg) &stride, + out = in(reg) result.as_mut_ptr(), + ); + result.assume_init() + } + } + + /// Store a cooperative matrix through a pointer. + /// + /// `slice` must point into an array. `layout` specifies whether the matrix + /// is stored in row-major ([`MatrixLayout::RowMajor`]) or column-major + /// ([`MatrixLayout::ColumnMajor`]) order. `stride` is the number of elements + /// between the start of consecutive rows (row-major) or columns (column-major). + /// + /// The scope is always `Subgroup`. + /// + /// # Safety + /// - `slice` must point into an array and be valid for all element accesses + /// implied by the matrix dimensions, layout, and stride. + /// - All operands must be dynamically uniform within every instance of the + /// subgroup scope. 
+ #[spirv_std_macros::gpu_only] + #[doc(alias = "OpCooperativeMatrixStoreKHR")] + #[inline] + pub unsafe fn store(self, slice: &mut [T], layout: MatrixLayout, stride: u32) { + unsafe { + let layout_u32 = layout as u32; + let ptr = slice.as_mut_ptr(); + asm!( + "%u32 = OpTypeInt 32 0", + "%layout = OpLoad %u32 {layout}", + "%stride = OpLoad %u32 {stride}", + "%matrix = OpLoad _ {matrix}", + "OpCooperativeMatrixStoreKHR {ptr} %matrix %layout %stride", + ptr = in(reg) ptr, + matrix = in(reg) &self, + layout = in(reg) &layout_u32, + stride = in(reg) &stride, + ); + } + } + + /// Returns the number of matrix components this invocation is responsible for. + /// + /// The mapping of components to invocations is implementation-dependent, so the per-invocation lengths are not guaranteed to sum to `ROWS * COLS`. + #[spirv_std_macros::gpu_only] + #[doc(alias = "OpCooperativeMatrixLengthKHR")] + #[inline] + pub fn length(&self) -> u32 { + unsafe { + let mut result: u32 = 0; + asm!( + "%u32 = OpTypeInt 32 0", + // typeof* {self_ptr} resolves to the CooperativeMatrix type (pointee of &self). + "%coop_ty = typeof* {self_ptr}", + "%result = OpCooperativeMatrixLengthKHR %u32 %coop_ty", + "OpStore {out} %result", + self_ptr = in(reg) self, + out = in(reg) &mut result, + ); + result + } + } +} + +/// Linear-algebraic matrix multiply of `A` by `B` and then component-wise add `C`. +/// +/// The order of operations is implementation-dependent. All matrices must have the +/// same scope, which is always subgroup here. +/// +/// - `A`: `M × K` matrix with use [`MatrixUse::MatrixA`] +/// - `B`: `K × N` matrix with use [`MatrixUse::MatrixB`] +/// - `C`: `M × N` matrix with use [`MatrixUse::MatrixAccumulator`] +/// - returns `D`: `M × N` accumulator equal to `A × B + C` +/// +/// All operands must be dynamically uniform within every instance of the subgroup scope. +/// +/// # Capability +/// Requires `CooperativeMatrixKHR` + `SPV_KHR_cooperative_matrix`. 
+#[spirv_std_macros::gpu_only] +#[doc(alias = "OpCooperativeMatrixMulAddKHR")] +#[inline] +pub fn mul_add( + a: CooperativeMatrix, + b: CooperativeMatrix, + c: CooperativeMatrix, +) -> CooperativeMatrix { + unsafe { + let mut result = MaybeUninit::< + CooperativeMatrix, + >::uninit(); + asm!( + "%a = OpLoad _ {a}", + "%b = OpLoad _ {b}", + "%c = OpLoad _ {c}", + "%result = OpCooperativeMatrixMulAddKHR _ %a %b %c", + "OpStore {out} %result", + a = in(reg) &a, + b = in(reg) &b, + c = in(reg) &c, + out = in(reg) result.as_mut_ptr(), + ); + result.assume_init() + } +} diff --git a/crates/spirv-std/src/lib.rs b/crates/spirv-std/src/lib.rs index 54d0151f15..288432b291 100644 --- a/crates/spirv-std/src/lib.rs +++ b/crates/spirv-std/src/lib.rs @@ -92,6 +92,7 @@ pub use macros::{debug_printf, debug_printfln}; pub mod arch; pub mod byte_addressable_buffer; +pub mod cooperative_matrix; pub mod debug_printf; pub mod float; pub mod image; diff --git a/tests/compiletests/ui/arch/cooperative_matrix.rs b/tests/compiletests/ui/arch/cooperative_matrix.rs new file mode 100644 index 0000000000..9574d9de05 --- /dev/null +++ b/tests/compiletests/ui/arch/cooperative_matrix.rs @@ -0,0 +1,27 @@ +// build-pass +// only-vulkan1.2 +// compile-flags: -C target-feature=+CooperativeMatrixKHR,+ext:SPV_KHR_cooperative_matrix + +use spirv_std::cooperative_matrix::{ + COLUMN_MAJOR, CooperativeMatrix, MATRIX_A, MATRIX_ACCUMULATOR, MATRIX_B, ROW_MAJOR, mul_add, +}; +use spirv_std::spirv; + +type MatA = CooperativeMatrix; +type MatB = CooperativeMatrix; +type MatAcc = CooperativeMatrix; + +#[spirv(compute(threads(32)))] +pub fn main( + #[spirv(storage_buffer, descriptor_set = 0, binding = 0)] a: &[f32], + #[spirv(storage_buffer, descriptor_set = 0, binding = 1)] b: &[f32], + #[spirv(storage_buffer, descriptor_set = 0, binding = 2)] c: &mut [f32], +) { + let mat_a = unsafe { MatA::load(a, ROW_MAJOR, 16) }; + let mat_b = unsafe { MatB::load(b, COLUMN_MAJOR, 16) }; + let mat_c = unsafe { MatAcc::load(c, 
ROW_MAJOR, 16) }; + + let result = mul_add(mat_a, mat_b, mat_c); + + unsafe { result.store(c, ROW_MAJOR, 16) }; +}