diff --git a/crates/rustc_codegen_spirv/src/abi.rs b/crates/rustc_codegen_spirv/src/abi.rs
index f1453a8c5f..da589937f4 100644
--- a/crates/rustc_codegen_spirv/src/abi.rs
+++ b/crates/rustc_codegen_spirv/src/abi.rs
@@ -3,6 +3,7 @@
 
 use crate::attr::{AggregatedSpirvAttributes, IntrinsicType};
 use crate::codegen_cx::CodegenCx;
+use crate::maybe_pqp_cg_ssa::traits::ConstCodegenMethods as _;
 use crate::spirv_type::SpirvType;
 use itertools::Itertools;
 use rspirv::spirv::{Dim, ImageFormat, StorageClass, Word};
@@ -885,6 +886,48 @@ fn trans_intrinsic_type<'tcx>(
     args: GenericArgsRef<'tcx>,
     intrinsic_type_attr: IntrinsicType,
 ) -> Result<Word, ErrorGuaranteed> {
+    trait FromScalarInt: Sized {
+        fn from_scalar_int(n: ScalarInt) -> Option<Self>;
+    }
+
+    impl FromScalarInt for u32 {
+        fn from_scalar_int(n: ScalarInt) -> Option<Self> {
+            Some(n.try_to_bits(Size::from_bits(32)).ok()?.try_into().unwrap())
+        }
+    }
+
+    impl FromScalarInt for Dim {
+        fn from_scalar_int(n: ScalarInt) -> Option<Self> {
+            Dim::from_u32(u32::from_scalar_int(n)?)
+        }
+    }
+
+    impl FromScalarInt for ImageFormat {
+        fn from_scalar_int(n: ScalarInt) -> Option<Self> {
+            ImageFormat::from_u32(u32::from_scalar_int(n)?)
+        }
+    }
+
+    fn const_int_value<'tcx, P: FromScalarInt>(
+        cx: &CodegenCx<'tcx>,
+        const_: Const<'tcx>,
+    ) -> Result<P, ErrorGuaranteed> {
+        let ty::Value {
+            ty: const_ty,
+            valtree: const_val,
+        } = const_.to_value();
+        assert!(const_ty.is_integral());
+        const_val
+            .try_to_scalar()
+            .and_then(|scalar| scalar.try_to_scalar_int().ok())
+            .and_then(P::from_scalar_int)
+            .ok_or_else(|| {
+                cx.tcx
+                    .dcx()
+                    .err(format!("invalid value for const generic: {const_}"))
+            })
+    }
+
     match intrinsic_type_attr {
         IntrinsicType::GenericImageType => {
             // see SpirvType::sizeof
@@ -948,48 +991,6 @@ fn trans_intrinsic_type<'tcx>(
             // let image_format: spirv::ImageFormat =
             //     type_from_variant_discriminant(cx, args.const_at(6));
 
-            trait FromScalarInt: Sized {
-                fn from_scalar_int(n: ScalarInt) -> Option<Self>;
-            }
-
-            impl FromScalarInt for u32 {
-                fn from_scalar_int(n: ScalarInt) -> Option<Self> {
-                    Some(n.try_to_bits(Size::from_bits(32)).ok()?.try_into().unwrap())
-                }
-            }
-
-            impl FromScalarInt for Dim {
-                fn from_scalar_int(n: ScalarInt) -> Option<Self> {
-                    Dim::from_u32(u32::from_scalar_int(n)?)
-                }
-            }
-
-            impl FromScalarInt for ImageFormat {
-                fn from_scalar_int(n: ScalarInt) -> Option<Self> {
-                    ImageFormat::from_u32(u32::from_scalar_int(n)?)
-                }
-            }
-
-            fn const_int_value<'tcx, P: FromScalarInt>(
-                cx: &CodegenCx<'tcx>,
-                const_: Const<'tcx>,
-            ) -> Result<P, ErrorGuaranteed> {
-                let ty::Value {
-                    ty: const_ty,
-                    valtree: const_val,
-                } = const_.to_value();
-                assert!(const_ty.is_integral());
-                const_val
-                    .try_to_scalar()
-                    .and_then(|scalar| scalar.try_to_scalar_int().ok())
-                    .and_then(P::from_scalar_int)
-                    .ok_or_else(|| {
-                        cx.tcx
-                            .dcx()
-                            .err(format!("invalid value for Image const generic: {const_}"))
-                    })
-            }
-
             let dim = const_int_value(cx, args.const_at(1))?;
             let depth = const_int_value(cx, args.const_at(2))?;
             let arrayed = const_int_value(cx, args.const_at(3))?;
@@ -1019,6 +1020,37 @@ fn trans_intrinsic_type<'tcx>(
             Ok(SpirvType::AccelerationStructureKhr.def(span, cx))
         }
         IntrinsicType::RayQueryKhr => Ok(SpirvType::RayQueryKhr.def(span, cx)),
+        IntrinsicType::CooperativeMatrixKhr => {
+            if ty.size != Size::from_bytes(4) {
+                return Err(cx.tcx.dcx().err("cooperative_matrix type must have size 4"));
+            }
+
+            // Generic arg 0: component type T
+            let component_type = cx.layout_of(args.type_at(0)).spirv_type(span, cx);
+            // Const generic 1: USE (MatrixA=0, MatrixB=1, MatrixAccumulator=2)
+            let usage = cx
+                .const_u32(const_int_value(cx, args.const_at(1))?)
+                .def_cx(cx);
+            // Const generic 2: ROWS
+            let rows = cx
+                .const_u32(const_int_value(cx, args.const_at(2))?)
+                .def_cx(cx);
+            // Const generic 3: COLS
+            let columns = cx
+                .const_u32(const_int_value(cx, args.const_at(3))?)
+                .def_cx(cx);
+            // Scope: Subgroup = 3
+            let scope = cx.const_u32(3).def_cx(cx);
+
+            Ok(SpirvType::CooperativeMatrixKhr {
+                component_type,
+                usage,
+                rows,
+                columns,
+                scope,
+            }
+            .def(span, cx))
+        }
         IntrinsicType::SampledImage => {
             // see SpirvType::sizeof
             if ty.size != Size::from_bytes(4) {
diff --git a/crates/rustc_codegen_spirv/src/attr.rs b/crates/rustc_codegen_spirv/src/attr.rs
index a8ff7df0a1..ae1651120b 100644
--- a/crates/rustc_codegen_spirv/src/attr.rs
+++ b/crates/rustc_codegen_spirv/src/attr.rs
@@ -69,6 +69,7 @@ pub enum IntrinsicType {
     TypedBuffer,
     Matrix,
     Vector,
+    CooperativeMatrixKhr,
 }
 
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
diff --git a/crates/rustc_codegen_spirv/src/builder/builder_methods.rs b/crates/rustc_codegen_spirv/src/builder/builder_methods.rs
index 16fa2bca7a..4b76ca050f 100644
--- a/crates/rustc_codegen_spirv/src/builder/builder_methods.rs
+++ b/crates/rustc_codegen_spirv/src/builder/builder_methods.rs
@@ -454,6 +454,9 @@ impl<'a, 'tcx> Builder<'a, 'tcx> {
                 self.fatal("cannot memset acceleration structure")
             }
             SpirvType::RayQueryKhr => self.fatal("cannot memset ray query"),
+            SpirvType::CooperativeMatrixKhr { .. } => {
+                self.fatal("cannot memset cooperative matrix")
+            }
         }
     }
 
@@ -511,6 +514,9 @@ impl<'a, 'tcx> Builder<'a, 'tcx> {
                 self.fatal("cannot memset acceleration structure")
             }
             SpirvType::RayQueryKhr => self.fatal("cannot memset ray query"),
+            SpirvType::CooperativeMatrixKhr { .. } => {
+                self.fatal("cannot memset cooperative matrix")
+            }
         }
     }
 
diff --git a/crates/rustc_codegen_spirv/src/codegen_cx/constant.rs b/crates/rustc_codegen_spirv/src/codegen_cx/constant.rs
index 5597d38e3e..2932f3e423 100644
--- a/crates/rustc_codegen_spirv/src/codegen_cx/constant.rs
+++ b/crates/rustc_codegen_spirv/src/codegen_cx/constant.rs
@@ -623,7 +623,8 @@ impl<'tcx> CodegenCx<'tcx> {
             | SpirvType::SampledImage { .. }
             | SpirvType::InterfaceBlock { .. }
             | SpirvType::AccelerationStructureKhr
-            | SpirvType::RayQueryKhr => {
+            | SpirvType::RayQueryKhr
+            | SpirvType::CooperativeMatrixKhr { .. } => {
                 let result = self.undef(ty);
                 self.zombie_no_span(
                     result.def_cx(self),
diff --git a/crates/rustc_codegen_spirv/src/codegen_cx/type_.rs b/crates/rustc_codegen_spirv/src/codegen_cx/type_.rs
index 94c64625ce..c2ef34ee28 100644
--- a/crates/rustc_codegen_spirv/src/codegen_cx/type_.rs
+++ b/crates/rustc_codegen_spirv/src/codegen_cx/type_.rs
@@ -209,6 +209,7 @@ impl BaseTypeCodegenMethods for CodegenCx<'_> {
             | SpirvType::SampledImage { .. }
             | SpirvType::AccelerationStructureKhr
             | SpirvType::RayQueryKhr
+            | SpirvType::CooperativeMatrixKhr { .. }
                 => TypeKind::Token,
         }
     }
diff --git a/crates/rustc_codegen_spirv/src/spirv_type.rs b/crates/rustc_codegen_spirv/src/spirv_type.rs
index e5ab302dc1..feedf0778f 100644
--- a/crates/rustc_codegen_spirv/src/spirv_type.rs
+++ b/crates/rustc_codegen_spirv/src/spirv_type.rs
@@ -90,6 +90,13 @@ pub enum SpirvType<'tcx> {
 
     AccelerationStructureKhr,
     RayQueryKhr,
+    CooperativeMatrixKhr {
+        component_type: Word,
+        usage: Word,
+        rows: Word,
+        columns: Word,
+        scope: Word,
+    },
 }
 
 impl SpirvType<'_> {
@@ -193,6 +200,20 @@ impl SpirvType<'_> {
                 cx.emit_global().type_acceleration_structure_khr_id(id)
             }
             Self::RayQueryKhr => cx.emit_global().type_ray_query_khr_id(id),
+            Self::CooperativeMatrixKhr {
+                component_type,
+                scope,
+                rows,
+                columns,
+                usage: use_,
+            } => cx.emit_global().type_cooperative_matrix_khr_id(
+                id,
+                component_type,
+                scope,
+                rows,
+                columns,
+                use_,
+            ),
             Self::SampledImage { image_type } => {
                 cx.emit_global().type_sampled_image_id(id, image_type)
             }
@@ -339,6 +360,7 @@ impl SpirvType<'_> {
             Self::Image { .. }
             | Self::AccelerationStructureKhr
             | Self::RayQueryKhr
+            | Self::CooperativeMatrixKhr { .. }
             | Self::Sampler
             | Self::SampledImage { .. }
             | Self::InterfaceBlock { .. } => Size::from_bytes(4),
@@ -361,6 +383,7 @@ impl SpirvType<'_> {
             Self::Image { .. }
             | Self::AccelerationStructureKhr
             | Self::RayQueryKhr
+            | Self::CooperativeMatrixKhr { .. }
             | Self::Sampler
             | Self::SampledImage { .. }
             | Self::InterfaceBlock { .. } => Align::from_bytes(4).unwrap(),
@@ -389,7 +412,10 @@ impl SpirvType<'_> {
             Self::InterfaceBlock { .. } | Self::RayQueryKhr | Self::SampledImage { .. } => None,
 
             // Descriptor types
-            Self::Image { .. } | Self::AccelerationStructureKhr | Self::Sampler => None,
+            Self::Image { .. }
+            | Self::AccelerationStructureKhr
+            | Self::CooperativeMatrixKhr { .. }
+            | Self::Sampler => None,
 
             // Primitive types
             ty => ty.sizeof(cx),
@@ -455,6 +481,19 @@ impl SpirvType<'_> {
             SpirvType::InterfaceBlock { inner_type } => SpirvType::InterfaceBlock { inner_type },
             SpirvType::AccelerationStructureKhr => SpirvType::AccelerationStructureKhr,
             SpirvType::RayQueryKhr => SpirvType::RayQueryKhr,
+            SpirvType::CooperativeMatrixKhr {
+                component_type,
+                scope,
+                rows,
+                columns,
+                usage: use_,
+            } => SpirvType::CooperativeMatrixKhr {
+                component_type,
+                scope,
+                rows,
+                columns,
+                usage: use_,
+            },
 
             // Only these variants have any slices to arena-allocate.
             SpirvType::Adt {
@@ -644,6 +683,20 @@ impl fmt::Debug for SpirvTypePrinter<'_, '_> {
                 .finish(),
             SpirvType::AccelerationStructureKhr => f.debug_struct("AccelerationStructure").finish(),
             SpirvType::RayQueryKhr => f.debug_struct("RayQuery").finish(),
+            SpirvType::CooperativeMatrixKhr {
+                component_type,
+                scope,
+                rows,
+                columns,
+                usage,
+            } => f
+                .debug_struct("CooperativeMatrix")
+                .field("component_type", &self.cx.debug_type(component_type))
+                .field("scope", &scope) // constant ids below, not type ids: print raw
+                .field("rows", &rows)
+                .field("columns", &columns)
+                .field("usage", &usage)
+                .finish(),
         };
         {
             let mut debug_stack = DEBUG_STACK.lock().unwrap();
@@ -797,6 +850,7 @@ impl SpirvTypePrinter<'_, '_> {
             }
             SpirvType::AccelerationStructureKhr => f.write_str("AccelerationStructureKhr"),
             SpirvType::RayQueryKhr => f.write_str("RayQuery"),
+            SpirvType::CooperativeMatrixKhr { .. } => f.write_str("CooperativeMatrixKhr"),
         }
     }
 }
diff --git a/crates/rustc_codegen_spirv/src/spirv_type_constraints.rs b/crates/rustc_codegen_spirv/src/spirv_type_constraints.rs
index acefb8d3df..12972aeff0 100644
--- a/crates/rustc_codegen_spirv/src/spirv_type_constraints.rs
+++ b/crates/rustc_codegen_spirv/src/spirv_type_constraints.rs
@@ -1086,10 +1086,10 @@ pub fn instruction_signatures(op: Op) -> Option<&'static [InstSig<'static>]> {
         }
         // SPV_KHR_cooperative_matrix
         Op::TypeCooperativeMatrixKHR
-        | Op::CooperativeMatrixLoadKHR
+        | Op::CooperativeMatrixLengthKHR
         | Op::CooperativeMatrixStoreKHR
-        | Op::CooperativeMatrixMulAddKHR
-        | Op::CooperativeMatrixLengthKHR => reserved!(SPV_KHR_cooperative_matrix),
+        | Op::CooperativeMatrixLoadKHR => {}
+        Op::CooperativeMatrixMulAddKHR => sig! { (_, _, T) -> T },
         // SPV_QCOM_image_processing
         Op::ImageSampleWeightedQCOM
         | Op::ImageBoxFilterQCOM
diff --git a/crates/rustc_codegen_spirv/src/symbols.rs b/crates/rustc_codegen_spirv/src/symbols.rs
index 371505feac..bae57f976c 100644
--- a/crates/rustc_codegen_spirv/src/symbols.rs
+++ b/crates/rustc_codegen_spirv/src/symbols.rs
@@ -356,6 +356,10 @@ impl Symbols {
                 "ray_query",
                 SpirvAttribute::IntrinsicType(IntrinsicType::RayQueryKhr),
             ),
+            (
+                "cooperative_matrix",
+                SpirvAttribute::IntrinsicType(IntrinsicType::CooperativeMatrixKhr),
+            ),
             ("block", SpirvAttribute::Block),
             ("flat", SpirvAttribute::Flat),
             ("invariant", SpirvAttribute::Invariant),
diff --git a/crates/spirv-std/src/cooperative_matrix.rs b/crates/spirv-std/src/cooperative_matrix.rs
new file mode 100644
index 0000000000..ee574cf006
--- /dev/null
+++ b/crates/spirv-std/src/cooperative_matrix.rs
@@ -0,0 +1,209 @@
+//! Cooperative matrix types and operations (`SPV_KHR_cooperative_matrix`).
+//!
+//! Requires the `CooperativeMatrixKHR` capability and `SPV_KHR_cooperative_matrix` extension:
+//! ```text
+//! -C target-feature=+CooperativeMatrixKHR,+ext:SPV_KHR_cooperative_matrix
+//! ```
+//!
+//! See the [SPV_KHR_cooperative_matrix specification] for full details.
+//!
+//! [SPV_KHR_cooperative_matrix specification]: https://github.khronos.org/SPIRV-Registry/extensions/KHR/SPV_KHR_cooperative_matrix.html
+#[cfg(target_arch = "spirv")]
+use core::arch::asm;
+use core::marker::PhantomData;
+use core::mem::MaybeUninit;
+
+/// Matrix role in a cooperative multiply-accumulate operation (`D = A × B + C`).
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+#[repr(u32)]
+pub enum MatrixUse {
+    /// Input operand A.
+    MatrixA = 0,
+    /// Input operand B.
+    MatrixB = 1,
+    /// Accumulator / result.
+    MatrixAccumulator = 2,
+}
+
+/// Memory layout for cooperative matrix load/store operations.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+#[repr(u32)]
+pub enum MatrixLayout {
+    /// Rows are stored contiguously.
+    RowMajor = 0,
+    /// Columns are stored contiguously.
+    ColumnMajor = 1,
+}
+
+/// Matrix role: input operand A in D = A × B + C.
+pub const MATRIX_A: u32 = MatrixUse::MatrixA as u32;
+/// Matrix role: input operand B in D = A × B + C.
+pub const MATRIX_B: u32 = MatrixUse::MatrixB as u32;
+/// Matrix role: accumulator / result in D = A × B + C.
+pub const MATRIX_ACCUMULATOR: u32 = MatrixUse::MatrixAccumulator as u32;
+
+/// Memory layout: rows are stored contiguously.
+pub const ROW_MAJOR: MatrixLayout = MatrixLayout::RowMajor;
+/// Memory layout: columns are stored contiguously.
+pub const COLUMN_MAJOR: MatrixLayout = MatrixLayout::ColumnMajor;
+
+/// A cooperative matrix distributed across the subgroup.
+///
+/// Each invocation holds a fragment of the full `ROWS × COLS` matrix.
+/// The hardware maps elements to invocations automatically.
+///
+/// # Type parameters
+/// - `T`: element type (`f32`, `f64`, `i32`, `u32`, `i8`, `u8`, etc.)
+/// - `USE`: matrix role — one of [`MatrixUse::MatrixA`], [`MatrixUse::MatrixB`], [`MatrixUse::MatrixAccumulator`] cast to `u32`
+/// - `ROWS`: number of rows
+/// - `COLS`: number of columns
+///
+/// # Capability
+/// Requires `CooperativeMatrixKHR` + `SPV_KHR_cooperative_matrix`.
+#[spirv(cooperative_matrix)]
+#[derive(Copy, Clone)]
+#[repr(C)]
+pub struct CooperativeMatrix<T, const USE: u32, const ROWS: u32, const COLS: u32> {
+    // HACK: keeps the Rust layout non-ZST so #[spirv(cooperative_matrix)] can
+    // special-case it before it gets elided.
+    _anti_zst_padding: MaybeUninit<u32>,
+    _phantom: PhantomData<T>,
+}
+
+impl<T, const USE: u32, const ROWS: u32, const COLS: u32> CooperativeMatrix<T, USE, ROWS, COLS> {
+    /// Load a cooperative matrix through a pointer.
+    ///
+    /// `slice` must point into an array. `layout` specifies whether the matrix
+    /// is stored in row-major ([`MatrixLayout::RowMajor`]) or column-major
+    /// ([`MatrixLayout::ColumnMajor`]) order. `stride` is the number of elements
+    /// between the start of consecutive rows (row-major) or columns (column-major).
+    ///
+    /// The scope is always `Subgroup`.
+    ///
+    /// # Safety
+    /// - `slice` must point into an array and be valid for all element accesses
+    ///   implied by the matrix dimensions, layout, and stride.
+    /// - All operands must be dynamically uniform within every instance of the
+    ///   subgroup scope.
+    #[spirv_std_macros::gpu_only]
+    #[doc(alias = "OpCooperativeMatrixLoadKHR")]
+    #[inline]
+    pub unsafe fn load(slice: &[T], layout: MatrixLayout, stride: u32) -> Self {
+        unsafe {
+            let mut result = MaybeUninit::<Self>::uninit();
+            let layout_u32 = layout as u32;
+            let ptr = slice.as_ptr();
+            asm!(
+                "%u32 = OpTypeInt 32 0",
+                "%layout = OpLoad %u32 {layout}",
+                "%stride = OpLoad %u32 {stride}",
+                // Use typeof* to get the cooperative matrix type from the output pointer.
+                "%result = OpCooperativeMatrixLoadKHR typeof* {out} {ptr} %layout %stride",
+                "OpStore {out} %result",
+                ptr    = in(reg) ptr,
+                layout = in(reg) &layout_u32,
+                stride = in(reg) &stride,
+                out    = in(reg) result.as_mut_ptr(),
+            );
+            result.assume_init()
+        }
+    }
+
+    /// Store a cooperative matrix through a pointer.
+    ///
+    /// `slice` must point into an array. `layout` specifies whether the matrix
+    /// is stored in row-major ([`MatrixLayout::RowMajor`]) or column-major
+    /// ([`MatrixLayout::ColumnMajor`]) order. `stride` is the number of elements
+    /// between the start of consecutive rows (row-major) or columns (column-major).
+    ///
+    /// The scope is always `Subgroup`.
+    ///
+    /// # Safety
+    /// - `slice` must point into an array and be valid for all element accesses
+    ///   implied by the matrix dimensions, layout, and stride.
+    /// - All operands must be dynamically uniform within every instance of the
+    ///   subgroup scope.
+    #[spirv_std_macros::gpu_only]
+    #[doc(alias = "OpCooperativeMatrixStoreKHR")]
+    #[inline]
+    pub unsafe fn store(self, slice: &mut [T], layout: MatrixLayout, stride: u32) {
+        unsafe {
+            let layout_u32 = layout as u32;
+            let ptr = slice.as_mut_ptr();
+            asm!(
+                "%u32 = OpTypeInt 32 0",
+                "%layout = OpLoad %u32 {layout}",
+                "%stride = OpLoad %u32 {stride}",
+                "%matrix = OpLoad _ {matrix}",
+                "OpCooperativeMatrixStoreKHR {ptr} %matrix %layout %stride",
+                ptr    = in(reg) ptr,
+                matrix = in(reg) &self,
+                layout = in(reg) &layout_u32,
+                stride = in(reg) &stride,
+            );
+        }
+    }
+
+    /// Returns the number of matrix components this invocation is responsible for.
+    ///
+    /// The value is implementation-dependent: the per-invocation split of elements is opaque.
+    #[spirv_std_macros::gpu_only]
+    #[doc(alias = "OpCooperativeMatrixLengthKHR")]
+    #[inline]
+    pub fn length(&self) -> u32 {
+        unsafe {
+            let mut result: u32 = 0;
+            asm!(
+                "%u32 = OpTypeInt 32 0",
+                // typeof* {self_ptr} resolves to the CooperativeMatrix type (pointee of &self).
+                "%coop_ty = typeof* {self_ptr}",
+                "%result = OpCooperativeMatrixLengthKHR %u32 %coop_ty",
+                "OpStore {out} %result",
+                self_ptr = in(reg) self,
+                out      = in(reg) &mut result,
+            );
+            result
+        }
+    }
+}
+
+/// Linear-algebraic matrix multiply of `A` by `B` and then component-wise add `C`.
+///
+/// The order of operations is implementation-dependent. All matrices must have the
+/// same scope, which is always subgroup here.
+///
+/// - `A`: `M × K` matrix with use [`MatrixUse::MatrixA`]
+/// - `B`: `K × N` matrix with use [`MatrixUse::MatrixB`]
+/// - `C`: `M × N` matrix with use [`MatrixUse::MatrixAccumulator`]
+/// - returns `D`: `M × N` accumulator equal to `A × B + C`
+///
+/// All operands must be dynamically uniform within every instance of the subgroup scope.
+///
+/// # Capability
+/// Requires `CooperativeMatrixKHR` + `SPV_KHR_cooperative_matrix`.
+#[spirv_std_macros::gpu_only]
+#[doc(alias = "OpCooperativeMatrixMulAddKHR")]
+#[inline]
+pub fn mul_add<TA, TB, TC, const M: u32, const N: u32, const K: u32>(
+    a: CooperativeMatrix<TA, { MatrixUse::MatrixA as u32 }, M, K>,
+    b: CooperativeMatrix<TB, { MatrixUse::MatrixB as u32 }, K, N>,
+    c: CooperativeMatrix<TC, { MatrixUse::MatrixAccumulator as u32 }, M, N>,
+) -> CooperativeMatrix<TC, { MatrixUse::MatrixAccumulator as u32 }, M, N> {
+    unsafe {
+        let mut result = MaybeUninit::<
+            CooperativeMatrix<TC, { MatrixUse::MatrixAccumulator as u32 }, M, N>,
+        >::uninit();
+        asm!(
+            "%a      = OpLoad _ {a}",
+            "%b      = OpLoad _ {b}",
+            "%c      = OpLoad _ {c}",
+            "%result = OpCooperativeMatrixMulAddKHR _ %a %b %c",
+            "OpStore {out} %result",
+            a   = in(reg) &a,
+            b   = in(reg) &b,
+            c   = in(reg) &c,
+            out = in(reg) result.as_mut_ptr(),
+        );
+        result.assume_init()
+    }
+}
diff --git a/crates/spirv-std/src/lib.rs b/crates/spirv-std/src/lib.rs
index 54d0151f15..288432b291 100644
--- a/crates/spirv-std/src/lib.rs
+++ b/crates/spirv-std/src/lib.rs
@@ -92,6 +92,7 @@ pub use macros::{debug_printf, debug_printfln};
 
 pub mod arch;
 pub mod byte_addressable_buffer;
+pub mod cooperative_matrix;
 pub mod debug_printf;
 pub mod float;
 pub mod image;
diff --git a/tests/compiletests/ui/arch/cooperative_matrix.rs b/tests/compiletests/ui/arch/cooperative_matrix.rs
new file mode 100644
index 0000000000..9574d9de05
--- /dev/null
+++ b/tests/compiletests/ui/arch/cooperative_matrix.rs
@@ -0,0 +1,27 @@
+// build-pass
+// only-vulkan1.2
+// compile-flags: -C target-feature=+CooperativeMatrixKHR,+ext:SPV_KHR_cooperative_matrix
+
+use spirv_std::cooperative_matrix::{
+    COLUMN_MAJOR, CooperativeMatrix, MATRIX_A, MATRIX_ACCUMULATOR, MATRIX_B, ROW_MAJOR, mul_add,
+};
+use spirv_std::spirv;
+
+type MatA = CooperativeMatrix<f32, MATRIX_A, 16, 16>;
+type MatB = CooperativeMatrix<f32, MATRIX_B, 16, 16>;
+type MatAcc = CooperativeMatrix<f32, MATRIX_ACCUMULATOR, 16, 16>;
+
+#[spirv(compute(threads(32)))]
+pub fn main(
+    #[spirv(storage_buffer, descriptor_set = 0, binding = 0)] a: &[f32],
+    #[spirv(storage_buffer, descriptor_set = 0, binding = 1)] b: &[f32],
+    #[spirv(storage_buffer, descriptor_set = 0, binding = 2)] c: &mut [f32],
+) {
+    let mat_a = unsafe { MatA::load(a, ROW_MAJOR, 16) };
+    let mat_b = unsafe { MatB::load(b, COLUMN_MAJOR, 16) };
+    let mat_c = unsafe { MatAcc::load(c, ROW_MAJOR, 16) };
+
+    let result = mul_add(mat_a, mat_b, mat_c);
+
+    unsafe { result.store(c, ROW_MAJOR, 16) };
+}