31 files changed, 4544 insertions, 0 deletions
diff --git a/subprojects/pixpat/pixpat-native/src/color.h b/subprojects/pixpat/pixpat-native/src/color.h
new file mode 100644
index 0000000..16dfb7d
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/color.h
@@ -0,0 +1,199 @@
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+
+#include "layout.h"
+
+namespace pixpat
+{
+
+// BT.601 / BT.709 / BT.2020 × Limited / Full range, dispatched at
+// runtime via a small `ColorCoeffs` struct that the caller hoists out
+// of the per-pixel loop. The convert and pattern entry points compute
+// `coeffs_for(spec)` once before the stripe loop, then pass the
+// resulting struct into every `ColorXfm::apply()` in the inner loop.
+// This avoids per-pixel matrix branching and also the alternative of
+// a 6×-instantiated template (would push the hot pivot from 121 to
+// 726 `Converter` bodies). The coefficient values are loop-invariant
+// broadcast scalars, so the compiler vectorizes the inner loop with
+// vbroadcastss + vmulps in place of constant folds.
+//
+// Math runs in float.
+
+enum class Rec   : uint8_t { BT601, BT709, BT2020 };  // selects the kr/kg/kb luma weights in coeffs_for()
+enum class Range : uint8_t { Limited, Full };         // Full becomes make_coeffs(..., full = true)
+
+struct ColorSpec {  // (matrix, range) pair that coeffs_for() expands into a ColorCoeffs
+	Rec rec;        // which luma-weight set to use
+	Range range;    // limited or full quantization range
+	constexpr bool operator==(const ColorSpec&) const = default;  // defaulted member-wise comparison (C++20)
+};
+
+inline constexpr ColorSpec kDefaultColorSpec{ Rec::BT601, Range::Limited };  // BT.601 limited range; its users live outside this header
+
+struct ColorCoeffs {
+	// RGB->YUV
+	float kr, kg, kb;            // luma weights: Y' = kr*R + kg*G + kb*B
+	float y_scale, y_offset;     // luma span (y_max - y_min) and floor (y_min), as fractions of 1
+	float c_scale, c_offset;     // chroma span (c_max - c_min) and midpoint ((c_max + c_min) / 2)
+	float u_factor, v_factor;    // 1 / (2 * (1 - kb)) and 1 / (2 * (1 - kr)); scale (B - Y') and (R - Y')
+	// YUV->RGB
+	float y_inv, c_inv;          // reciprocals of y_scale and c_scale
+	float gu, gv, ru, bv;        // inverse matrix: R = Y' + ru*V, G = Y' + gu*U + gv*V, B = Y' + bv*U
+	// normalized 16-bit scale (kNormMax in float, plus its inverse)
+	float norm_scale, norm_inv_scale;
+};
+
+namespace detail
+{
+constexpr ColorCoeffs make_coeffs(float kr, float kg, float kb, bool full) noexcept
+{  // Derives every ColorCoeffs field from the three luma weights and the range flag.
+	const float y_min = full ? 0.0f :  16.0f / 255.0f;  // limited-range luma floor, 8-bit code 16
+	const float y_max = full ? 1.0f : 235.0f / 255.0f;  // limited-range luma ceiling, 8-bit code 235
+	const float c_min = full ? 0.0f :  16.0f / 255.0f;  // limited-range chroma floor, 8-bit code 16
+	const float c_max = full ? 1.0f : 240.0f / 255.0f;  // limited-range chroma ceiling, 8-bit code 240
+
+	const float y_scale  = y_max - y_min;
+	const float y_offset = y_min;
+	const float c_scale  = c_max - c_min;
+	const float c_offset = (c_max + c_min) * 0.5f;      // chroma zero point: 128/255 limited, 0.5 full
+
+	const float u_factor = 1.0f / (2.0f * (1.0f - kb)); // maps (B - Y') from +-(1 - kb) to +-0.5
+	const float v_factor = 1.0f / (2.0f * (1.0f - kr)); // maps (R - Y') from +-(1 - kr) to +-0.5
+	const float y_inv = 1.0f / y_scale;
+	const float c_inv = 1.0f / c_scale;
+	const float gu = -2.0f * (1.0f - kb) * kb / kg;     // U contribution to G
+	const float gv = -2.0f * (1.0f - kr) * kr / kg;     // V contribution to G
+	const float ru =  2.0f * (1.0f - kr);               // V contribution to R (inverse of v_factor)
+	const float bv =  2.0f * (1.0f - kb);               // U contribution to B (inverse of u_factor)
+
+	const float norm_scale     = float(kNormMax);
+	const float norm_inv_scale = 1.0f / norm_scale;
+
+	return ColorCoeffs{  // aggregate init: order must match the member order of ColorCoeffs
+	        kr, kg, kb,
+	        y_scale, y_offset,
+	        c_scale, c_offset,
+	        u_factor, v_factor,
+	        y_inv, c_inv,
+	        gu, gv, ru, bv,
+	        norm_scale, norm_inv_scale,
+	};
+}
+} // namespace detail
+
+constexpr ColorCoeffs coeffs_for(ColorSpec spec) noexcept
+{  // Maps a ColorSpec to its coefficient set; any rec other than BT601/BT2020 gets the BT.709 weights.
+	const bool is_full = (Range::Full == spec.range);
+	if (spec.rec == Rec::BT601)
+		return detail::make_coeffs(0.299f,  0.587f,  0.114f,  is_full);
+	if (spec.rec == Rec::BT2020)
+		return detail::make_coeffs(0.2627f, 0.6780f, 0.0593f, is_full);
+	return detail::make_coeffs(0.2126f, 0.7152f, 0.0722f, is_full);
+}
+
+template <typename SrcPix, typename DstPix>
+struct ColorXfm;  // declaration only; behavior comes from the RGB16/YUV16 specializations that follow
+
+template <>
+struct ColorXfm<RGB16, RGB16> {  // same color kind: identity, every field of the pixel passes through
+	static constexpr RGB16 apply(RGB16 p) noexcept {
+		return p;
+	}
+	static constexpr RGB16 apply(RGB16 p, const ColorCoeffs&) noexcept {  // coefficients are ignored
+		return p;
+	}
+};
+
+template <>
+struct ColorXfm<YUV16, YUV16> {  // same color kind: identity, every field of the pixel passes through
+	static constexpr YUV16 apply(YUV16 p) noexcept {
+		return p;
+	}
+	static constexpr YUV16 apply(YUV16 p, const ColorCoeffs&) noexcept {  // coefficients are ignored
+		return p;
+	}
+};
+
+// Cross-color-kind conversions reset `a` to kNormMax (sinks with X
+// write 0; sinks with A see fully opaque pixels). Within the same
+// color kind, identity ColorXfm propagates `a` unchanged.
+template <>
+struct ColorXfm<RGB16, YUV16> {
+	static YUV16 apply(RGB16 rgb, const ColorCoeffs& c) noexcept
+	{
+		const float r = float(rgb.r) * c.norm_inv_scale;  // scale by 1 / kNormMax
+		const float g = float(rgb.g) * c.norm_inv_scale;
+		const float b = float(rgb.b) * c.norm_inv_scale;
+
+		const float yp = c.kr * r + c.kg * g + c.kb * b;  // luma: weighted sum of R, G, B
+		const float u  = (b - yp) * c.u_factor;           // blue-difference chroma
+		const float v  = (r - yp) * c.v_factor;           // red-difference chroma
+
+		// No clamp on RGB→YUV: for any uint16_t (RGB) input the
+		// output Y/U/V is structurally in [0, 1] (limited-range
+		// chroma stays within [c_min, c_max] ⊂ [0, 1]). The +0.5
+		// rounds half-up before the integer cast.
+		return YUV16{
+		        uint16_t((yp * c.y_scale + c.y_offset) * c.norm_scale + 0.5f),  // y: range-scale, then back to 16-bit
+		        uint16_t((u  * c.c_scale + c.c_offset) * c.norm_scale + 0.5f),  // u: re-centered on c_offset
+		        uint16_t((v  * c.c_scale + c.c_offset) * c.norm_scale + 0.5f),  // v: re-centered on c_offset
+		        kNormMax,  // a: the input alpha is dropped (see note above the struct)
+		};
+	}
+};
+
+template <>
+struct ColorXfm<YUV16, RGB16> {  // YUV16 -> RGB16 using the inverse terms (y_inv, c_inv, ru, gu, gv, bv) of ColorCoeffs
+	static RGB16 apply(YUV16 yuv, const ColorCoeffs& c) noexcept
+	{
+		const float yp = (float(yuv.y) * c.norm_inv_scale - c.y_offset) * c.y_inv;  // undo luma offset and span
+		const float u  = (float(yuv.u) * c.norm_inv_scale - c.c_offset) * c.c_inv;  // chroma re-centered on 0
+		const float v  = (float(yuv.v) * c.norm_inv_scale - c.c_offset) * c.c_inv;
+
+		const float r = yp + c.ru * v;
+		const float g = yp + c.gu * u + c.gv * v;
+		const float b = yp + c.bv * u;
+
+		// Clamp on YUV→RGB: the inverse matrix produces out-of-range
+		// RGB for some valid YUV inputs. Written as min/max so it
+		// vectorizes to vminps/vmaxps; std::clamp can defeat that.
+		auto pack = [&](float x) -> uint16_t {
+				    x = x * c.norm_scale + 0.5f;                    // to 16-bit scale, +0.5 for round-half-up
+				    x = std::min(std::max(x, 0.0f), c.norm_scale);  // clamp before the cast so it stays in range
+				    return uint16_t(x);                             // truncating conversion
+			    };
+
+		return RGB16{
+		        pack(r), pack(g), pack(b),
+		        kNormMax,  // a: the input alpha is dropped, output is always kNormMax
+		};
+	}
+};
+
+// In-place cross-color-kind passes over a normalized line buffer. RGB16 and YUV16
+// are both 4 uint16_t, so each pixel is memcpy'd out and back into the same slot.
+static_assert(sizeof(RGB16) == sizeof(YUV16), "in-place passes need equal pixel sizes");
+inline void norm_rgb_to_yuv(uint8_t* buf, size_t n, const ColorCoeffs& c) noexcept
+{  // buf: n packed RGB16 pixels on entry, n packed YUV16 pixels on return; c: hoisted coefficients.
+	for (size_t i = 0; i < n; ++i) {
+		RGB16 rgb;
+		std::memcpy(&rgb, buf + i * sizeof(RGB16), sizeof(RGB16));   // copy out: buf is raw bytes, not RGB16 objects
+		YUV16 yuv = ColorXfm<RGB16, YUV16>::apply(rgb, c);
+		std::memcpy(buf + i * sizeof(YUV16), &yuv, sizeof(YUV16));   // same offset as the read (sizes asserted equal)
+	}
+}
+
+inline void norm_yuv_to_rgb(uint8_t* buf, size_t n, const ColorCoeffs& c) noexcept
+{  // Converts n packed YUV16 pixels in buf to RGB16 in place, one pixel at a time.
+	for (size_t px = 0; px != n; ++px) {
+		YUV16 in;
+		std::memcpy(&in, &buf[px * sizeof(YUV16)], sizeof in);
+		const RGB16 out = ColorXfm<YUV16, RGB16>::apply(in, c);
+		std::memcpy(&buf[px * sizeof(RGB16)], &out, sizeof out);
+	}
+}
+
+} // namespace pixpat
diff --git a/subprojects/pixpat/pixpat-native/src/error.h b/subprojects/pixpat/pixpat-native/src/error.h
new file mode 100644
index 0000000..83a3596
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/error.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <stdexcept>
+
+namespace pixpat
+{
+
+struct error : std::runtime_error {  // pixpat exception type; pixpat::invalid_argument derives from it
+	using std::runtime_error::runtime_error;  // inherit the std::runtime_error constructors
+};
+
+struct invalid_argument : error {  // derives from pixpat::error, not from std::invalid_argument
+	using error::error;  // inherit the constructors of pixpat::error
+};
+
+} // namespace pixpat
diff --git a/subprojects/pixpat/pixpat-native/src/format_catalog.h b/subprojects/pixpat/pixpat-native/src/format_catalog.h
new file mode 100644
index 0000000..287d773
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/format_catalog.h
@@ -0,0 +1,140 @@
+#pragma once
+
+// Catalog of every pixel format the C++ side handles. The X-macro is a
+// flat list of names:
+//
+//   X(name)
+//
+// `name` is the canonical format identifier — both an internal FormatId
+// enum entry and the public string accepted by pixpat_buffer::format.
+// Each name resolves to a struct in `pixpat::formats::` (defined under
+// pixpat-native/src/formats/) that carries:
+//
+//   - the layout (subsampling, planes, components)
+//   - nested `Source` / `Sink` aliases for the matching I/O templates
+//
+// Adding a format = a row here AND a struct in the right
+// pixpat-native/src/formats/*.h. The codegen
+// (pixpat-native/codegen/gen_pixpat.py) parses this X-macro to learn
+// the format set; pixpat.cpp re-expands it to build s_format_info via
+// `formats::name::Source` / `formats::name::Sink`.
+//
+// FormatId is internal — the public C ABI deals in format names only.
+
+#include <cstddef>
+
+namespace pixpat
+{
+
+#define PIXPAT_FORMAT_LIST(X) \
+	X(XRGB8888)           \
+	X(ARGB8888)           \
+	X(XBGR8888)           \
+	X(ABGR8888)           \
+	X(RGBX8888)           \
+	X(RGBA8888)           \
+	X(BGRX8888)           \
+	X(BGRA8888)           \
+	X(RGB888)             \
+	X(BGR888)             \
+	X(RGB332)             \
+	X(RGB565)             \
+	X(BGR565)             \
+	X(XRGB1555)           \
+	X(ARGB1555)           \
+	X(XBGR1555)           \
+	X(ABGR1555)           \
+	X(XRGB4444)           \
+	X(ARGB4444)           \
+	X(XBGR4444)           \
+	X(ABGR4444)           \
+	X(RGBX4444)           \
+	X(RGBA4444)           \
+	X(XRGB2101010)        \
+	X(ARGB2101010)        \
+	X(XBGR2101010)        \
+	X(ABGR2101010)        \
+	X(RGBX1010102)        \
+	X(RGBA1010102)        \
+	X(BGRX1010102)        \
+	X(BGRA1010102)        \
+	X(ABGR16161616)       \
+	X(NV12)               \
+	X(NV21)               \
+	X(NV16)               \
+	X(NV61)               \
+	X(P030)               \
+	X(P230)               \
+	X(YUV420)             \
+	X(YVU420)             \
+	X(YUV422)             \
+	X(YVU422)             \
+	X(YUV444)             \
+	X(YVU444)             \
+	X(T430)               \
+	X(VUY888)             \
+	X(XVUY8888)           \
+	X(XVUY2101010)        \
+	X(AVUY16161616)       \
+	X(YUYV)               \
+	X(YVYU)               \
+	X(UYVY)               \
+	X(VYUY)               \
+	X(Y210)               \
+	X(Y212)               \
+	X(Y216)               \
+	X(Y8)                 \
+	X(Y10)                \
+	X(Y12)                \
+	X(Y16)                \
+	X(R8)                 \
+	X(XYYY2101010)        \
+	X(Y10P)               \
+	X(Y12P)               \
+	X(SRGGB8)             \
+	X(SBGGR8)             \
+	X(SGRBG8)             \
+	X(SGBRG8)             \
+	X(SRGGB10)            \
+	X(SBGGR10)            \
+	X(SGRBG10)            \
+	X(SGBRG10)            \
+	X(SRGGB12)            \
+	X(SBGGR12)            \
+	X(SGRBG12)            \
+	X(SGBRG12)            \
+	X(SRGGB16)            \
+	X(SBGGR16)            \
+	X(SGRBG16)            \
+	X(SGBRG16)            \
+	X(SRGGB10P)           \
+	X(SBGGR10P)           \
+	X(SGRBG10P)           \
+	X(SGBRG10P)           \
+	X(SRGGB12P)           \
+	X(SBGGR12P)           \
+	X(SGRBG12P)           \
+	X(SGBRG12P)
+
+enum class FormatId {  // one enumerator per PIXPAT_FORMAT_LIST row, in list order
+#define X(name) name,
+	PIXPAT_FORMAT_LIST(X)
+#undef X
+	Unknown,  // trailing enumerator; its value equals the number of catalog rows
+};
+
+struct FormatEntry {  // one name <-> id pairing, as stored in s_format_table
+	const char* name;  // stringized catalog identifier (#name)
+	FormatId id;       // matching FormatId enumerator
+};
+
+inline constexpr FormatEntry s_format_table[] = {  // same row order as FormatId, so entry i has id == FormatId(i)
+#define X(name) { #name, FormatId::name },
+	PIXPAT_FORMAT_LIST(X)
+#undef X
+};
+
+inline constexpr size_t s_format_catalog_count =  // number of rows in s_format_table (FormatId::Unknown has no row)
+	sizeof(s_format_table) / sizeof(s_format_table[0]);
+
+} // namespace pixpat
diff --git a/subprojects/pixpat/pixpat-native/src/formats.h b/subprojects/pixpat/pixpat-native/src/formats.h
new file mode 100644
index 0000000..68bdeec
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/formats.h
@@ -0,0 +1,13 @@
+#pragma once
+
+// Aggregator: every named layout the X-macro registers lives in one of
+// the headers under formats/, organized by color kind. Format names
+// follow the kms++/pixutils convention (see formats/rgb.h for the
+// longer note; the YUYV group is an exception, see formats/yuv_packed.h).
+
+#include "formats/rgb.h"
+#include "formats/yuv_semiplanar.h"
+#include "formats/yuv_planar.h"
+#include "formats/yuv_packed.h"
+#include "formats/grayscale.h"
+#include "formats/bayer.h"
diff --git a/subprojects/pixpat/pixpat-native/src/formats/bayer.h b/subprojects/pixpat/pixpat-native/src/formats/bayer.h
new file mode 100644
index 0000000..057c342
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/formats/bayer.h
@@ -0,0 +1,97 @@
+#pragma once
+
+// Bayer raw layouts. Each pixel carries one of R/G/B selected by
+// (x mod 2, y mod 2) and BayerOrder; the pattern is on the
+// BayerSource/BayerSink template, not the layout itself. Storage shape
+// is single-component (C::Y reused as the storage tag) so the same
+// 8/10/12/16-bit shapes apply across all four phase patterns.
+//
+// Each format is a distinct struct (rather than a type alias of one
+// another) so each format type can carry its own pattern-specific
+// Source/Sink aliases. The shared bit layout lives in a base struct per
+// (depth,packing) combination.
+//
+// ColorKind is RGB because the normalized pixel passed through ColorXfm
+// is RGB16 — the sink picks one of r/g/b at write time, and the
+// source nearest-neighbor demosaics into RGB16 at read time.
+
+#include "../layout.h"
+#include "../io/bayer.h"
+
+namespace pixpat::formats
+{
+
+namespace bayer_detail
+{
+
+// Per-(depth,packing) base layouts. Every Bayer format derives from
+// one of these and pins its own pattern-specific I/O templates.
+using Bayer8   = Layout<ColorKind::RGB, 1, 1,
+                        Plane<uint8_t,  Comp { C::Y, 8, 0 }> >;
+using Bayer10  = Layout<ColorKind::RGB, 1, 1,
+                        Plane<uint16_t, Comp { C::Y, 10, 0 }, Comp { C::X, 6, 10 }> >;  // 10 sample bits low, 6 X bits above
+using Bayer12  = Layout<ColorKind::RGB, 1, 1,
+                        Plane<uint16_t, Comp { C::Y, 12, 0 }, Comp { C::X, 4, 12 }> >;  // 12 sample bits low, 4 X bits above
+using Bayer16  = Layout<ColorKind::RGB, 1, 1,
+                        Plane<uint16_t, Comp { C::Y, 16, 0 }> >;
+// MIPI CSI-2 packed Bayer (10P: 4 pix in 5 bytes; 12P: 2 pix in 3
+// bytes). The Layout doesn't capture the packed bit layout — the
+// BayerPackedSink hand-rolls the byte writes. uint8_t plane shape is
+// a placeholder so the dispatch plumbing is uniform.
+using Bayer10P = Layout<ColorKind::RGB, 1, 1,
+                        Plane<uint8_t,  Comp { C::Y, 8, 0 }> >;  // placeholder shape, same declaration as Bayer8
+using Bayer12P = Layout<ColorKind::RGB, 1, 1,
+                        Plane<uint8_t,  Comp { C::Y, 8, 0 }> >;  // placeholder shape, same declaration as Bayer8
+
+} // namespace bayer_detail
+
+// Unpacked Bayer (4 patterns × 4 bit depths). `pat` is token-pasted onto BayerSource_ / BayerSink_.
+#define PIXPAT_BAYER(name, base, pat)                     \
+	struct name : bayer_detail::base {                \
+		using Source = BayerSource_ ## pat<name>; \
+		using Sink   = BayerSink_ ## pat<name>;   \
+	}
+
+PIXPAT_BAYER(SRGGB8,  Bayer8,  RGGB);  // 8-bit samples
+PIXPAT_BAYER(SBGGR8,  Bayer8,  BGGR);
+PIXPAT_BAYER(SGRBG8,  Bayer8,  GRBG);
+PIXPAT_BAYER(SGBRG8,  Bayer8,  GBRG);
+
+PIXPAT_BAYER(SRGGB10, Bayer10, RGGB);  // 10-bit samples in 16-bit words
+PIXPAT_BAYER(SBGGR10, Bayer10, BGGR);
+PIXPAT_BAYER(SGRBG10, Bayer10, GRBG);
+PIXPAT_BAYER(SGBRG10, Bayer10, GBRG);
+
+PIXPAT_BAYER(SRGGB12, Bayer12, RGGB);  // 12-bit samples in 16-bit words
+PIXPAT_BAYER(SBGGR12, Bayer12, BGGR);
+PIXPAT_BAYER(SGRBG12, Bayer12, GRBG);
+PIXPAT_BAYER(SGBRG12, Bayer12, GBRG);
+
+PIXPAT_BAYER(SRGGB16, Bayer16, RGGB);  // 16-bit samples
+PIXPAT_BAYER(SBGGR16, Bayer16, BGGR);
+PIXPAT_BAYER(SGRBG16, Bayer16, GRBG);
+PIXPAT_BAYER(SGBRG16, Bayer16, GBRG);
+
+#undef PIXPAT_BAYER
+
+// MIPI-packed Bayer: pattern + bit depth both encoded in the I/O
+// template name (BayerPackedSource_RGGB10, ...), pasted from `pat_depth`.
+#define PIXPAT_BAYER_PACKED(name, base, pat_depth)                    \
+	struct name : bayer_detail::base {                            \
+		using Source = BayerPackedSource_ ## pat_depth<name>; \
+		using Sink   = BayerPackedSink_ ## pat_depth<name>;   \
+	}
+
+PIXPAT_BAYER_PACKED(SRGGB10P, Bayer10P, RGGB10);  // 10-bit packed; base layout is only a placeholder
+PIXPAT_BAYER_PACKED(SBGGR10P, Bayer10P, BGGR10);
+PIXPAT_BAYER_PACKED(SGRBG10P, Bayer10P, GRBG10);
+PIXPAT_BAYER_PACKED(SGBRG10P, Bayer10P, GBRG10);
+
+PIXPAT_BAYER_PACKED(SRGGB12P, Bayer12P, RGGB12);  // 12-bit packed; base layout is only a placeholder
+PIXPAT_BAYER_PACKED(SBGGR12P, Bayer12P, BGGR12);
+PIXPAT_BAYER_PACKED(SGRBG12P, Bayer12P, GRBG12);
+PIXPAT_BAYER_PACKED(SGBRG12P, Bayer12P, GBRG12);
+
+#undef PIXPAT_BAYER_PACKED
+
+} // namespace pixpat::formats
diff --git a/subprojects/pixpat/pixpat-native/src/formats/grayscale.h b/subprojects/pixpat/pixpat-native/src/formats/grayscale.h
new file mode 100644
index 0000000..b1cd294
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/formats/grayscale.h
@@ -0,0 +1,78 @@
+#pragma once
+
+// Single-component-per-pixel formats. Most are grayscale (Y) modeled as
+// a YUV format with synthesized neutral chroma; R8 is the RGB-kind
+// counterpart, modeled grey-style with G=B=R on read. Y10/Y12 carry an
+// explicit X padding bitfield. XYYY2101010 is multi-pixel-per-word: 3 Y
+// samples in 32 bits.
+
+#include "../layout.h"
+#include "../io/gray.h"
+#include "../io/gray_packed.h"
+#include "../io/mono_rgb.h"
+
+namespace pixpat::formats
+{
+
+#define PIXPAT_GRAY(name, ...)                                    \
+	struct name : Layout<ColorKind::YUV, 1, 1, __VA_ARGS__> { \
+		using Source = GraySource<name>;                  \
+		using Sink   = GraySink<name>;                    \
+	}
+
+PIXPAT_GRAY(Y8,
+            Plane<uint8_t,  Comp{ C::Y, 8, 0 }>);  // one 8-bit Y sample per byte
+
+PIXPAT_GRAY(Y10,
+            Plane<uint16_t, Comp{ C::Y, 10, 0 }, Comp{ C::X, 6, 10 }>);  // 10 Y bits low, 6 X bits above
+
+PIXPAT_GRAY(Y12,
+            Plane<uint16_t, Comp{ C::Y, 12, 0 }, Comp{ C::X, 4, 12 }>);  // 12 Y bits low, 4 X bits above
+
+PIXPAT_GRAY(Y16,
+            Plane<uint16_t, Comp{ C::Y, 16, 0 }>);  // full 16-bit Y sample
+
+#undef PIXPAT_GRAY
+
+// R8: single 8-bit R channel. Read synthesizes G=B=R; write encodes R
+// and drops G/B/A. Symmetric to Y8 but ColorKind::RGB so cross-pipeline
+// conversions go through the RGB->YUV ColorXfm direction.
+struct R8 : Layout<ColorKind::RGB, 1, 1,
+	           Plane<uint8_t, Comp{ C::R, 8, 0 }> > {  // one plane, one 8-bit R component per byte
+	using Source = MonoRGBSource<R8>;
+	using Sink   = MonoRGBSink<R8>;
+};
+
+struct XYYY2101010 : Layout<ColorKind::YUV, 1, 1,  // three 10-bit Y samples per 32-bit word
+	                    Plane<uint32_t,
+	                          Comp{ C::Y, 10, 0 },     // bits 9..0
+	                          Comp{ C::Y, 10, 10 },    // bits 19..10
+	                          Comp{ C::Y, 10, 20 },    // bits 29..20
+	                          Comp{ C::X, 2,  30 }> > {  // bits 31..30, padding
+	using Source = MultiPixelGraySource<XYYY2101010>;
+	using Sink   = MultiPixelGraySink<XYYY2101010>;
+};
+
+// MIPI CSI-2 packed grayscale (Y10P / Y12P). The Layout doesn't capture
+// the packed bit layout — GrayPackedSource/Sink delegate to the shared
+// CSI-2 helper (io/csi2.h). uint8_t plane shape is a placeholder so
+// dispatch plumbing is uniform (mirrors bayer_detail::Bayer10P/12P).
+namespace gray_csi2_detail
+{
+using Gray10P = Layout<ColorKind::YUV, 1, 1,
+                       Plane<uint8_t, Comp { C::Y, 8, 0 }> >;  // placeholder; the depth comes from the I/O template argument
+using Gray12P = Layout<ColorKind::YUV, 1, 1,
+                       Plane<uint8_t, Comp { C::Y, 8, 0 }> >;  // placeholder; same declaration as Gray10P
+} // namespace gray_csi2_detail
+
+struct Y10P : gray_csi2_detail::Gray10P {  // packed grayscale; the bit depth is the second template argument below
+	using Source = GrayPackedSource<Y10P, 10>;
+	using Sink   = GrayPackedSink<Y10P, 10>;
+};
+
+struct Y12P : gray_csi2_detail::Gray12P {  // packed grayscale; the bit depth is the second template argument below
+	using Source = GrayPackedSource<Y12P, 12>;
+	using Sink   = GrayPackedSink<Y12P, 12>;
+};
+
+} // namespace pixpat::formats
diff --git a/subprojects/pixpat/pixpat-native/src/formats/rgb.h b/subprojects/pixpat/pixpat-native/src/formats/rgb.h
new file mode 100644
index 0000000..19d007a
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/formats/rgb.h
@@ -0,0 +1,267 @@
+#pragma once
+
+// RGB packed layouts: 8-bit (RGB332) / 16-bit (sub-byte) / 24-bit / 32-bit (8- and 10-bit) /
+// 64-bit-normalized, all single-plane single-pixel-per-storage-word.
+// Names follow the kms++/pixutils register-order convention (MSB-first
+// in the storage word), so XRGB8888 has X at bits 31..24 and B at 7..0.
+
+#include "../layout.h"
+#include "../io/packed.h"
+
+namespace pixpat::formats
+{
+
+// Helper: every format in this file pairs with PackedSource/PackedSink.
+// Each format struct exposes Source / Sink aliases so the catalog row
+// in format_catalog.h can stay name-only. `...` is the Plane<> argument.
+#define PIXPAT_RGB_PACKED(name, ...)                              \
+	struct name : Layout<ColorKind::RGB, 1, 1, __VA_ARGS__> { \
+		using Source = PackedSource<name>;                \
+		using Sink   = PackedSink<name>;                  \
+	}
+
+// ---------------------------------------------------------------------
+// 32-bit packed RGB, 8-bit components.
+// ---------------------------------------------------------------------
+
+PIXPAT_RGB_PACKED(XRGB8888,
+                  Plane<uint32_t,
+                        Comp{ C::B, 8, 0 },     // bits 7..0
+                        Comp{ C::G, 8, 8 },     // bits 15..8
+                        Comp{ C::R, 8, 16 },    // bits 23..16
+                        Comp{ C::X, 8, 24 }>);  // bits 31..24, padding
+
+PIXPAT_RGB_PACKED(ARGB8888,
+                  Plane<uint32_t,
+                        Comp{ C::B, 8, 0 },
+                        Comp{ C::G, 8, 8 },
+                        Comp{ C::R, 8, 16 },
+                        Comp{ C::A, 8, 24 }>);
+
+PIXPAT_RGB_PACKED(XBGR8888,
+                  Plane<uint32_t,
+                        Comp{ C::R, 8, 0 },
+                        Comp{ C::G, 8, 8 },
+                        Comp{ C::B, 8, 16 },
+                        Comp{ C::X, 8, 24 }>);
+
+PIXPAT_RGB_PACKED(ABGR8888,
+                  Plane<uint32_t,
+                        Comp{ C::R, 8, 0 },
+                        Comp{ C::G, 8, 8 },
+                        Comp{ C::B, 8, 16 },
+                        Comp{ C::A, 8, 24 }>);
+
+PIXPAT_RGB_PACKED(RGBX8888,
+                  Plane<uint32_t,
+                        Comp{ C::X, 8, 0 },
+                        Comp{ C::B, 8, 8 },
+                        Comp{ C::G, 8, 16 },
+                        Comp{ C::R, 8, 24 }>);
+
+PIXPAT_RGB_PACKED(RGBA8888,
+                  Plane<uint32_t,
+                        Comp{ C::A, 8, 0 },
+                        Comp{ C::B, 8, 8 },
+                        Comp{ C::G, 8, 16 },
+                        Comp{ C::R, 8, 24 }>);
+
+PIXPAT_RGB_PACKED(BGRX8888,
+                  Plane<uint32_t,
+                        Comp{ C::X, 8, 0 },
+                        Comp{ C::R, 8, 8 },
+                        Comp{ C::G, 8, 16 },
+                        Comp{ C::B, 8, 24 }>);
+
+PIXPAT_RGB_PACKED(BGRA8888,
+                  Plane<uint32_t,
+                        Comp{ C::A, 8, 0 },
+                        Comp{ C::R, 8, 8 },
+                        Comp{ C::G, 8, 16 },
+                        Comp{ C::B, 8, 24 }>);
+
+// ---------------------------------------------------------------------
+// 24-bit packed RGB, three bytes per pixel. storage_t is uint32_t but
+// only bytes_per_pixel = 3 are read/written via memcpy.
+// ---------------------------------------------------------------------
+
+PIXPAT_RGB_PACKED(RGB888,
+                  Plane<uint32_t,
+                        Comp{ C::B, 8, 0 },
+                        Comp{ C::G, 8, 8 },
+                        Comp{ C::R, 8, 16 }>);  // 24 bits declared; bits 31..24 carry no component
+
+PIXPAT_RGB_PACKED(BGR888,
+                  Plane<uint32_t,
+                        Comp{ C::R, 8, 0 },
+                        Comp{ C::G, 8, 8 },
+                        Comp{ C::B, 8, 16 }>);  // 24 bits declared; bits 31..24 carry no component
+
+// ---------------------------------------------------------------------
+// 16-bit packed RGB, sub-byte components (plus the 8-bit RGB332 below).
+// ---------------------------------------------------------------------
+
+PIXPAT_RGB_PACKED(RGB565,
+                  Plane<uint16_t,
+                        Comp{ C::B, 5, 0 },
+                        Comp{ C::G, 6, 5 },
+                        Comp{ C::R, 5, 11 }>);
+
+PIXPAT_RGB_PACKED(BGR565,
+                  Plane<uint16_t,
+                        Comp{ C::R, 5, 0 },
+                        Comp{ C::G, 6, 5 },
+                        Comp{ C::B, 5, 11 }>);
+
+// 8-bit packed RGB: 3-bit R / 3-bit G / 2-bit B in a single byte.
+
+PIXPAT_RGB_PACKED(RGB332,
+                  Plane<uint8_t,
+                        Comp{ C::B, 2, 0 },
+                        Comp{ C::G, 3, 2 },
+                        Comp{ C::R, 3, 5 }>);
+
+PIXPAT_RGB_PACKED(XRGB1555,
+                  Plane<uint16_t,
+                        Comp{ C::B, 5, 0 },
+                        Comp{ C::G, 5, 5 },
+                        Comp{ C::R, 5, 10 },
+                        Comp{ C::X, 1, 15 }>);
+
+PIXPAT_RGB_PACKED(ARGB1555,
+                  Plane<uint16_t,
+                        Comp{ C::B, 5, 0 },
+                        Comp{ C::G, 5, 5 },
+                        Comp{ C::R, 5, 10 },
+                        Comp{ C::A, 1, 15 }>);
+
+PIXPAT_RGB_PACKED(XBGR1555,
+                  Plane<uint16_t,
+                        Comp{ C::R, 5, 0 },
+                        Comp{ C::G, 5, 5 },
+                        Comp{ C::B, 5, 10 },
+                        Comp{ C::X, 1, 15 }>);
+
+PIXPAT_RGB_PACKED(ABGR1555,
+                  Plane<uint16_t,
+                        Comp{ C::R, 5, 0 },
+                        Comp{ C::G, 5, 5 },
+                        Comp{ C::B, 5, 10 },
+                        Comp{ C::A, 1, 15 }>);
+
+PIXPAT_RGB_PACKED(XRGB4444,
+                  Plane<uint16_t,
+                        Comp{ C::B, 4, 0 },
+                        Comp{ C::G, 4, 4 },
+                        Comp{ C::R, 4, 8 },
+                        Comp{ C::X, 4, 12 }>);
+
+PIXPAT_RGB_PACKED(ARGB4444,
+                  Plane<uint16_t,
+                        Comp{ C::B, 4, 0 },
+                        Comp{ C::G, 4, 4 },
+                        Comp{ C::R, 4, 8 },
+                        Comp{ C::A, 4, 12 }>);
+
+PIXPAT_RGB_PACKED(XBGR4444,
+                  Plane<uint16_t,
+                        Comp{ C::R, 4, 0 },
+                        Comp{ C::G, 4, 4 },
+                        Comp{ C::B, 4, 8 },
+                        Comp{ C::X, 4, 12 }>);
+
+PIXPAT_RGB_PACKED(ABGR4444,
+                  Plane<uint16_t,
+                        Comp{ C::R, 4, 0 },
+                        Comp{ C::G, 4, 4 },
+                        Comp{ C::B, 4, 8 },
+                        Comp{ C::A, 4, 12 }>);
+
+PIXPAT_RGB_PACKED(RGBX4444,
+                  Plane<uint16_t,
+                        Comp{ C::X, 4, 0 },
+                        Comp{ C::B, 4, 4 },
+                        Comp{ C::G, 4, 8 },
+                        Comp{ C::R, 4, 12 }>);
+
+PIXPAT_RGB_PACKED(RGBA4444,
+                  Plane<uint16_t,
+                        Comp{ C::A, 4, 0 },
+                        Comp{ C::B, 4, 4 },
+                        Comp{ C::G, 4, 8 },
+                        Comp{ C::R, 4, 12 }>);
+
+// ---------------------------------------------------------------------
+// 32-bit packed RGB, 10-bit components; X/A takes the remaining 2 bits.
+// ---------------------------------------------------------------------
+
+PIXPAT_RGB_PACKED(XRGB2101010,
+                  Plane<uint32_t,
+                        Comp{ C::B, 10, 0 },
+                        Comp{ C::G, 10, 10 },
+                        Comp{ C::R, 10, 20 },
+                        Comp{ C::X, 2, 30 }>);
+
+PIXPAT_RGB_PACKED(ARGB2101010,
+                  Plane<uint32_t,
+                        Comp{ C::B, 10, 0 },
+                        Comp{ C::G, 10, 10 },
+                        Comp{ C::R, 10, 20 },
+                        Comp{ C::A, 2, 30 }>);
+
+PIXPAT_RGB_PACKED(XBGR2101010,
+                  Plane<uint32_t,
+                        Comp{ C::R, 10, 0 },
+                        Comp{ C::G, 10, 10 },
+                        Comp{ C::B, 10, 20 },
+                        Comp{ C::X, 2, 30 }>);
+
+PIXPAT_RGB_PACKED(ABGR2101010,
+                  Plane<uint32_t,
+                        Comp{ C::R, 10, 0 },
+                        Comp{ C::G, 10, 10 },
+                        Comp{ C::B, 10, 20 },
+                        Comp{ C::A, 2, 30 }>);
+
+PIXPAT_RGB_PACKED(RGBX1010102,
+                  Plane<uint32_t,
+                        Comp{ C::X, 2, 0 },
+                        Comp{ C::B, 10, 2 },
+                        Comp{ C::G, 10, 12 },
+                        Comp{ C::R, 10, 22 }>);
+
+PIXPAT_RGB_PACKED(RGBA1010102,
+                  Plane<uint32_t,
+                        Comp{ C::A, 2, 0 },
+                        Comp{ C::B, 10, 2 },
+                        Comp{ C::G, 10, 12 },
+                        Comp{ C::R, 10, 22 }>);
+
+PIXPAT_RGB_PACKED(BGRX1010102,
+                  Plane<uint32_t,
+                        Comp{ C::X, 2, 0 },
+                        Comp{ C::R, 10, 2 },
+                        Comp{ C::G, 10, 12 },
+                        Comp{ C::B, 10, 22 }>);
+
+PIXPAT_RGB_PACKED(BGRA1010102,
+                  Plane<uint32_t,
+                        Comp{ C::A, 2, 0 },
+                        Comp{ C::R, 10, 2 },
+                        Comp{ C::G, 10, 12 },
+                        Comp{ C::B, 10, 22 }>);
+
+// ---------------------------------------------------------------------
+// 64-bit normalized wide RGB (16 bits per component).
+// ---------------------------------------------------------------------
+
+PIXPAT_RGB_PACKED(ABGR16161616,
+                  Plane<uint64_t,
+                        Comp{ C::R, 16, 0 },
+                        Comp{ C::G, 16, 16 },
+                        Comp{ C::B, 16, 32 },
+                        Comp{ C::A, 16, 48 }>);  // 4 x 16 bits fill the whole uint64_t word
+
+#undef PIXPAT_RGB_PACKED
+
+} // namespace pixpat::formats
diff --git a/subprojects/pixpat/pixpat-native/src/formats/yuv_packed.h b/subprojects/pixpat/pixpat-native/src/formats/yuv_packed.h
new file mode 100644
index 0000000..8e88f10
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/formats/yuv_packed.h
@@ -0,0 +1,136 @@
+#pragma once
+
+// Packed YUV layouts:
+//   VUY888        — 1 pixel / 24-bit, 8-bit Y/U/V (storage uint32_t,
+//                   bytes_per_pixel = 3; parallels BGR888 in the YUV
+//                   register order)
+//   XVUY8888      — 1 pixel / 32-bit word, 8-bit Y/U/V + 8-bit padding
+//   XVUY2101010   — 1 pixel / 32-bit word, 10-bit Y/U/V + 2-bit padding
+//   AVUY16161616  — 1 pixel / 64-bit word, 16-bit Y/U/V/A (normalized)
+//   YUYV / YVYU / UYVY / VYUY — 4:2:2, 2 pixels / 32-bit word
+//   Y210 / Y212 / Y216        — 4:2:2, 2 pixels / 64-bit word, with
+//                   each component MSB-aligned in a 16-bit slot
+//
+// XVUY/AVUY name is register MSB-first (X/A in the top bits). The
+// YUYV names follow V4L2 / pixpat memory-byte order (Y0 in byte 0),
+// so shifts ascend in name order — opposite of XRGB-style.
+
+#include "../layout.h"
+#include "../io/packed.h"
+#include "../io/packed_yuv.h"
+
+namespace pixpat::formats
+{
+
+// 1-pixel-per-word packed (single Pixel/Word; uses PackedSource/Sink).
+
+struct VUY888 : Layout<ColorKind::YUV, 1, 1,  // 24 declared bits in a uint32_t storage word
+	               Plane<uint32_t,
+	                     Comp{ C::Y, 8, 0 },      // bits 7..0
+	                     Comp{ C::U, 8, 8 },      // bits 15..8
+	                     Comp{ C::V, 8, 16 }> > { // bits 23..16
+	using Source = PackedSource<VUY888>;
+	using Sink   = PackedSink<VUY888>;
+};
+
+struct XVUY8888 : Layout<ColorKind::YUV, 1, 1,  // 8-bit Y/U/V plus 8 padding bits in the top byte
+	                 Plane<uint32_t,
+	                       Comp{ C::Y, 8, 0 },
+	                       Comp{ C::U, 8, 8 },
+	                       Comp{ C::V, 8, 16 },
+	                       Comp{ C::X, 8, 24 }> > {
+	using Source = PackedSource<XVUY8888>;
+	using Sink   = PackedSink<XVUY8888>;
+};
+
+struct XVUY2101010 : Layout<ColorKind::YUV, 1, 1,  // 10-bit Y/U/V plus 2 padding bits at the top
+	                    Plane<uint32_t,
+	                          Comp{ C::Y, 10, 0 },
+	                          Comp{ C::U, 10, 10 },
+	                          Comp{ C::V, 10, 20 },
+	                          Comp{ C::X, 2,  30 }> > {
+	using Source = PackedSource<XVUY2101010>;
+	using Sink   = PackedSink<XVUY2101010>;
+};
+
+struct AVUY16161616 : Layout<ColorKind::YUV, 1, 1,  // 16-bit Y/U/V/A, one pixel per 64-bit word
+	                     Plane<uint64_t,
+	                           Comp{ C::Y, 16, 0 },
+	                           Comp{ C::U, 16, 16 },
+	                           Comp{ C::V, 16, 32 },
+	                           Comp{ C::A, 16, 48 }> > {
+	using Source = PackedSource<AVUY16161616>;
+	using Sink   = PackedSink<AVUY16161616>;
+};
+
+// 2-pixel-per-word 4:2:2 (uses PackedYUVSource/Sink); `...` lists the four Comp entries of the word.
+
+#define PIXPAT_PACKED_YUV422(name, ...)                       \
+	struct name : Layout<ColorKind::YUV, 2, 1,            \
+			     Plane<uint32_t, __VA_ARGS__> > { \
+		using Source = PackedYUVSource<name>;         \
+		using Sink   = PackedYUVSink<name>;           \
+	}
+
+PIXPAT_PACKED_YUV422(YUYV,
+                     Comp{ C::Y, 8, 0 }, Comp{ C::U, 8, 8 },
+                     Comp{ C::Y, 8, 16 }, Comp{ C::V, 8, 24 });  // shifts ascend in name order: Y, U, Y, V
+
+PIXPAT_PACKED_YUV422(YVYU,
+                     Comp{ C::Y, 8, 0 }, Comp{ C::V, 8, 8 },
+                     Comp{ C::Y, 8, 16 }, Comp{ C::U, 8, 24 });  // Y, V, Y, U
+
+PIXPAT_PACKED_YUV422(UYVY,
+                     Comp{ C::U, 8, 0 }, Comp{ C::Y, 8, 8 },
+                     Comp{ C::V, 8, 16 }, Comp{ C::Y, 8, 24 });  // U, Y, V, Y
+
+PIXPAT_PACKED_YUV422(VYUY,
+                     Comp{ C::V, 8, 0 }, Comp{ C::Y, 8, 8 },
+                     Comp{ C::U, 8, 16 }, Comp{ C::Y, 8, 24 });  // V, Y, U, Y
+
+#undef PIXPAT_PACKED_YUV422
+
+// Y210 / Y212 / Y216: 4:2:2, 2 pixels per 64-bit word, MSB-aligned in
+// 16-bit slots. Y210 has 6 unused LSBs per slot, Y212 has 4, Y216 has
+// none. The X padding entries pad total_bits to 64 so bytes_per_pixel
+// resolves to 8; PackedYUVSink leaves their slots zero via the
+// value-array zero-init (see io/packed_yuv.h).
+struct Y210 : Layout<ColorKind::YUV, 2, 1,
+	             Plane<uint64_t,
+	                   Comp{ C::X,  6,  0 },
+	                   Comp{ C::Y, 10,  6 },      // slot 0 (bits 15..0): Y in the top 10 bits
+	                   Comp{ C::X,  6, 16 },
+	                   Comp{ C::U, 10, 22 },      // slot 1 (bits 31..16): U
+	                   Comp{ C::X,  6, 32 },
+	                   Comp{ C::Y, 10, 38 },      // slot 2 (bits 47..32): Y
+	                   Comp{ C::X,  6, 48 },
+	                   Comp{ C::V, 10, 54 }> > {  // slot 3 (bits 63..48): V
+	using Source = PackedYUVSource<Y210>;
+	using Sink   = PackedYUVSink<Y210>;
+};
+
+struct Y212 : Layout<ColorKind::YUV, 2, 1,  // 4:2:2, 12-bit samples in the top bits of four 16-bit slots
+	             Plane<uint64_t,
+	                   Comp{ C::X,  4,  0 },
+	                   Comp{ C::Y, 12,  4 },      // slot 0: Y
+	                   Comp{ C::X,  4, 16 },
+	                   Comp{ C::U, 12, 20 },      // slot 1: U
+	                   Comp{ C::X,  4, 32 },
+	                   Comp{ C::Y, 12, 36 },      // slot 2: Y
+	                   Comp{ C::X,  4, 48 },
+	                   Comp{ C::V, 12, 52 }> > {  // slot 3: V
+	using Source = PackedYUVSource<Y212>;
+	using Sink   = PackedYUVSink<Y212>;
+};
+
+struct Y216 : Layout<ColorKind::YUV, 2, 1,  // 4:2:2, full 16-bit samples, so no X padding entries
+	             Plane<uint64_t,
+	                   Comp{ C::Y, 16,  0 },
+	                   Comp{ C::U, 16, 16 },
+	                   Comp{ C::Y, 16, 32 },
+	                   Comp{ C::V, 16, 48 }> > {
+	using Source = PackedYUVSource<Y216>;
+	using Sink   = PackedYUVSink<Y216>;
+};
+
+} // namespace pixpat::formats
diff --git a/subprojects/pixpat/pixpat-native/src/formats/yuv_planar.h b/subprojects/pixpat/pixpat-native/src/formats/yuv_planar.h
new file mode 100644
index 0000000..bb6a415
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/formats/yuv_planar.h
@@ -0,0 +1,76 @@
+#pragma once
+
+// YUV planar layouts: 3 separate planes (Y, then U/V or V/U). Components
+// are 8-bit except for T430, which packs 10-bit samples.
+//   YUV420/YVU420 — h_sub=2, v_sub=2  (a.k.a. I420 / YV12)
+//   YUV422/YVU422 — h_sub=2, v_sub=1
+//   YUV444/YVU444 — h_sub=1, v_sub=1
+//   T430          — multi-pixel-per-word planar 4:4:4.
+
+#include "../layout.h"
+#include "../io/planar.h"
+
+namespace pixpat::formats
+{
+
+#define PIXPAT_PLANAR(name, ...) /* planar YUV layout `name` */ \
+	struct name : Layout<ColorKind::YUV, __VA_ARGS__> { \
+		using Source = PlanarSource<name>;          \
+		using Sink   = PlanarSink<name>;            \
+	}
+
+PIXPAT_PLANAR(YUV420, 2, 2, // 4:2:0 (h_sub=2, v_sub=2), plane order Y, U, V
+              Plane<uint8_t, Comp{ C::Y, 8, 0 }>,
+              Plane<uint8_t, Comp{ C::U, 8, 0 }>,
+              Plane<uint8_t, Comp{ C::V, 8, 0 }>);
+
+PIXPAT_PLANAR(YVU420, 2, 2, // 4:2:0 (h_sub=2, v_sub=2), plane order Y, V, U
+              Plane<uint8_t, Comp{ C::Y, 8, 0 }>,
+              Plane<uint8_t, Comp{ C::V, 8, 0 }>,
+              Plane<uint8_t, Comp{ C::U, 8, 0 }>);
+
+PIXPAT_PLANAR(YUV422, 2, 1, // 4:2:2 (h_sub=2, v_sub=1), plane order Y, U, V
+              Plane<uint8_t, Comp{ C::Y, 8, 0 }>,
+              Plane<uint8_t, Comp{ C::U, 8, 0 }>,
+              Plane<uint8_t, Comp{ C::V, 8, 0 }>);
+
+PIXPAT_PLANAR(YVU422, 2, 1, // 4:2:2 (h_sub=2, v_sub=1), plane order Y, V, U
+              Plane<uint8_t, Comp{ C::Y, 8, 0 }>,
+              Plane<uint8_t, Comp{ C::V, 8, 0 }>,
+              Plane<uint8_t, Comp{ C::U, 8, 0 }>);
+
+PIXPAT_PLANAR(YUV444, 1, 1, // 4:4:4 (h_sub=1, v_sub=1), plane order Y, U, V
+              Plane<uint8_t, Comp{ C::Y, 8, 0 }>,
+              Plane<uint8_t, Comp{ C::U, 8, 0 }>,
+              Plane<uint8_t, Comp{ C::V, 8, 0 }>);
+
+PIXPAT_PLANAR(YVU444, 1, 1, // 4:4:4 (h_sub=1, v_sub=1), plane order Y, V, U
+              Plane<uint8_t, Comp{ C::Y, 8, 0 }>,
+              Plane<uint8_t, Comp{ C::V, 8, 0 }>,
+              Plane<uint8_t, Comp{ C::U, 8, 0 }>);
+
+#undef PIXPAT_PLANAR // helper macro is local to this header
+
+// T430: 3-plane multi-pixel-per-word planar 4:4:4. Each plane carries
+// 3 × 10-bit samples per uint32_t plus a 2-bit X padding bit-field.
+struct T430 : Layout<ColorKind::YUV, 1, 1, // h_sub=1, v_sub=1
+	             Plane<uint32_t, // plane 0: Y samples
+	                   Comp{ C::Y, 10, 0 }, // bits 0..9
+	                   Comp{ C::Y, 10, 10 }, // bits 10..19
+	                   Comp{ C::Y, 10, 20 }, // bits 20..29
+	                   Comp{ C::X, 2,  30 }>, // padding, bits 30..31
+	             Plane<uint32_t, // plane 1: U samples
+	                   Comp{ C::U, 10, 0 }, // bits 0..9
+	                   Comp{ C::U, 10, 10 }, // bits 10..19
+	                   Comp{ C::U, 10, 20 }, // bits 20..29
+	                   Comp{ C::X, 2,  30 }>, // padding, bits 30..31
+	             Plane<uint32_t, // plane 2: V samples
+	                   Comp{ C::V, 10, 0 }, // bits 0..9
+	                   Comp{ C::V, 10, 10 }, // bits 10..19
+	                   Comp{ C::V, 10, 20 }, // bits 20..29
+	                   Comp{ C::X, 2,  30 }> > { // padding, bits 30..31
+	using Source = MultiPixelPlanarSource<T430>; // read side for this layout
+	using Sink   = MultiPixelPlanarSink<T430>;   // write side for this layout
+};
+
+} // namespace pixpat::formats
diff --git a/subprojects/pixpat/pixpat-native/src/formats/yuv_semiplanar.h b/subprojects/pixpat/pixpat-native/src/formats/yuv_semiplanar.h
new file mode 100644
index 0000000..34aea22
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/formats/yuv_semiplanar.h
@@ -0,0 +1,79 @@
+#pragma once
+
+// YUV semiplanar layouts: Y plane + interleaved UV plane.
+//   NV12/NV21 — 4:2:0 (h_sub=2, v_sub=2)
+//   NV16/NV61 — 4:2:2 (h_sub=2, v_sub=1)
+//   P030/P230 — multi-pixel-per-word semiplanar (10-bit Y triplets).
+
+#include "../layout.h"
+#include "../io/semiplanar.h"
+
+namespace pixpat::formats
+{
+
+struct NV12 : Layout<ColorKind::YUV, 2, 2, // 4:2:0 (h_sub=2, v_sub=2)
+	             Plane<uint8_t,  Comp{ C::Y, 8, 0 }>, // plane 0: 8-bit Y
+	             Plane<uint16_t, Comp{ C::U, 8, 0 }, Comp{ C::V, 8, 8 }> > { // plane 1: U in bits 0..7, V in bits 8..15
+	using Source = SemiplanarSource<NV12>; // read side for this layout
+	using Sink   = SemiplanarSink<NV12>;   // write side for this layout
+};
+
+struct NV21 : Layout<ColorKind::YUV, 2, 2, // 4:2:0 (h_sub=2, v_sub=2)
+	             Plane<uint8_t,  Comp{ C::Y, 8, 0 }>, // plane 0: 8-bit Y
+	             Plane<uint16_t, Comp{ C::V, 8, 0 }, Comp{ C::U, 8, 8 }> > { // plane 1: V in bits 0..7, U in bits 8..15
+	using Source = SemiplanarSource<NV21>; // read side for this layout
+	using Sink   = SemiplanarSink<NV21>;   // write side for this layout
+};
+
+struct NV16 : Layout<ColorKind::YUV, 2, 1, // 4:2:2 (h_sub=2, v_sub=1)
+	             Plane<uint8_t,  Comp{ C::Y, 8, 0 }>, // plane 0: 8-bit Y
+	             Plane<uint16_t, Comp{ C::U, 8, 0 }, Comp{ C::V, 8, 8 }> > { // plane 1: U in bits 0..7, V in bits 8..15
+	using Source = SemiplanarSource<NV16>; // read side for this layout
+	using Sink   = SemiplanarSink<NV16>;   // write side for this layout
+};
+
+struct NV61 : Layout<ColorKind::YUV, 2, 1, // 4:2:2 (h_sub=2, v_sub=1)
+	             Plane<uint8_t,  Comp{ C::Y, 8, 0 }>, // plane 0: 8-bit Y
+	             Plane<uint16_t, Comp{ C::V, 8, 0 }, Comp{ C::U, 8, 8 }> > { // plane 1: V in bits 0..7, U in bits 8..15
+	using Source = SemiplanarSource<NV61>; // read side for this layout
+	using Sink   = SemiplanarSink<NV61>;   // write side for this layout
+};
+
+// Multi-pixel-per-word semiplanar (P030: 4:2:0, P230: 4:2:2). Y plane
+// holds 3 × 10-bit Y samples per uint32_t (top 2 bits unused). UV plane
+// holds 3 × (Cb,Cr) pairs per uint64_t (10 bits each, with 2-bit gaps
+// at bits 30-31 and 62-63 — left implicit, no X declared).
+
+struct P030 : Layout<ColorKind::YUV, 2, 2, // 4:2:0 (h_sub=2, v_sub=2)
+	             Plane<uint32_t, // Y plane: 3 samples per 32-bit word
+	                   Comp{ C::Y, 10, 0 }, // bits 0..9
+	                   Comp{ C::Y, 10, 10 }, // bits 10..19
+	                   Comp{ C::Y, 10, 20 }>, // bits 20..29 (30..31 unused)
+	             Plane<uint64_t, // UV plane: 3 (U, V) pairs per 64-bit word
+	                   Comp{ C::U, 10, 0 }, // bits 0..9
+	                   Comp{ C::V, 10, 10 }, // bits 10..19
+	                   Comp{ C::U, 10, 20 }, // bits 20..29 (30..31 unused)
+	                   Comp{ C::V, 10, 32 }, // bits 32..41
+	                   Comp{ C::U, 10, 42 }, // bits 42..51
+	                   Comp{ C::V, 10, 52 }> > { // bits 52..61 (62..63 unused)
+	using Source = MultiPixelSemiplanarSource<P030>; // read side for this layout
+	using Sink   = MultiPixelSemiplanarSink<P030>;   // write side for this layout
+};
+
+struct P230 : Layout<ColorKind::YUV, 2, 1, // 4:2:2 (h_sub=2, v_sub=1)
+	             Plane<uint32_t, // Y plane: 3 samples per 32-bit word
+	                   Comp{ C::Y, 10, 0 }, // bits 0..9
+	                   Comp{ C::Y, 10, 10 }, // bits 10..19
+	                   Comp{ C::Y, 10, 20 }>, // bits 20..29 (30..31 unused)
+	             Plane<uint64_t, // UV plane: 3 (U, V) pairs per 64-bit word
+	                   Comp{ C::U, 10, 0 }, // bits 0..9
+	                   Comp{ C::V, 10, 10 }, // bits 10..19
+	                   Comp{ C::U, 10, 20 }, // bits 20..29 (30..31 unused)
+	                   Comp{ C::V, 10, 32 }, // bits 32..41
+	                   Comp{ C::U, 10, 42 }, // bits 42..51
+	                   Comp{ C::V, 10, 52 }> > { // bits 52..61 (62..63 unused)
+	using Source = MultiPixelSemiplanarSource<P230>; // read side for this layout
+	using Sink   = MultiPixelSemiplanarSink<P230>;   // write side for this layout
+};
+
+} // namespace pixpat::formats
diff --git a/subprojects/pixpat/pixpat-native/src/io.h b/subprojects/pixpat/pixpat-native/src/io.h
new file mode 100644
index 0000000..af24232
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/io.h
@@ -0,0 +1,13 @@
+#pragma once
+
+// Aggregator for the Source / Sink templates in the per-iteration-shape
+// headers under io/; io/gray_packed.h and io/mono_rgb.h are not included
+// here. Encode/decode and load_word/store_word helpers: io/detail.h.
+
+#include "io/detail.h"
+#include "io/packed.h"
+#include "io/semiplanar.h"
+#include "io/planar.h"
+#include "io/packed_yuv.h"
+#include "io/gray.h"
+#include "io/bayer.h"
diff --git a/subprojects/pixpat/pixpat-native/src/io/bayer.h b/subprojects/pixpat/pixpat-native/src/io/bayer.h
new file mode 100644
index 0000000..6b30c0e
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/io/bayer.h
@@ -0,0 +1,318 @@
+#pragma once
+
+// Bayer raw read/write support.
+//
+// Write side: each pixel carries one of R/G/B selected by (x mod 2,
+// y mod 2) and a fixed BayerOrder. Two missing channels per pixel are
+// dropped on encode.
+//
+// Read side: bilinear demosaic over a 3x3 window. The pixel's own
+// channel comes from self; missing channels are averaged from the
+// same-channel neighbours that the Bayer phase guarantees to exist:
+//
+//   * At an R or B pixel, all four cardinal (N, E, S, W) neighbours
+//     carry G and all four diagonal (NE, NW, SE, SW) neighbours carry
+//     the other colour, so each missing channel averages four samples.
+//   * At a G pixel, one missing colour sits in the row neighbours
+//     (W, E) and the other in the column neighbours (N, S), so each
+//     missing channel averages two samples.
+//
+// Sampled coordinates are clamped to the image bounds.
+//
+// The Layout shape is the same as a Y-only single-plane format
+// (storage carries one component plus optional X padding); the
+// BayerOrder is a separate template parameter on the Source / Sink.
+
+#include <array>
+#include <cstdint>
+
+#include "../layout.h"
+#include "csi2.h"
+#include "detail.h"
+
+namespace pixpat
+{
+
+enum class BayerOrder { RGGB, BGGR, GRBG, GBRG }; // 2x2 tile colours, row-major: even row (even x, odd x), then odd row
+
+namespace detail
+{
+constexpr C bayer_pick(BayerOrder o, bool x_even, bool y_even) noexcept
+{
+	// Map the pixel's parity to its slot in the 2x2 tile, row-major:
+	// 0 = (even x, even y), 1 = (odd x, even y), 2 = (even x, odd y),
+	// 3 = (odd x, odd y). Each order names the slots in that sequence.
+	const unsigned slot = (y_even ? 0u : 2u) + (x_even ? 0u : 1u);
+	switch (o) {
+	case BayerOrder::RGGB:
+		return slot == 0 ? C::R : slot == 3 ? C::B : C::G;
+	case BayerOrder::BGGR:
+		return slot == 0 ? C::B : slot == 3 ? C::R : C::G;
+	case BayerOrder::GRBG:
+		return slot == 1 ? C::R : slot == 2 ? C::B : C::G;
+	case BayerOrder::GBRG:
+		return slot == 1 ? C::B : slot == 2 ? C::R : C::G;
+	}
+	return C::G;
+}
+
+constexpr size_t clamp_coord(int v, size_t max_excl) noexcept
+{
+	// Negative input snaps to index 0; overshoot snaps to max_excl - 1.
+	if (v < 0)
+		return 0;
+	const size_t idx = size_t(v);
+	return idx < max_excl ? idx : max_excl - 1;
+}
+} // namespace detail
+
+template <typename L, BayerOrder Order>
+struct BayerSource {
+	using Layout = L;
+	using Pixel  = RGB16;
+
+	static_assert(L::kind == ColorKind::RGB);
+	static_assert(L::num_planes == 1);
+
+	using P = typename L::template plane<0>;
+	static constexpr size_t y_idx = P::template find_pos<C::Y>();
+	static_assert(y_idx < P::num_comps);
+
+	// Raw sample stored at (x, y), decoded to the normalized 16-bit range.
+	static uint16_t read_sample(const Buffer<1>& buf, size_t x, size_t y) noexcept
+	{
+		const uint8_t* row = buf.data[0] + y * buf.stride[0];
+		const auto fields = P::unpack(
+			detail::load_word<P>(row + x * P::bytes_per_pixel));
+		return detail::decode_norm(P::comps[y_idx].bits, fields[y_idx]);
+	}
+
+	// Bilinear demosaic of pixel (x, y) in a W x H image: the pixel's own
+	// channel is taken as stored, the two missing ones are rounded means
+	// of neighbouring samples. The fourth RGB16 member is set to 0.
+	// NOTE(review): at a border a clamped coordinate can land on the
+	// centre pixel, whose channel differs from the neighbour it stands
+	// in for -- confirm that edge behaviour is intended.
+	static RGB16 read(const Buffer<1>& buf, size_t x, size_t y,
+	                  size_t W, size_t H) noexcept
+	{
+		const bool even_col = (x & 1) == 0;
+		const bool even_row = (y & 1) == 0;
+		const size_t west  = detail::clamp_coord(int(x) - 1, W);
+		const size_t east  = detail::clamp_coord(int(x) + 1, W);
+		const size_t north = detail::clamp_coord(int(y) - 1, H);
+		const size_t south = detail::clamp_coord(int(y) + 1, H);
+
+		// Widen to 32 bits so the sums below cannot wrap.
+		const auto at = [&buf](size_t sx, size_t sy) -> uint32_t {
+			return read_sample(buf, sx, sy);
+		};
+		const uint16_t centre = read_sample(buf, x, y);
+		const C self = detail::bayer_pick(Order, even_col, even_row);
+
+		if (self != C::G) {
+			// R or B site: green from the four edge neighbours, the
+			// opposite colour from the four corner neighbours.
+			const uint16_t green = uint16_t(
+				(at(x, north) + at(x, south)
+				 + at(west, y) + at(east, y) + 2u) >> 2);
+			const uint16_t other = uint16_t(
+				(at(west, north) + at(east, north)
+				 + at(west, south) + at(east, south) + 2u) >> 2);
+			return self == C::R
+			       ? RGB16{ centre, green, other, uint16_t(0) }
+			       : RGB16{ other, green, centre, uint16_t(0) };
+		}
+		// G site: one colour sits left/right, the other above/below.
+		const uint16_t row_mean = uint16_t((at(west, y) + at(east, y) + 1u) >> 1);
+		const uint16_t col_mean = uint16_t((at(x, north) + at(x, south) + 1u) >> 1);
+		const bool row_is_red =
+			detail::bayer_pick(Order, !even_col, even_row) == C::R;
+		return row_is_red ? RGB16{ row_mean, centre, col_mean, uint16_t(0) }
+		                  : RGB16{ col_mean, centre, row_mean, uint16_t(0) };
+	}
+};
+
+template <typename L, BayerOrder Order>
+struct BayerSink {
+	using Layout = L;
+	using Pixel  = RGB16;
+
+	static_assert(L::kind == ColorKind::RGB);
+	static_assert(L::num_planes == 1);
+
+	using P = typename L::template plane<0>;
+	static constexpr size_t y_idx = P::template find_pos<C::Y>();
+	static constexpr size_t x_idx = P::template find_pos<C::X>();
+	static constexpr bool has_x = (x_idx < P::num_comps);
+	static_assert(y_idx < P::num_comps);
+
+	static constexpr size_t block_h = 1;
+	static constexpr size_t block_w = 1;
+
+	// Store pixel (bx, by): keep only the channel its Bayer site carries.
+	static void write_block(Buffer<1>& buf, size_t bx, size_t by,
+	                        const RGB16 (&block)[1][1]) noexcept
+	{
+		const RGB16& px = block[0][0];
+		const C site = detail::bayer_pick(Order, (bx & 1) == 0, (by & 1) == 0);
+		uint16_t sample = px.b;
+		if (site == C::R)
+			sample = px.r;
+		else if (site == C::G)
+			sample = px.g;
+		std::array<uint16_t, P::num_comps> fields{};
+		fields[y_idx] = detail::encode_norm(P::comps[y_idx].bits, sample);
+		if constexpr (has_x)
+			fields[x_idx] = 0; // padding bits are written as zero
+		uint8_t* row = buf.data[0] + by * buf.stride[0];
+		detail::store_word<P>(row + bx * P::bytes_per_pixel, P::pack(fields));
+	}
+};
+
+// Single-parameter aliases, one per BayerOrder, so the X-macro can register them without nested template-template params.
+template <typename L> using BayerSource_RGGB = BayerSource<L, BayerOrder::RGGB>; // readers
+template <typename L> using BayerSource_BGGR = BayerSource<L, BayerOrder::BGGR>;
+template <typename L> using BayerSource_GRBG = BayerSource<L, BayerOrder::GRBG>;
+template <typename L> using BayerSource_GBRG = BayerSource<L, BayerOrder::GBRG>;
+
+template <typename L> using BayerSink_RGGB = BayerSink<L, BayerOrder::RGGB>; // writers
+template <typename L> using BayerSink_BGGR = BayerSink<L, BayerOrder::BGGR>;
+template <typename L> using BayerSink_GRBG = BayerSink<L, BayerOrder::GRBG>;
+template <typename L> using BayerSink_GBRG = BayerSink<L, BayerOrder::GBRG>;
+
+// MIPI CSI-2 packed Bayer. The bit layout doesn't fit
+// `Plane<Storage, Comp...>` because each pixel's bits span two
+// non-contiguous bytes, so we use the shared CSI-2 helper (io/csi2.h)
+// to (un)pack samples.
+//
+// The Layout slot is a placeholder (matches the unpacked Bayer of the
+// same bit-depth so the user-facing API can pick the right buffer
+// shape); bytes_per_pixel from the Plane is unused.
+template <typename L, BayerOrder Order, size_t BitDepth>
+struct BayerPackedSource {
+	using Layout = L;
+	using Pixel  = RGB16;
+
+	static_assert(L::kind == ColorKind::RGB);
+	static_assert(L::num_planes == 1);
+	static_assert(BitDepth == 10 || BitDepth == 12);
+
+	using Traits = detail::csi2::packed_traits<BitDepth>;
+	static constexpr size_t ppg = Traits::ppg; // samples per packed group
+	static constexpr size_t bpg = Traits::bpg; // bytes per packed group
+
+	// Left shift that MSB-aligns a stored BitDepth-bit value in 16 bits;
+	// read_sample() additionally fills the vacated low bits.
+	static constexpr unsigned shift = 16 - BitDepth;
+
+	// Sample at (x, y) widened to normalized 16-bit by bit replication,
+	// the mapping detail::decode_norm gives the unpacked BayerSource, so
+	// a full-scale stored value reads 0xFFFF rather than 0xFFC0 / 0xFFF0.
+	static uint16_t read_sample(const Buffer<1>& buf, size_t x, size_t y) noexcept
+	{
+		const uint8_t* src = buf.data[0] + y * buf.stride[0]
+		                     + (x / ppg) * bpg;
+		const uint16_t val = detail::csi2::unpack_sample<BitDepth>(src, x % ppg);
+		return uint16_t((val << shift) | (val >> (BitDepth - shift)));
+	}
+
+	// Bilinear demosaic of pixel (x, y) in a W x H image: the pixel's own
+	// channel is taken as stored, the two missing ones are rounded means
+	// of neighbouring samples (coordinates clamped to the image). The
+	// fourth RGB16 member is set to 0.
+	static RGB16 read(const Buffer<1>& buf, size_t x, size_t y,
+	                  size_t W, size_t H) noexcept
+	{
+		const bool even_col = (x & 1) == 0;
+		const bool even_row = (y & 1) == 0;
+		const size_t west  = detail::clamp_coord(int(x) - 1, W);
+		const size_t east  = detail::clamp_coord(int(x) + 1, W);
+		const size_t north = detail::clamp_coord(int(y) - 1, H);
+		const size_t south = detail::clamp_coord(int(y) + 1, H);
+
+		// Widen to 32 bits so the sums below cannot wrap.
+		const auto at = [&buf](size_t sx, size_t sy) -> uint32_t {
+			return read_sample(buf, sx, sy);
+		};
+		const uint16_t centre = read_sample(buf, x, y);
+		const C self = detail::bayer_pick(Order, even_col, even_row);
+
+		if (self != C::G) {
+			// R or B site: green from the four edge neighbours, the
+			// opposite colour from the four corner neighbours.
+			const uint16_t green = uint16_t(
+				(at(x, north) + at(x, south)
+				 + at(west, y) + at(east, y) + 2u) >> 2);
+			const uint16_t other = uint16_t(
+				(at(west, north) + at(east, north)
+				 + at(west, south) + at(east, south) + 2u) >> 2);
+			return self == C::R
+			       ? RGB16{ centre, green, other, uint16_t(0) }
+			       : RGB16{ other, green, centre, uint16_t(0) };
+		}
+		// G site: one colour sits left/right, the other above/below.
+		const uint16_t row_mean = uint16_t((at(west, y) + at(east, y) + 1u) >> 1);
+		const uint16_t col_mean = uint16_t((at(x, north) + at(x, south) + 1u) >> 1);
+		const bool row_is_red =
+			detail::bayer_pick(Order, !even_col, even_row) == C::R;
+		return row_is_red ? RGB16{ row_mean, centre, col_mean, uint16_t(0) }
+		                  : RGB16{ col_mean, centre, row_mean, uint16_t(0) };
+	}
+};
+
+template <typename L, BayerOrder Order, size_t BitDepth>
+struct BayerPackedSink {
+	using Layout = L;
+	using Pixel  = RGB16;
+
+	static_assert(L::kind == ColorKind::RGB);
+	static_assert(L::num_planes == 1);
+	static_assert(BitDepth == 10 || BitDepth == 12);
+
+	using Traits = detail::csi2::packed_traits<BitDepth>;
+	static constexpr size_t ppg = Traits::ppg; // samples per packed group
+	static constexpr size_t bpg = Traits::bpg; // bytes per packed group
+
+	static constexpr size_t block_h = 1;
+	static constexpr size_t block_w = ppg; // one packed group per block
+
+	// Pack the ppg pixels of the block at (bx, by), one Bayer channel each.
+	static void write_block(Buffer<1>& buf, size_t bx, size_t by,
+	                        const RGB16 (&block)[1][ppg]) noexcept
+	{
+		const bool even_row = (by & 1) == 0;
+		std::array<uint16_t, ppg> group{};
+		for (size_t i = 0; i < ppg; ++i) {
+			const C site = detail::bayer_pick(Order, ((bx + i) & 1) == 0, even_row);
+			uint16_t norm = block[0][i].b;
+			if (site == C::R)
+				norm = block[0][i].r;
+			else if (site == C::G)
+				norm = block[0][i].g;
+			group[i] = uint16_t(norm >> (16 - BitDepth)); // keep the top BitDepth bits
+		}
+		uint8_t* dst = buf.data[0] + by * buf.stride[0] + (bx / ppg) * bpg;
+		detail::csi2::pack_group<BitDepth>(dst, group);
+	}
+};
+
+template <typename L> using BayerPackedSource_RGGB10 = BayerPackedSource<L, BayerOrder::RGGB, 10>; // 10-bit packed readers, one per order
+template <typename L> using BayerPackedSource_BGGR10 = BayerPackedSource<L, BayerOrder::BGGR, 10>;
+template <typename L> using BayerPackedSource_GRBG10 = BayerPackedSource<L, BayerOrder::GRBG, 10>;
+template <typename L> using BayerPackedSource_GBRG10 = BayerPackedSource<L, BayerOrder::GBRG, 10>;
+template <typename L> using BayerPackedSource_RGGB12 = BayerPackedSource<L, BayerOrder::RGGB, 12>; // 12-bit packed readers
+template <typename L> using BayerPackedSource_BGGR12 = BayerPackedSource<L, BayerOrder::BGGR, 12>;
+template <typename L> using BayerPackedSource_GRBG12 = BayerPackedSource<L, BayerOrder::GRBG, 12>;
+template <typename L> using BayerPackedSource_GBRG12 = BayerPackedSource<L, BayerOrder::GBRG, 12>;
+
+template <typename L> using BayerPackedSink_RGGB10 = BayerPackedSink<L, BayerOrder::RGGB, 10>; // 10-bit packed writers, one per order
+template <typename L> using BayerPackedSink_BGGR10 = BayerPackedSink<L, BayerOrder::BGGR, 10>;
+template <typename L> using BayerPackedSink_GRBG10 = BayerPackedSink<L, BayerOrder::GRBG, 10>;
+template <typename L> using BayerPackedSink_GBRG10 = BayerPackedSink<L, BayerOrder::GBRG, 10>;
+template <typename L> using BayerPackedSink_RGGB12 = BayerPackedSink<L, BayerOrder::RGGB, 12>; // 12-bit packed writers
+template <typename L> using BayerPackedSink_BGGR12 = BayerPackedSink<L, BayerOrder::BGGR, 12>;
+template <typename L> using BayerPackedSink_GRBG12 = BayerPackedSink<L, BayerOrder::GRBG, 12>;
+template <typename L> using BayerPackedSink_GBRG12 = BayerPackedSink<L, BayerOrder::GBRG, 12>;
+
+} // namespace pixpat
diff --git a/subprojects/pixpat/pixpat-native/src/io/csi2.h b/subprojects/pixpat/pixpat-native/src/io/csi2.h
new file mode 100644
index 0000000..59a8f8d
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/io/csi2.h
@@ -0,0 +1,80 @@
+#pragma once
+
+// Shared MIPI CSI-2 byte (un)packing for the 10P / 12P forms used by
+// Bayer raw and Y-only grayscale.
+//
+//   10P: 4 samples in 5 bytes — bytes 0..3 hold the high 8 bits of
+//        samples 0..3; byte 4 holds 4 x 2 LSBs (sample 0 in bits 6..7,
+//        sample 1 in bits 4..5, ...).
+//   12P: 2 samples in 3 bytes — bytes 0..1 hold the high 8 bits of
+//        samples 0..1; byte 2 holds 2 x 4 LSBs (sample 0 in bits 4..7,
+//        sample 1 in bits 0..3).
+//
+// Helpers deal in the stored integer (low BitDepth bits set);
+// normalization to/from the 16-bit pivot stays in the caller.
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+
+namespace pixpat::detail::csi2
+{
+
+template <size_t BitDepth>
+struct packed_traits; // declared only; the specializations below cover 10 and 12 bits
+
+template <>
+struct packed_traits<10> { // 10P: 4 samples in 5 bytes
+	static constexpr size_t ppg = 4; // samples per packed group
+	static constexpr size_t bpg = 5; // bytes per packed group
+};
+
+template <>
+struct packed_traits<12> { // 12P: 2 samples in 3 bytes
+	static constexpr size_t ppg = 2; // samples per packed group
+	static constexpr size_t bpg = 3; // bytes per packed group
+};
+
+// Pull sample `i` (in-group index, 0..ppg-1) out of the packed group at
+// `src`. The high 8 bits come from byte `i`, the remaining low bits from
+// the group's last byte; the result sits in the low BitDepth bits.
+template <size_t BitDepth>
+inline uint16_t unpack_sample(const uint8_t* src, size_t i) noexcept
+{
+	const unsigned msb = src[i];
+	if constexpr (BitDepth == 10) {
+		const size_t down  = (3 - i) * 2; // sample 0 sits in bits 6..7
+		const unsigned low = (src[4] >> down) & 0x03;
+		return uint16_t((msb << 2) | low);
+	} else { // 12
+		const unsigned nibble_shift = (i == 0) ? 4u : 0u; // sample 0: high nibble
+		const unsigned low = (src[2] >> nibble_shift) & 0x0F;
+		return uint16_t((msb << 4) | low);
+	}
+}
+
+// Pack `ppg` samples (only their low BitDepth bits are used) into one
+// packed group of `bpg` bytes starting at `dst`.
+template <size_t BitDepth>
+inline void pack_group(
+	uint8_t* dst,
+	const std::array<uint16_t, packed_traits<BitDepth>::ppg>& vals) noexcept
+{
+	if constexpr (BitDepth == 10) {
+		// Bytes 0..3 take each sample's high 8 bits; the 2-bit remainders
+		// accumulate MSB-first, so sample 0 lands in bits 6..7 of byte 4.
+		uint8_t tail = 0;
+		for (size_t i = 0; i < 4; ++i) {
+			dst[i] = uint8_t(vals[i] >> 2);
+			tail = uint8_t((tail << 2) | (vals[i] & 0x03));
+		}
+		dst[4] = tail;
+	} else { // 12
+		// Two high bytes, then both 4-bit remainders share byte 2.
+		dst[0] = uint8_t(vals[0] >> 4);
+		dst[1] = uint8_t(vals[1] >> 4);
+		dst[2] = uint8_t(((vals[0] & 0x0F) << 4) | (vals[1] & 0x0F));
+	}
+}
+
+} // namespace pixpat::detail::csi2
diff --git a/subprojects/pixpat/pixpat-native/src/io/detail.h b/subprojects/pixpat/pixpat-native/src/io/detail.h
new file mode 100644
index 0000000..cb2b9fb
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/io/detail.h
@@ -0,0 +1,62 @@
+#pragma once
+
+// Per-component encode/decode against the descriptor + memcpy-based
+// load/store_word helpers. Shared by every Source / Sink template.
+
+#include <cstdint>
+#include <cstring>
+
+#include "../layout.h"
+
+namespace pixpat::detail
+{
+
+// Decode an N-bit stored value into the 16-bit normalized space and
+// encode it back. Decode bit-replicates the stored value across the 16
+// bits so that N-bit max maps to normalized max (e.g. 8-bit 0xFF →
+// 0xFFFF, not 0xFF00). Encode is a plain truncating right-shift: the
+// replicated bits land in the low (16-N) bits and get dropped, so
+// stored→norm→stored is exact for any N in [1, 16].
+//
+// `bits` is taken at runtime; in every call site it traces back to a
+// constexpr Plane::comps[I].bits read, which the optimizer constant-
+// folds after inlining.
+
+constexpr uint16_t decode_norm(unsigned bits, uint16_t stored) noexcept
+{
+	// bits == 0 carries no data; bail out (a zero step would never end).
+	if (bits == 0)
+		return 0;
+	// Tile the N-bit value down from the MSB; a single OR only covers 2N
+	// bits, so N < 8 (RGB565, RGBA4444, 1-bit alpha, ...) needs a loop.
+	const int N = int(bits);
+	uint32_t result = 0;
+	for (int s = 16 - N; s > -N; s -= N)
+		result |= s >= 0 ? uint32_t(stored) << s
+		                 : uint32_t(stored) >> -s;
+	return uint16_t(result);
+}
+
+constexpr uint16_t encode_norm(unsigned bits, uint16_t norm) noexcept
+{
+	return static_cast<uint16_t>(norm >> (16u - bits)); // keep the top `bits` bits
+}
+
+// Read one storage word from `p`. Only bytes_per_pixel bytes are copied
+// (covers non-tight layouts such as 24-bit BGR888); the rest stay zero.
+template <typename Plane>
+inline typename Plane::storage_t load_word(const uint8_t* p) noexcept
+{
+	using Word = typename Plane::storage_t;
+	Word out{};
+	std::memcpy(&out, p, Plane::bytes_per_pixel);
+	return out;
+}
+
+template <typename Plane>
+inline void store_word(uint8_t* p, typename Plane::storage_t word) noexcept // counterpart of load_word
+{
+	std::memcpy(p, &word, Plane::bytes_per_pixel); // writes bytes_per_pixel bytes, not sizeof(word)
+}
+
+} // namespace pixpat::detail
diff --git a/subprojects/pixpat/pixpat-native/src/io/gray.h b/subprojects/pixpat/pixpat-native/src/io/gray.h
new file mode 100644
index 0000000..d175b68
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/io/gray.h
@@ -0,0 +1,153 @@
+#pragma once
+
+// Grayscale (Y8 / Y10 / Y12 / Y16) and multi-pixel-per-word grayscale
+// (XYYY2101010: 3 Y components in one uint32_t). Modeled as a YUV format
+// with neutral chroma synthesized on read so cross-color-kind ColorXfm
+// produces R=G=B=Y'. The sink encodes Y from YUV16 and ignores U/V.
+// Y10/Y12 carry an X padding bitfield which we zero out on write.
+// Neutral chroma in normalized-16 is 0x8000 (the midpoint of [0, 0xFFFF]).
+
+#include <array>
+
+#include "../layout.h"
+#include "detail.h"
+
+namespace pixpat
+{
+
+template <typename L>
+struct GraySource {
+	using Layout = L;
+	using Pixel  = YUV16;
+
+	static_assert(L::kind == ColorKind::YUV);
+	static_assert(L::num_planes == 1);
+
+	using P = typename L::template plane<0>;
+	static constexpr size_t y_idx = P::template find_pos<C::Y>();
+	static_assert(y_idx < P::num_comps);
+
+	// Y decoded from storage, both chroma values 0x8000, fourth member 0.
+	static YUV16 read(const Buffer<1>& buf, size_t x, size_t y,
+	                  [[maybe_unused]] size_t W,
+	                  [[maybe_unused]] size_t H) noexcept
+	{
+		constexpr uint16_t neutral = 0x8000;
+		const uint8_t* row = buf.data[0] + y * buf.stride[0];
+		const auto fields = P::unpack(detail::load_word<P>(row + x * P::bytes_per_pixel));
+		const uint16_t luma =
+			detail::decode_norm(P::comps[y_idx].bits, fields[y_idx]);
+		return YUV16{ luma, neutral, neutral, uint16_t(0) };
+	}
+};
+
+template <typename L>
+struct GraySink {
+	using Layout = L;
+	using Pixel  = YUV16;
+
+	static_assert(L::kind == ColorKind::YUV);
+	static_assert(L::num_planes == 1);
+
+	using P = typename L::template plane<0>;
+	static constexpr size_t y_idx = P::template find_pos<C::Y>();
+	static constexpr size_t x_idx = P::template find_pos<C::X>();
+	static constexpr bool has_x = (x_idx < P::num_comps);
+	static_assert(y_idx < P::num_comps);
+
+	static constexpr size_t block_h = 1;
+	static constexpr size_t block_w = 1;
+
+	// Encode the pixel's Y into its storage word; U and V are not stored.
+	static void write_block(Buffer<1>& buf, size_t bx, size_t by,
+	                        const YUV16 (&block)[1][1]) noexcept
+	{
+		const uint16_t luma = block[0][0].y;
+		std::array<uint16_t, P::num_comps> fields{};
+		fields[y_idx] = detail::encode_norm(P::comps[y_idx].bits, luma);
+		if constexpr (has_x)
+			fields[x_idx] = 0; // padding bits are written as zero
+		uint8_t* row = buf.data[0] + by * buf.stride[0];
+		detail::store_word<P>(row + bx * P::bytes_per_pixel, P::pack(fields));
+	}
+};
+
+// Multi-pixel-per-word grayscale. The Layout carries one C::Y entry per
+// pixel in the group; pixels_per_word is derived from how many C::Y
+// entries the layout has. All Y components must share the same bit width
+// (so the encode/decode shift is shared). block_w = ppw so the sink
+// writes one storage word per block.
+template <typename L>
+struct MultiPixelGraySource {
+	using Layout = L;
+	using Pixel  = YUV16;
+
+	static_assert(L::kind == ColorKind::YUV);
+	static_assert(L::num_planes == 1);
+
+	using P = typename L::template plane<0>;
+	static constexpr size_t ppw = P::template component_count<C::Y>();
+	static_assert(ppw >= 1);
+
+	// Bit width of the first Y entry; every Y entry is expected to match.
+	static constexpr unsigned y_bits = P::comps[P::template find_pos<C::Y>(0)].bits;
+
+	// Read pixel (x, y): pick its Y out of the shared storage word and
+	// pair it with 0x8000 for both chroma values (fourth member 0).
+	static YUV16 read(const Buffer<1>& buf, size_t x, size_t y,
+	                  [[maybe_unused]] size_t W,
+	                  [[maybe_unused]] size_t H) noexcept
+	{
+		constexpr uint16_t neutral = 0x8000;
+		const size_t word_idx = x / ppw; // which storage word on the row
+		const size_t slot     = x % ppw; // which Y inside that word
+		const uint8_t* row = buf.data[0] + y * buf.stride[0];
+		const auto fields = P::unpack(
+			detail::load_word<P>(row + word_idx * P::bytes_per_pixel));
+
+		// `slot` is a run-time value, so find_pos walks comps at run time;
+		// presumably it returns the position of the slot-th C::Y entry.
+		const size_t y_pos = P::template find_pos<C::Y>(slot);
+		const uint16_t luma = detail::decode_norm(y_bits, fields[y_pos]);
+		return YUV16{ luma, neutral, neutral, uint16_t(0) };
+	}
+};
+
+template <typename L>
+struct MultiPixelGraySink {
+	using Layout = L;
+	using Pixel  = YUV16;
+
+	static_assert(L::kind == ColorKind::YUV);
+	static_assert(L::num_planes == 1);
+
+	using P = typename L::template plane<0>;
+	static constexpr size_t ppw = P::template component_count<C::Y>();
+	static constexpr size_t x_idx = P::template find_pos<C::X>();
+	static constexpr bool has_x = (x_idx < P::num_comps);
+	static_assert(ppw >= 1);
+
+	static constexpr size_t block_h = 1;
+	static constexpr size_t block_w = ppw; // one storage word per block
+
+	// Encode the ppw pixels of the block at (bx, by) into one storage word.
+	static void write_block(Buffer<1>& buf, size_t bx, size_t by,
+	                        const YUV16 (&block)[1][ppw]) noexcept
+	{
+		// Bit width of the first Y entry; every Y entry is expected to match.
+		constexpr unsigned y_bits = P::comps[P::template find_pos<C::Y>(0)].bits;
+		std::array<uint16_t, P::num_comps> fields{};
+		for (size_t i = 0; i < ppw; ++i)
+			fields[P::template find_pos<C::Y>(i)] =
+				detail::encode_norm(y_bits, block[0][i].y);
+		if constexpr (has_x)
+			fields[x_idx] = 0; // padding bits are written as zero
+
+		const size_t word_idx = bx / ppw;
+		uint8_t* row = buf.data[0] + by * buf.stride[0];
+		detail::store_word<P>(row + word_idx * P::bytes_per_pixel,
+		                      P::pack(fields));
+	}
+};
+
+} // namespace pixpat
diff --git a/subprojects/pixpat/pixpat-native/src/io/gray_packed.h b/subprojects/pixpat/pixpat-native/src/io/gray_packed.h
new file mode 100644
index 0000000..dc1fa68
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/io/gray_packed.h
@@ -0,0 +1,78 @@
+#pragma once
+
+// MIPI CSI-2 packed grayscale (Y10P / Y12P). Same byte packing as
+// Bayer10P/Bayer12P (see io/csi2.h) but every sample is Y; the source
+// emits neutral chroma to keep cross-color-kind ColorXfm consistent
+// with GraySource.
+//
+// The Layout slot is a placeholder (matches the unpacked Y8 storage
+// shape so dispatch plumbing is uniform); bytes_per_pixel from the
+// Plane is unused.
+
+#include <array>
+#include <cstdint>
+
+#include "../layout.h"
+#include "csi2.h"
+
+namespace pixpat
+{
+
+template <typename L, size_t BitDepth>
+struct GrayPackedSource {
+	using Layout = L;
+	using Pixel  = YUV16;
+
+	static_assert(L::kind == ColorKind::YUV);
+	static_assert(L::num_planes == 1);
+	static_assert(BitDepth == 10 || BitDepth == 12);
+
+	using Traits = detail::csi2::packed_traits<BitDepth>;
+	static constexpr size_t ppg = Traits::ppg; // samples per packed group
+	static constexpr size_t bpg = Traits::bpg; // bytes per packed group
+	static constexpr unsigned shift = 16 - BitDepth; // MSB-aligning left shift
+
+	// Y at (x, y) widened to normalized 16-bit by bit replication, the
+	// mapping GraySource (io/gray.h) gets from decode_norm, so full scale
+	// reads 0xFFFF. Chroma is neutral (0x8000); the fourth member is 0.
+	static YUV16 read(const Buffer<1>& buf, size_t x, size_t y,
+	                  [[maybe_unused]] size_t W,
+	                  [[maybe_unused]] size_t H) noexcept
+	{
+		const uint8_t* src = buf.data[0] + y * buf.stride[0] + (x / ppg) * bpg;
+		const uint16_t val = detail::csi2::unpack_sample<BitDepth>(src, x % ppg);
+		const uint16_t luma = uint16_t((val << shift) | (val >> (BitDepth - shift)));
+		return YUV16{ luma, 0x8000, 0x8000, uint16_t(0) };
+	}
+};
+
+template <typename L, size_t BitDepth>
+struct GrayPackedSink {
+	using Layout = L;
+	using Pixel  = YUV16;
+
+	static_assert(L::kind == ColorKind::YUV);
+	static_assert(L::num_planes == 1);
+	static_assert(BitDepth == 10 || BitDepth == 12);
+
+	using Traits = detail::csi2::packed_traits<BitDepth>;
+	static constexpr size_t ppg = Traits::ppg; // samples per packed group
+	static constexpr size_t bpg = Traits::bpg; // bytes per packed group
+
+	static constexpr size_t block_h = 1;
+	static constexpr size_t block_w = ppg; // one packed group per block
+
+	// Truncate each pixel's Y to BitDepth bits and pack one group.
+	static void write_block(Buffer<1>& buf, size_t bx, size_t by,
+	                        const YUV16 (&block)[1][ppg]) noexcept
+	{
+		constexpr unsigned drop = 16 - BitDepth; // low bits discarded
+		std::array<uint16_t, ppg> group{};
+		for (size_t i = 0; i < ppg; ++i)
+			group[i] = uint16_t(block[0][i].y >> drop);
+		uint8_t* dst = buf.data[0] + by * buf.stride[0] + (bx / ppg) * bpg;
+		detail::csi2::pack_group<BitDepth>(dst, group);
+	}
+};
+
+} // namespace pixpat
diff --git a/subprojects/pixpat/pixpat-native/src/io/mono_rgb.h b/subprojects/pixpat/pixpat-native/src/io/mono_rgb.h
new file mode 100644
index 0000000..f2f8206
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/io/mono_rgb.h
@@ -0,0 +1,72 @@
+#pragma once
+
+// Single-channel RGB formats (R8). Storage carries one R component;
+// MonoRGBSource synthesizes G=B=R on read so cross-color-kind ColorXfm
+// produces sensible Y from R alone. MonoRGBSink encodes R and ignores
+// G/B/A (and zeroes any X padding). Symmetric to GraySource/GraySink
+// (io/gray.h) but for ColorKind::RGB on C::R.
+
+#include <array>
+
+#include "../layout.h"
+#include "detail.h"
+
+namespace pixpat
+{
+
+template <typename L>
+struct MonoRGBSource {
+	static_assert(L::num_planes == 1);
+	static_assert(L::kind == ColorKind::RGB);
+
+	using Layout = L;
+	using Pixel  = RGB16;
+	using P = typename L::template plane<0>;
+
+	// Position of the R component inside the storage word.
+	static constexpr size_t r_idx = P::template find_pos<C::R>();
+	static_assert(r_idx < P::num_comps);
+
+	// Decode R at (x, y) and replicate it into G and B; alpha is 0.
+	static RGB16 read(const Buffer<1>& buf, size_t x, size_t y,
+	                  [[maybe_unused]] size_t W, [[maybe_unused]] size_t H) noexcept
+	{
+		const uint8_t* row = buf.data[0] + y * buf.stride[0];
+		const auto fields = P::unpack(detail::load_word<P>(row + x * P::bytes_per_pixel));
+		const uint16_t red = detail::decode_norm(P::comps[r_idx].bits, fields[r_idx]);
+		return RGB16{ red, red, red, uint16_t(0) };
+	}
+};
+
+template <typename L>
+struct MonoRGBSink {
+	static_assert(L::num_planes == 1);
+	static_assert(L::kind == ColorKind::RGB);
+
+	using Layout = L;
+	using Pixel  = RGB16;
+	using P = typename L::template plane<0>;
+
+	static constexpr size_t r_idx = P::template find_pos<C::R>();
+	static constexpr size_t x_idx = P::template find_pos<C::X>();
+	static_assert(r_idx < P::num_comps);
+	// find_pos yields num_comps when the plane has no X padding.
+	static constexpr bool has_x = (x_idx < P::num_comps);
+
+	static constexpr size_t block_w = 1;
+	static constexpr size_t block_h = 1;
+
+	// Store only R of the 1x1 block at (bx, by); all other components stay 0.
+	static void write_block(Buffer<1>& buf, size_t bx, size_t by,
+	                        const RGB16 (&block)[1][1]) noexcept
+	{
+		uint8_t* out = buf.data[0] + by * buf.stride[0] + bx * P::bytes_per_pixel;
+		std::array<uint16_t, P::num_comps> fields{};
+		if constexpr (has_x)
+			fields[x_idx] = 0;
+		fields[r_idx] = detail::encode_norm(P::comps[r_idx].bits, block[0][0].r);
+		detail::store_word<P>(out, P::pack(fields));
+	}
+};
+
+} // namespace pixpat
diff --git a/subprojects/pixpat/pixpat-native/src/io/packed.h b/subprojects/pixpat/pixpat-native/src/io/packed.h
new file mode 100644
index 0000000..9d953bc
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/io/packed.h
@@ -0,0 +1,106 @@
+#pragma once
+
+// Single-plane, single-pixel-per-storage-word formats. Works for both
+// RGB layouts (XRGB8888, RGB565, ABGR16161616, ...) and YUV
+// single-pixel layouts (XVUY2101010, AVUY16161616). Pixel type follows
+// L::kind; the three mandatory components are R/G/B for RGB or Y/U/V
+// for YUV. Both `RGB16` and `YUV16` are 4 uint16_t with the alpha last,
+// so aggregate-init by position works for either.
+
+#include <array>
+#include <type_traits>
+
+#include "../layout.h"
+#include "detail.h"
+
+namespace pixpat
+{
+
+template <typename L>
+struct PackedSource {
+	static_assert(L::num_planes == 1);
+
+	using Layout = L;
+	using Pixel  = std::conditional_t<L::kind == ColorKind::RGB, RGB16, YUV16>;
+	using P = typename L::template plane<0>;
+
+	// Tags of the three mandatory channels: R/G/B for RGB layouts,
+	// Y/U/V otherwise.
+	static constexpr C c0 = (L::kind != ColorKind::RGB) ? C::Y : C::R;
+	static constexpr C c1 = (L::kind != ColorKind::RGB) ? C::U : C::G;
+	static constexpr C c2 = (L::kind != ColorKind::RGB) ? C::V : C::B;
+
+	static constexpr size_t i0 = P::template find_pos<c0>();
+	static constexpr size_t i1 = P::template find_pos<c1>();
+	static constexpr size_t i2 = P::template find_pos<c2>();
+	static_assert(i0 < P::num_comps && i1 < P::num_comps && i2 < P::num_comps);
+
+	// Alpha is optional; layouts without it read back a = 0.
+	static constexpr size_t a_idx = P::template find_pos<C::A>();
+	static constexpr bool has_a = (a_idx < P::num_comps);
+
+	static Pixel read(const Buffer<1>& buf, size_t x, size_t y,
+	                  [[maybe_unused]] size_t W, [[maybe_unused]] size_t H) noexcept
+	{
+		const uint8_t* row = buf.data[0] + y * buf.stride[0];
+		const auto fields = P::unpack(detail::load_word<P>(row + x * P::bytes_per_pixel));
+		uint16_t alpha = 0;
+		if constexpr (has_a)
+			alpha = detail::decode_norm(P::comps[a_idx].bits, fields[a_idx]);
+		return Pixel{ detail::decode_norm(P::comps[i0].bits, fields[i0]),
+		              detail::decode_norm(P::comps[i1].bits, fields[i1]),
+		              detail::decode_norm(P::comps[i2].bits, fields[i2]), alpha };
+	}
+};
+
+template <typename L>
+struct PackedSink {
+	static_assert(L::num_planes == 1);
+
+	using Layout = L;
+	using Pixel  = std::conditional_t<L::kind == ColorKind::RGB, RGB16, YUV16>;
+	using P = typename L::template plane<0>;
+
+	// Tags of the three mandatory channels: R/G/B for RGB layouts,
+	// Y/U/V otherwise.
+	static constexpr C c0 = (L::kind != ColorKind::RGB) ? C::Y : C::R;
+	static constexpr C c1 = (L::kind != ColorKind::RGB) ? C::U : C::G;
+	static constexpr C c2 = (L::kind != ColorKind::RGB) ? C::V : C::B;
+
+	static constexpr size_t i0 = P::template find_pos<c0>();
+	static constexpr size_t i1 = P::template find_pos<c1>();
+	static constexpr size_t i2 = P::template find_pos<c2>();
+	static_assert(i0 < P::num_comps && i1 < P::num_comps && i2 < P::num_comps);
+
+	// Optional components: find_pos returns num_comps when absent.
+	static constexpr size_t x_idx = P::template find_pos<C::X>();
+	static constexpr size_t a_idx = P::template find_pos<C::A>();
+	static constexpr bool has_x = (x_idx < P::num_comps);
+	static constexpr bool has_a = (a_idx < P::num_comps);
+
+	static constexpr size_t block_w = 1;
+	static constexpr size_t block_h = 1;
+
+	// Encode the 1x1 block at (bx, by) into one storage word. The pixel
+	// is read through the member names matching L::kind (.r/.g/.b or
+	// .y/.u/.v); X is written as 0, alpha only when the layout has A.
+	static void write_block(Buffer<1>& buf, size_t bx, size_t by,
+	                        const Pixel (&block)[1][1]) noexcept
+	{
+		const Pixel& px = block[0][0];
+		std::array<uint16_t, 3> ch{};
+		if constexpr (L::kind == ColorKind::RGB) ch = { px.r, px.g, px.b };
+		else ch = { px.y, px.u, px.v };
+		std::array<uint16_t, P::num_comps> fields{};
+		fields[i0] = detail::encode_norm(P::comps[i0].bits, ch[0]);
+		fields[i1] = detail::encode_norm(P::comps[i1].bits, ch[1]);
+		fields[i2] = detail::encode_norm(P::comps[i2].bits, ch[2]);
+		if constexpr (has_a)
+			fields[a_idx] = detail::encode_norm(P::comps[a_idx].bits, px.a);
+		if constexpr (has_x)
+			fields[x_idx] = 0;
+		detail::store_word<P>(buf.data[0] + by * buf.stride[0] + bx * P::bytes_per_pixel, P::pack(fields));
+	}
+};
+
+} // namespace pixpat
diff --git a/subprojects/pixpat/pixpat-native/src/io/packed_yuv.h b/subprojects/pixpat/pixpat-native/src/io/packed_yuv.h
new file mode 100644
index 0000000..90c8b2f
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/io/packed_yuv.h
@@ -0,0 +1,89 @@
+#pragma once
+
+// Packed YUV 4:2:2 (YUYV / YVYU / UYVY / VYUY): two pixels per 32-bit
+// word, one shared chroma pair. The Layout uses two C::Y entries plus
+// one each of C::U / C::V; we resolve the duplicate Y via
+// find_pos<C::Y>(n).
+
+#include <array>
+
+#include "../layout.h"
+#include "detail.h"
+
+namespace pixpat
+{
+
+template <typename L>
+struct PackedYUVSource {
+	using Layout = L;
+	using Pixel  = YUV16;
+
+	static_assert(L::kind == ColorKind::YUV);
+	static_assert(L::num_planes == 1);
+	static_assert(L::h_sub == 2 && L::v_sub == 1);
+
+	using P = typename L::template plane<0>;
+	static constexpr size_t y0_idx = P::template find_pos<C::Y>(0);
+	static constexpr size_t y1_idx = P::template find_pos<C::Y>(1);
+	static constexpr size_t u_idx  = P::template find_pos<C::U>();
+	static constexpr size_t v_idx  = P::template find_pos<C::V>();
+	// find_pos returns num_comps for a missing component; reject such
+	// layouts here. read() also decodes either Y with y0's bit width.
+	static_assert(y0_idx < P::num_comps && y1_idx < P::num_comps && u_idx < P::num_comps && v_idx < P::num_comps);
+	static_assert(P::comps[y0_idx].bits == P::comps[y1_idx].bits);
+
+	// Pixel (x, y): even x takes the first Y of the word, odd x the second; a = 0.
+	static YUV16 read(const Buffer<1>& buf, size_t x, size_t y,
+	                  [[maybe_unused]] size_t W, [[maybe_unused]] size_t H) noexcept
+	{
+		const uint8_t* p = buf.data[0] + y * buf.stride[0] + (x / 2) * P::bytes_per_pixel;
+		const auto vals = P::unpack(detail::load_word<P>(p));
+		return YUV16{
+		        detail::decode_norm(P::comps[y0_idx].bits, vals[(x & 1) ? y1_idx : y0_idx]),
+		        detail::decode_norm(P::comps[u_idx].bits, vals[u_idx]),
+		        detail::decode_norm(P::comps[v_idx].bits, vals[v_idx]),
+		        uint16_t(0),
+		};
+	}
+};
+
+template <typename L>
+struct PackedYUVSink {
+	using Layout = L;
+	using Pixel  = YUV16;
+
+	static_assert(L::kind == ColorKind::YUV);
+	static_assert(L::num_planes == 1);
+	static_assert(L::h_sub == 2 && L::v_sub == 1);
+
+	using P = typename L::template plane<0>;
+	static constexpr size_t y0_idx = P::template find_pos<C::Y>(0);
+	static constexpr size_t y1_idx = P::template find_pos<C::Y>(1);
+	static constexpr size_t u_idx  = P::template find_pos<C::U>();
+	static constexpr size_t v_idx  = P::template find_pos<C::V>();
+	// find_pos returns num_comps for a missing component; reject such
+	// layouts here instead of writing past the array in write_block().
+	static_assert(y0_idx < P::num_comps && y1_idx < P::num_comps && u_idx < P::num_comps && v_idx < P::num_comps);
+
+	static constexpr size_t block_h = 1;
+	static constexpr size_t block_w = 2;
+
+	// Pack two adjacent pixels into one word: both Y kept, U and V averaged.
+	static void write_block(Buffer<1>& buf, size_t bx, size_t by,
+	                        const YUV16 (&block)[1][2]) noexcept
+	{
+		const YUV16& a = block[0][0];
+		const YUV16& b = block[0][1];
+		// Chroma mean in normalized-16 space, summed in 32 bits; truncates.
+		const auto u_avg = uint16_t((uint32_t(a.u) + uint32_t(b.u)) / 2);
+		const auto v_avg = uint16_t((uint32_t(a.v) + uint32_t(b.v)) / 2);
+		std::array<uint16_t, P::num_comps> v{};
+		v[y0_idx] = detail::encode_norm(P::comps[y0_idx].bits, a.y);
+		v[y1_idx] = detail::encode_norm(P::comps[y1_idx].bits, b.y);
+		v[u_idx]  = detail::encode_norm(P::comps[u_idx].bits, u_avg);
+		v[v_idx]  = detail::encode_norm(P::comps[v_idx].bits, v_avg);
+		detail::store_word<P>(buf.data[0] + by * buf.stride[0] + (bx / 2) * P::bytes_per_pixel, P::pack(v));
+	}
+};
+
+} // namespace pixpat
diff --git a/subprojects/pixpat/pixpat-native/src/io/planar.h b/subprojects/pixpat/pixpat-native/src/io/planar.h
new file mode 100644
index 0000000..0dab685
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/io/planar.h
@@ -0,0 +1,257 @@
+#pragma once
+
+// 3-plane planar YUV. Two flavours:
+//
+//   PlanarSource / PlanarSink — YUV/YVU 420/422/444, single Y per word,
+//     single chroma per word. Chroma is averaged over h_sub × v_sub
+//     on write.
+//
+//   MultiPixelPlanarSource / MultiPixelPlanarSink — T430, multi-pixel-
+//     per-word planar 4:4:4 (3 samples per uint32_t in each of 3
+//     planes, plus 2-bit X padding). block_w = ppw, block_h = 1.
+//
+// Plane indices for Y / U / V are looked up via Layout::find_plane<C>(),
+// so swap_uv layouts (YVU vs YUV) work without separate templates.
+
+#include <array>
+
+#include "../layout.h"
+#include "detail.h"
+
+namespace pixpat
+{
+
+template <typename L>
+struct PlanarSource {
+	static_assert(L::num_planes == 3);
+	static_assert(L::kind == ColorKind::YUV);
+
+	using Layout = L;
+	using Pixel  = YUV16;
+
+	// Which plane holds which component; resolved from the layout so
+	// YUV and YVU orderings share this template.
+	static constexpr size_t y_plane = L::template find_plane<C::Y>();
+	static constexpr size_t u_plane = L::template find_plane<C::U>();
+	static constexpr size_t v_plane = L::template find_plane<C::V>();
+
+	using YP = typename L::template plane<y_plane>;
+	using UP = typename L::template plane<u_plane>;
+	using VP = typename L::template plane<v_plane>;
+
+	// Read the pixel at (x, y). Luma is fetched at full resolution,
+	// chroma at (x / h_sub, y / v_sub). Component 0 of each plane's
+	// word is decoded, and alpha is returned as 0.
+	static YUV16 read(const Buffer<3>& buf, size_t x, size_t y,
+	                  [[maybe_unused]] size_t W, [[maybe_unused]] size_t H) noexcept
+	{
+		const size_t cx = x / L::h_sub;
+		const size_t cy = y / L::v_sub;
+
+		const uint8_t* y_ptr = buf.data[y_plane] + y * buf.stride[y_plane] + x * YP::bytes_per_pixel;
+		const uint8_t* u_ptr = buf.data[u_plane] + cy * buf.stride[u_plane] + cx * UP::bytes_per_pixel;
+		const uint8_t* v_ptr = buf.data[v_plane] + cy * buf.stride[v_plane] + cx * VP::bytes_per_pixel;
+
+		const auto luma = detail::decode_norm(
+			YP::comps[0].bits, YP::unpack(detail::load_word<YP>(y_ptr))[0]);
+		const auto cb = detail::decode_norm(
+			UP::comps[0].bits, UP::unpack(detail::load_word<UP>(u_ptr))[0]);
+		const auto cr = detail::decode_norm(
+			VP::comps[0].bits, VP::unpack(detail::load_word<VP>(v_ptr))[0]);
+
+		return YUV16{ luma, cb, cr, uint16_t(0) };
+	}
+};
+
+template <typename L>
+struct PlanarSink {
+	static_assert(L::num_planes == 3);
+	static_assert(L::kind == ColorKind::YUV);
+
+	using Layout = L;
+	using Pixel  = YUV16;
+
+	// Which plane holds which component; resolved from the layout so
+	// YUV and YVU orderings share this template.
+	static constexpr size_t y_plane = L::template find_plane<C::Y>();
+	static constexpr size_t u_plane = L::template find_plane<C::U>();
+	static constexpr size_t v_plane = L::template find_plane<C::V>();
+
+	using YP = typename L::template plane<y_plane>;
+	using UP = typename L::template plane<u_plane>;
+	using VP = typename L::template plane<v_plane>;
+
+	// One block is the luma footprint of a single chroma sample.
+	static constexpr size_t block_w = L::h_sub;
+	static constexpr size_t block_h = L::v_sub;
+
+	// Write the block whose top-left luma pixel is (bx, by): every Y
+	// sample is stored individually, while U and V are each reduced
+	// to one value per block and stored once.
+	static void write_block(Buffer<3>& buf, size_t bx, size_t by,
+	                        const YUV16 (&block)[block_h][block_w]) noexcept
+	{
+		uint32_t u_total = 0;
+		uint32_t v_total = 0;
+
+		// Single pass: emit luma and accumulate chroma.
+		for (size_t row = 0; row < block_h; ++row) {
+			uint8_t* luma_row = buf.data[y_plane] + (by + row) * buf.stride[y_plane];
+			for (size_t col = 0; col < block_w; ++col) {
+				const YUV16& px = block[row][col];
+				u_total += px.u;
+				v_total += px.v;
+				std::array<uint16_t, YP::num_comps> y_word{};
+				y_word[0] = detail::encode_norm(YP::comps[0].bits, px.y);
+				detail::store_word<YP>(luma_row + (bx + col) * YP::bytes_per_pixel,
+				                       YP::pack(y_word));
+			}
+		}
+
+		// Truncating integer mean (no round-half-up); sums are kept in 32 bits.
+		constexpr uint32_t count = block_h * block_w;
+		const uint16_t u_mean = uint16_t(u_total / count);
+		const uint16_t v_mean = uint16_t(v_total / count);
+
+		const size_t cx = bx / L::h_sub;
+		const size_t cy = by / L::v_sub;
+
+		std::array<uint16_t, UP::num_comps> u_word{};
+		u_word[0] = detail::encode_norm(UP::comps[0].bits, u_mean);
+		uint8_t* u_dst = buf.data[u_plane] + cy * buf.stride[u_plane] + cx * UP::bytes_per_pixel;
+		detail::store_word<UP>(u_dst, UP::pack(u_word));
+
+		std::array<uint16_t, VP::num_comps> v_word{};
+		v_word[0] = detail::encode_norm(VP::comps[0].bits, v_mean);
+		uint8_t* v_dst = buf.data[v_plane] + cy * buf.stride[v_plane] + cx * VP::bytes_per_pixel;
+		detail::store_word<VP>(v_dst, VP::pack(v_word));
+	}
+};
+
+// T430-style 3-plane multi-pixel-per-word planar 4:4:4. Each plane has
+// `ppw` samples of the same component (Y in plane 0, U in 1, V in 2 —
+// or whichever ordering find_plane resolves) packed into a single
+// storage word. block_w = ppw, block_h = 1. No chroma subsampling.
+template <typename L>
+struct MultiPixelPlanarSource {
+	static_assert(L::num_planes == 3);
+	static_assert(L::kind == ColorKind::YUV);
+	static_assert(L::h_sub == 1 && L::v_sub == 1);
+
+	using Layout = L;
+	using Pixel  = YUV16;
+
+	static constexpr size_t y_plane = L::template find_plane<C::Y>();
+	static constexpr size_t u_plane = L::template find_plane<C::U>();
+	static constexpr size_t v_plane = L::template find_plane<C::V>();
+
+	using YP = typename L::template plane<y_plane>;
+	using UP = typename L::template plane<u_plane>;
+	using VP = typename L::template plane<v_plane>;
+
+	// Samples per storage word; the three planes must agree.
+	static constexpr size_t ppw = YP::template component_count<C::Y>();
+	static_assert(UP::template component_count<C::U>() == ppw);
+	static_assert(VP::template component_count<C::V>() == ppw);
+
+	// Width of the first sample of each plane, applied to all samples
+	// of that plane (same-tag positions share one bit width).
+	static constexpr unsigned y_bits = YP::comps[YP::template find_pos<C::Y>(0)].bits;
+	static constexpr unsigned u_bits = UP::comps[UP::template find_pos<C::U>(0)].bits;
+	static constexpr unsigned v_bits = VP::comps[VP::template find_pos<C::V>(0)].bits;
+
+	// Read pixel (x, y): word x / ppw of row y in every plane, sample
+	// x % ppw within that word. Alpha is returned as 0.
+	static YUV16 read(const Buffer<3>& buf, size_t x, size_t y,
+	                  [[maybe_unused]] size_t W, [[maybe_unused]] size_t H) noexcept
+	{
+		const size_t word = x / ppw;
+		const size_t lane = x % ppw;
+
+		const uint8_t* y_src = buf.data[y_plane] + y * buf.stride[y_plane] + word * YP::bytes_per_pixel;
+		const uint8_t* u_src = buf.data[u_plane] + y * buf.stride[u_plane] + word * UP::bytes_per_pixel;
+		const uint8_t* v_src = buf.data[v_plane] + y * buf.stride[v_plane] + word * VP::bytes_per_pixel;
+		const auto y_vals = YP::unpack(detail::load_word<YP>(y_src));
+		const auto u_vals = UP::unpack(detail::load_word<UP>(u_src));
+		const auto v_vals = VP::unpack(detail::load_word<VP>(v_src));
+
+		const size_t y_pos = YP::template find_pos<C::Y>(lane);
+		const size_t u_pos = UP::template find_pos<C::U>(lane);
+		const size_t v_pos = VP::template find_pos<C::V>(lane);
+		return YUV16{ detail::decode_norm(y_bits, y_vals[y_pos]),
+		              detail::decode_norm(u_bits, u_vals[u_pos]),
+		              detail::decode_norm(v_bits, v_vals[v_pos]),
+		              uint16_t(0) };
+	}
+};
+
+template <typename L>
+struct MultiPixelPlanarSink {
+	using Layout = L;
+	using Pixel  = YUV16;
+
+	static_assert(L::kind == ColorKind::YUV);
+	static_assert(L::num_planes == 3);
+	static_assert(L::h_sub == 1 && L::v_sub == 1);
+
+	static constexpr size_t y_plane = L::template find_plane<C::Y>();
+	static constexpr size_t u_plane = L::template find_plane<C::U>();
+	static constexpr size_t v_plane = L::template find_plane<C::V>();
+
+	using YP = typename L::template plane<y_plane>;
+	using UP = typename L::template plane<u_plane>;
+	using VP = typename L::template plane<v_plane>;
+
+	// Samples per storage word. U and V must match Y: write_block()
+	// indexes find_pos<...>(i) for every i < ppw, and a missing sample
+	// yields num_comps, i.e. a store one past the end of the array.
+	static constexpr size_t ppw = YP::template component_count<C::Y>();
+	static_assert(ppw == UP::template component_count<C::U>());
+	static_assert(ppw == VP::template component_count<C::V>());
+
+	static constexpr size_t y_x_idx = YP::template find_pos<C::X>();
+	static constexpr size_t u_x_idx = UP::template find_pos<C::X>();
+	static constexpr size_t v_x_idx = VP::template find_pos<C::X>();
+	static constexpr bool y_has_x = (y_x_idx < YP::num_comps);
+	static constexpr bool u_has_x = (u_x_idx < UP::num_comps);
+	static constexpr bool v_has_x = (v_x_idx < VP::num_comps);
+
+	static constexpr size_t block_h = 1;
+	static constexpr size_t block_w = ppw;
+
+	// Encode ppw pixels into word bx / ppw of row `by` in each plane.
+	static void write_block(Buffer<3>& buf, size_t bx, size_t by,
+	                        const YUV16 (&block)[1][ppw]) noexcept
+	{
+		std::array<uint16_t, YP::num_comps> yv{};
+		std::array<uint16_t, UP::num_comps> uv{};
+		std::array<uint16_t, VP::num_comps> vv{};
+
+		// All same-tag positions share the same bit width.
+		constexpr unsigned y_bits = YP::comps[YP::template find_pos<C::Y>(0)].bits;
+		constexpr unsigned u_bits = UP::comps[UP::template find_pos<C::U>(0)].bits;
+		constexpr unsigned v_bits = VP::comps[VP::template find_pos<C::V>(0)].bits;
+		for (size_t i = 0; i < ppw; ++i) {
+			yv[YP::template find_pos<C::Y>(i)] = detail::encode_norm(y_bits, block[0][i].y);
+			uv[UP::template find_pos<C::U>(i)] = detail::encode_norm(u_bits, block[0][i].u);
+			vv[VP::template find_pos<C::V>(i)] = detail::encode_norm(v_bits, block[0][i].v);
+		}
+
+		if constexpr (y_has_x) yv[y_x_idx] = 0;
+		if constexpr (u_has_x) uv[u_x_idx] = 0;
+		if constexpr (v_has_x) vv[v_x_idx] = 0;
+
+		const size_t gx = bx / ppw;
+		detail::store_word<YP>(
+			buf.data[y_plane] + by * buf.stride[y_plane] + gx * YP::bytes_per_pixel,
+			YP::pack(yv));
+		detail::store_word<UP>(
+			buf.data[u_plane] + by * buf.stride[u_plane] + gx * UP::bytes_per_pixel,
+			UP::pack(uv));
+		detail::store_word<VP>(
+			buf.data[v_plane] + by * buf.stride[v_plane] + gx * VP::bytes_per_pixel,
+			VP::pack(vv));
+	}
+};
+
+} // namespace pixpat
diff --git a/subprojects/pixpat/pixpat-native/src/io/semiplanar.h b/subprojects/pixpat/pixpat-native/src/io/semiplanar.h
new file mode 100644
index 0000000..00e7731
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/io/semiplanar.h
@@ -0,0 +1,242 @@
+#pragma once
+
+// 2-plane semiplanar YUV. Two flavours:
+//
+//   SemiplanarSource / SemiplanarSink — NV12/NV21/NV16/NV61, single
+//     pixel per Y storage word, single chroma pair per chroma word.
+//
+//   MultiPixelSemiplanarSource / MultiPixelSemiplanarSink — P030/P230,
+//     multiple Y pixels per Y word and multiple chroma pairs per
+//     chroma word. The Y plane has `ppw_y = component_count<Y>()` Y
+//     samples per storage word; the chroma plane has `pairs =
+//     component_count<U>()` U/V pairs per storage word. block_w =
+//     pairs × h_sub, block_h = v_sub — each block exactly fills one
+//     chroma word.
+
+#include <array>
+
+#include "../layout.h"
+#include "detail.h"
+
+namespace pixpat
+{
+
+template <typename L>
+struct SemiplanarSource {
+	using Layout = L;
+	using Pixel  = YUV16;
+
+	static_assert(L::kind == ColorKind::YUV);
+	static_assert(L::num_planes == 2);
+
+	using YP = typename L::template plane<0>;
+	using CP = typename L::template plane<1>;
+	static constexpr size_t y_idx = YP::template find_pos<C::Y>();
+	static constexpr size_t u_idx = CP::template find_pos<C::U>();
+	static constexpr size_t v_idx = CP::template find_pos<C::V>();
+	// find_pos returns num_comps when absent; fail here, not in comps[] below.
+	static_assert(y_idx < YP::num_comps, "plane 0 must carry Y");
+	static_assert(u_idx < CP::num_comps && v_idx < CP::num_comps, "plane 1 must carry U and V");
+
+	// Pixel (x, y): Y from plane 0, U/V at (x / h_sub, y / v_sub) from plane 1; a = 0.
+	static YUV16 read(const Buffer<2>& buf, size_t x, size_t y,
+	                  [[maybe_unused]] size_t W, [[maybe_unused]] size_t H) noexcept
+	{
+		const uint8_t* yp = buf.data[0] + y * buf.stride[0] + x * YP::bytes_per_pixel;
+		const auto y_vals = YP::unpack(detail::load_word<YP>(yp));
+
+		const size_t cx = x / L::h_sub;
+		const size_t cy = y / L::v_sub;
+		const uint8_t* cp = buf.data[1] + cy * buf.stride[1] + cx * CP::bytes_per_pixel;
+		const auto c_vals = CP::unpack(detail::load_word<CP>(cp));
+
+		return YUV16{ detail::decode_norm(YP::comps[y_idx].bits, y_vals[y_idx]),
+		              detail::decode_norm(CP::comps[u_idx].bits, c_vals[u_idx]),
+		              detail::decode_norm(CP::comps[v_idx].bits, c_vals[v_idx]), uint16_t(0) };
+	}
+};
+
+template <typename L>
+struct SemiplanarSink {
+	using Layout = L;
+	using Pixel  = YUV16;
+
+	static_assert(L::kind == ColorKind::YUV);
+	static_assert(L::num_planes == 2);
+
+	using YP = typename L::template plane<0>;
+	using CP = typename L::template plane<1>;
+	static constexpr size_t y_idx = YP::template find_pos<C::Y>();
+	static constexpr size_t u_idx = CP::template find_pos<C::U>();
+	static constexpr size_t v_idx = CP::template find_pos<C::V>();
+	// find_pos returns num_comps for a missing component; reject such
+	// layouts at compile time instead of writing past the arrays below.
+	static_assert(y_idx < YP::num_comps, "plane 0 must carry Y");
+	static_assert(u_idx < CP::num_comps && v_idx < CP::num_comps, "plane 1 must carry U and V");
+
+	static constexpr size_t block_h = L::v_sub;
+	static constexpr size_t block_w = L::h_sub;
+
+	// Write the block whose top-left luma pixel is (bx, by): Y for
+	// every pixel, plus a single U/V pair averaged over the block.
+	static void write_block(Buffer<2>& buf, size_t bx, size_t by,
+	                        const YUV16 (&block)[block_h][block_w]) noexcept
+	{
+		// One pass over the block: store each luma sample and
+		// accumulate chroma (sums are kept in 32 bits).
+		uint32_t u_sum = 0, v_sum = 0;
+		for (size_t dy = 0; dy < block_h; ++dy) {
+			uint8_t* y_row = buf.data[0] + (by + dy) * buf.stride[0];
+			for (size_t dx = 0; dx < block_w; ++dx) {
+				const YUV16& px = block[dy][dx];
+				u_sum += px.u;
+				v_sum += px.v;
+
+				std::array<uint16_t, YP::num_comps> v{};
+				v[y_idx] = detail::encode_norm(YP::comps[y_idx].bits, px.y);
+				detail::store_word<YP>(
+					y_row + (bx + dx) * YP::bytes_per_pixel,
+					YP::pack(v));
+			}
+		}
+
+		// Integer mean, truncating (no round-half-up).
+		constexpr uint32_t n = block_h * block_w;
+		std::array<uint16_t, CP::num_comps> uv{};
+		uv[u_idx] = detail::encode_norm(CP::comps[u_idx].bits, uint16_t(u_sum / n));
+		uv[v_idx] = detail::encode_norm(CP::comps[v_idx].bits, uint16_t(v_sum / n));
+
+		const size_t cx = bx / L::h_sub;
+		const size_t cy = by / L::v_sub;
+		uint8_t* cp = buf.data[1] + cy * buf.stride[1] + cx * CP::bytes_per_pixel;
+		detail::store_word<CP>(cp, CP::pack(uv));
+	}
+};
+
+// Multi-pixel-per-word semiplanar (P030: 4:2:0, P230: 4:2:2). All Y
+// components share the same bit width; same for U and V.
+template <typename L>
+struct MultiPixelSemiplanarSource {
+	static_assert(L::num_planes == 2);
+	static_assert(L::kind == ColorKind::YUV);
+
+	using Layout = L;
+	using Pixel  = YUV16;
+
+	using YP = typename L::template plane<0>;
+	using CP = typename L::template plane<1>;
+
+	// Y samples per luma word and U/V pairs per chroma word.
+	static constexpr size_t ppw_y = YP::template component_count<C::Y>();
+	static constexpr size_t pairs = CP::template component_count<C::U>();
+	static_assert(pairs >= 1 && ppw_y >= 1);
+	static_assert(CP::template component_count<C::V>() == pairs);
+
+	// Width of the first sample of each tag, applied to every sample
+	// carrying that tag (same-tag positions share one bit width).
+	static constexpr unsigned y_bits = YP::comps[YP::template find_pos<C::Y>(0)].bits;
+	static constexpr unsigned u_bits = CP::comps[CP::template find_pos<C::U>(0)].bits;
+	static constexpr unsigned v_bits = CP::comps[CP::template find_pos<C::V>(0)].bits;
+
+	// Read pixel (x, y). Luma: word x / ppw_y of row y, sample
+	// x % ppw_y. Chroma: sample (x / h_sub, y / v_sub), located the
+	// same way with `pairs` samples per word. Alpha is returned as 0.
+	static YUV16 read(const Buffer<2>& buf, size_t x, size_t y,
+	                  [[maybe_unused]] size_t W, [[maybe_unused]] size_t H) noexcept
+	{
+		const uint8_t* luma_word = buf.data[0] + y * buf.stride[0]
+		                           + (x / ppw_y) * YP::bytes_per_pixel;
+		const auto luma = YP::unpack(detail::load_word<YP>(luma_word));
+		const size_t y_pos = YP::template find_pos<C::Y>(x % ppw_y);
+
+		const size_t cx = x / L::h_sub;
+		const size_t cy = y / L::v_sub;
+		const uint8_t* chroma_word = buf.data[1] + cy * buf.stride[1]
+		                             + (cx / pairs) * CP::bytes_per_pixel;
+		const auto chroma = CP::unpack(detail::load_word<CP>(chroma_word));
+		const size_t u_pos = CP::template find_pos<C::U>(cx % pairs);
+		const size_t v_pos = CP::template find_pos<C::V>(cx % pairs);
+
+		return YUV16{ detail::decode_norm(y_bits, luma[y_pos]),
+		              detail::decode_norm(u_bits, chroma[u_pos]),
+		              detail::decode_norm(v_bits, chroma[v_pos]),
+		              uint16_t(0) };
+	}
+};
+
+template <typename L>
+struct MultiPixelSemiplanarSink {
+	using Layout = L;
+	using Pixel  = YUV16;
+
+	static_assert(L::kind == ColorKind::YUV);
+	static_assert(L::num_planes == 2);
+
+	using YP = typename L::template plane<0>;
+	using CP = typename L::template plane<1>;
+	static constexpr size_t ppw_y = YP::template component_count<C::Y>();
+	static constexpr size_t pairs = CP::template component_count<C::U>();
+	static_assert(ppw_y >= 1 && pairs >= 1);
+	// write_block() stores V at find_pos<C::V>(p) for every p < pairs;
+	// with fewer V than U samples find_pos would return num_comps and
+	// the store would land one past the end of the array.
+	static_assert(pairs == CP::template component_count<C::V>());
+
+	// One block exactly fills one chroma word: `pairs` chroma pairs,
+	// each covering h_sub luma columns × v_sub rows.
+	static constexpr size_t block_w = pairs * L::h_sub;
+	static constexpr size_t block_h = L::v_sub;
+	static_assert(block_w % ppw_y == 0,
+	              "block width must be a multiple of Y-pixels-per-word");
+	static constexpr size_t y_words_per_row = block_w / ppw_y;
+
+	// All same-tag positions share the same bit width.
+	static constexpr unsigned y_bits = YP::comps[YP::template find_pos<C::Y>(0)].bits;
+	static constexpr unsigned u_bits = CP::comps[CP::template find_pos<C::U>(0)].bits;
+	static constexpr unsigned v_bits = CP::comps[CP::template find_pos<C::V>(0)].bits;
+
+	static void write_block(Buffer<2>& buf, size_t bx, size_t by,
+	                        const YUV16 (&block)[block_h][block_w]) noexcept
+	{
+		// Y plane: y_words_per_row Y-words per row, block_h rows.
+		for (size_t dy = 0; dy < block_h; ++dy) {
+			uint8_t* y_row = buf.data[0] + (by + dy) * buf.stride[0];
+			for (size_t w = 0; w < y_words_per_row; ++w) {
+				std::array<uint16_t, YP::num_comps> v{};
+				for (size_t i = 0; i < ppw_y; ++i) {
+					const size_t pos = YP::template find_pos<C::Y>(i);
+					v[pos] = detail::encode_norm(y_bits, block[dy][w * ppw_y + i].y);
+				}
+				detail::store_word<YP>(
+					y_row + (bx / ppw_y + w) * YP::bytes_per_pixel, YP::pack(v));
+			}
+		}
+
+		// One UV-word: `pairs` chroma pairs. Each pair is the truncated
+		// integer mean of the chroma of the h_sub × v_sub pixels it covers.
+		std::array<uint16_t, CP::num_comps> uv{};
+		constexpr uint32_t n = L::h_sub * L::v_sub;
+		for (size_t p = 0; p < pairs; ++p) {
+			uint32_t u_sum = 0, v_sum = 0;
+			for (size_t dy = 0; dy < block_h; ++dy) {
+				for (size_t dx = 0; dx < L::h_sub; ++dx) {
+					const YUV16& px = block[dy][p * L::h_sub + dx];
+					u_sum += px.u;
+					v_sum += px.v;
+				}
+			}
+			uv[CP::template find_pos<C::U>(p)] =
+				detail::encode_norm(u_bits, uint16_t(u_sum / n));
+			uv[CP::template find_pos<C::V>(p)] =
+				detail::encode_norm(v_bits, uint16_t(v_sum / n));
+		}
+
+		const size_t cy = by / L::v_sub;
+		const size_t uv_word_idx = bx / block_w;
+		detail::store_word<CP>(
+			buf.data[1] + cy * buf.stride[1] + uv_word_idx * CP::bytes_per_pixel,
+			CP::pack(uv));
+	}
+};
+
+} // namespace pixpat
diff --git a/subprojects/pixpat/pixpat-native/src/layout.h b/subprojects/pixpat/pixpat-native/src/layout.h
new file mode 100644
index 0000000..d092bb1
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/layout.h
@@ -0,0 +1,141 @@
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+namespace pixpat
+{
+
+enum class ColorKind { RGB, YUV };
+
+// Normalized inter-stage pixel types. 16-bit-per-component integer.
+// N-bit storage values bit-replicate up to normalized 16-bit (so
+// N-bit max maps to 0xFFFF); encoding back is a truncating
+// `normalized >> (16 - N)`. See io/detail.h for the round-trip
+// argument. Sources without an A component emit a=0; cross-color-kind
+// ColorXfm resets a=0xFFFF; sinks with X write 0, sinks with A
+// encode `a`.
+struct RGB16 {
+	uint16_t r, g, b, a; // 16-bit red, green, blue, then alpha
+	static constexpr ColorKind kind = ColorKind::RGB;
+};
+
+struct YUV16 {
+	uint16_t y, u, v, a; // 16-bit luma, two chroma samples, then alpha
+	static constexpr ColorKind kind = ColorKind::YUV;
+};
+
+inline constexpr uint16_t kNormMax = 0xFFFF;
+
+enum class C : uint8_t { X, A, R, G, B, Y, U, V };
+
+struct Comp { // one bit-field of a storage word
+	C c; // component tag
+	uint8_t bits; // width of the field in bits
+	uint8_t shift; // bit offset of the field's lowest bit in the word
+};
+
+template <typename Storage, Comp... Cs>
+struct Plane {
+	using storage_t = Storage;
+
+	static constexpr size_t num_comps = sizeof...(Cs);
+	static constexpr std::array<Comp, num_comps> comps{ Cs ... };
+	static constexpr size_t total_bits = (size_t(Cs.bits) + ... + 0);
+	static constexpr size_t storage_bits = sizeof(Storage) * 8;
+	static constexpr size_t bytes_per_pixel = (total_bits + 7) / 8;
+
+	static_assert(total_bits <= storage_bits, "components overflow storage word");
+	// Each field must also fit on its own, or pack() would shift bits out.
+	static_assert(((size_t(Cs.bits) + size_t(Cs.shift) <= sizeof(Storage) * 8) && ...),
+	              "component extends past the storage word");
+
+	// Index of the n-th component matching Tag, or num_comps if absent.
+	template <C Tag>
+	static constexpr size_t find_pos(size_t n = 0)
+	{
+		for (size_t i = 0; i < num_comps; ++i)
+			if (comps[i].c == Tag && n-- == 0)
+				return i;
+		return num_comps;
+	}
+
+	// Count of components matching Tag. Used to derive pixels_per_word
+	// for multi-pixel-per-storage formats (XYYY2101010, P030, ...).
+	template <C Tag>
+	static constexpr size_t component_count()
+	{
+		size_t cnt = 0;
+		for (const Comp& comp : comps)
+			cnt += (comp.c == Tag) ? 1 : 0;
+		return cnt;
+	}
+
+	// Mask with the low comps[i].bits bits set. A field as wide as the
+	// word is special-cased: a full-width shift of Storage is undefined.
+	static constexpr Storage field_mask(size_t i) noexcept
+	{
+		if (comps[i].bits >= storage_bits)
+			return Storage(~Storage{ 0 });
+		return Storage((Storage{ 1 } << comps[i].bits) - 1);
+	}
+
+	// Mask each input value to its bit-width and OR-shift it into the word.
+	static constexpr Storage pack(const std::array<uint16_t, num_comps>& v) noexcept
+	{
+		Storage out{};
+		for (size_t i = 0; i < num_comps; ++i)
+			out |= Storage(v[i] & field_mask(i)) << comps[i].shift;
+		return out;
+	}
+
+	// Mirror of `pack`: extract every field of `word`.
+	static constexpr std::array<uint16_t, num_comps> unpack(Storage word) noexcept
+	{
+		std::array<uint16_t, num_comps> out{};
+		for (size_t i = 0; i < num_comps; ++i)
+			out[i] = uint16_t((word >> comps[i].shift) & field_mask(i));
+		return out;
+	}
+};
+
+template <ColorKind Kind, size_t Hsub, size_t Vsub, typename ... Planes>
+struct Layout {
+	static constexpr ColorKind kind = Kind;
+	static constexpr size_t h_sub = Hsub;
+	static constexpr size_t v_sub = Vsub;
+	static constexpr size_t num_planes = sizeof...(Planes);
+
+	template <size_t N>
+	using plane = std::tuple_element_t<N, std::tuple<Planes...> >;
+
+	// Index of the first plane containing component Tag, or num_planes
+	// if no plane has it. Lets PlanarSource/Sink map C::U / C::V to a
+	// plane regardless of YUV vs YVU ordering.
+	//
+	// Implementation: expand the plane pack into a per-plane "has Tag"
+	// table, then scan it for the first hit. This needs only <array>,
+	// which this header includes; std::index_sequence would require
+	// <utility>, which it does not.
+	template <C Tag>
+	static constexpr size_t find_plane()
+	{
+		constexpr std::array<bool, num_planes> has_tag{
+			(Planes::template find_pos<Tag>() < Planes::num_comps)...
+		};
+		for (size_t i = 0; i < num_planes; ++i)
+			if (has_tag[i])
+				return i;
+		return num_planes;
+	}
+};
+
+template <size_t N> // N: number of planes
+struct Buffer { // non-owning view of an N-plane image
+	std::array<uint8_t*, N> data; // base pointer of each plane
+	std::array<size_t,   N> stride; // bytes from one row to the next, per plane
+};
+
+} // namespace pixpat
diff --git a/subprojects/pixpat/pixpat-native/src/params.h b/subprojects/pixpat/pixpat-native/src/params.h
new file mode 100644
index 0000000..aa2be67
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/params.h
@@ -0,0 +1,219 @@
+#pragma once
+
+#include <cctype>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "layout.h"
+
+namespace pixpat
+{
+
+// Pattern-specific parameters parsed from pixpat_pattern_opts::params.
+// The wire format is a comma-separated list of "key=value" items. Each
+// item is split at its first '='; whitespace around the item, key and
+// value is trimmed. Keys match ASCII case-insensitively; values are
+// stored as written. An empty item, an item without '=', or an empty
+// key leaves ok() returning false — the pattern dispatcher fails the
+// call when that happens. When a key repeats, the first one wins.
+// Patterns query keys by name via get() / get_int() / get_hex_color().
+// Unknown keys are ignored: each pattern handles forward compatibility,
+// not the parser.
+class Params
+{
+public:
+	explicit Params(const char* csv); // csv may be null or empty (no parameters)
+
+	bool ok() const noexcept { // false once parsing hit a malformed item
+		return ok_;
+	}
+
+	std::optional<std::string_view> get(std::string_view key) const noexcept; // raw value text
+	std::optional<int> get_int(std::string_view key) const noexcept; // signed decimal integer
+	std::optional<RGB16> get_hex_color(std::string_view key) const noexcept; // hex color, 8 or 16 bits per channel
+
+private:
+	std::vector<std::pair<std::string, std::string> > kv_; // (key, value) pairs in input order
+	bool ok_{ true }; // cleared by the constructor on malformed input
+};
+
+namespace detail
+{
+
+// Locale-independent lower-casing: only 'A'..'Z' are mapped, every
+// other character is returned unchanged.
+inline char ascii_tolower(char c) noexcept
+{
+	const bool is_upper = !(c < 'A' || c > 'Z');
+	if (!is_upper)
+		return c;
+	return char(c + ('a' - 'A'));
+}
+
+// Returns `s` without its leading and trailing whitespace (as judged
+// by std::isspace). The result is a view into the same characters.
+inline std::string_view trim(std::string_view s) noexcept
+{
+	const auto is_ws = [](char ch) {
+				   return std::isspace(static_cast<unsigned char>(ch)) != 0;
+			   };
+	size_t first = 0;
+	size_t last = s.size();
+	while (first < last && is_ws(s[first]))
+		++first;
+	while (last > first && is_ws(s[last - 1]))
+		--last;
+	return s.substr(first, last - first);
+}
+
+// ASCII case-insensitive equality of two views.
+inline bool ieq(std::string_view a, std::string_view b) noexcept
+{
+	const size_t len = a.size();
+	if (len != b.size())
+		return false;
+	size_t pos = 0;
+	while (pos < len && ascii_tolower(a[pos]) == ascii_tolower(b[pos]))
+		++pos;
+	return pos == len;
+}
+
+} // namespace detail
+
+
+// Parses `csv` into kv_. A null or empty string yields an empty, valid
+// parameter set. Parsing stops at the first malformed item (empty item,
+// item without '=', or empty key) with ok_ cleared; pairs parsed before
+// that point stay in kv_.
+inline Params::Params(const char* csv)
+{
+	if (csv == nullptr || csv[0] == '\0')
+		return;
+
+	std::string_view rest(csv);
+	do {
+		// Peel off the text up to the next ',' (or all that is left).
+		const size_t sep = rest.find(',');
+		const bool is_last = (sep == std::string_view::npos);
+		const std::string_view item =
+			detail::trim(is_last ? rest : rest.substr(0, sep));
+		rest = is_last ? std::string_view{} : rest.substr(sep + 1);
+
+		// Split at the first '='. An empty item has no '=' either, so
+		// both cases end up with an empty key and are rejected together.
+		const size_t eq = item.find('=');
+		const std::string_view key = (eq == std::string_view::npos)
+		                                     ? std::string_view{}
+		                                     : detail::trim(item.substr(0, eq));
+		if (key.empty()) {
+			ok_ = false;
+			return;
+		}
+		const std::string_view value = detail::trim(item.substr(eq + 1));
+		kv_.emplace_back(std::string(key), std::string(value));
+	} while (!rest.empty());
+}
+
+// Returns the value of the first stored pair whose key equals `key`
+// (ASCII case-insensitive), or std::nullopt when no key matches. The
+// returned view refers to a string held in kv_.
+inline std::optional<std::string_view>
+Params::get(std::string_view key) const noexcept
+{
+	std::optional<std::string_view> hit;
+	for (auto it = kv_.begin(); it != kv_.end() && !hit; ++it) {
+		if (detail::ieq(it->first, key))
+			hit = std::string_view(it->second);
+	}
+	return hit;
+}
+
+// Parses the value stored under `key` as a base-10 integer with an
+// optional leading '+' or '-'. Returns std::nullopt when the key is
+// absent, the value is empty, no digit follows the sign, any non-digit
+// character appears, or the number does not fit in `int`.
+inline std::optional<int>
+Params::get_int(std::string_view key) const noexcept
+{
+	const auto v = get(key);
+	if (!v || v->empty())
+		return std::nullopt;
+
+	std::string_view digits = *v;
+	const bool negative = (digits.front() == '-');
+	if (negative || digits.front() == '+')
+		digits.remove_prefix(1);
+	if (digits.empty())
+		return std::nullopt;
+
+	// Accumulate as a non-positive number: the negative range of int is
+	// at least as large as the positive one, so every representable
+	// value (including the minimum) can be built without signed
+	// overflow, which would be undefined behavior.
+	constexpr int kMin = std::numeric_limits<int>::min();
+	constexpr int kMax = std::numeric_limits<int>::max();
+	int neg = 0;
+	for (const char c : digits) {
+		if (c < '0' || c > '9')
+			return std::nullopt;
+		const int d = c - '0';
+		// neg * 10 - d would drop below kMin: reject before computing it.
+		if (neg < (kMin + d) / 10)
+			return std::nullopt;
+		neg = neg * 10 - d;
+	}
+	if (negative)
+		return neg;
+	// The magnitude must also fit in the positive range.
+	if (neg < -kMax)
+		return std::nullopt;
+	return -neg;
+}
+
+// Hex color lookup. An optional `0x`/`0X` prefix is skipped, then the
+// number of remaining digits picks the layout:
+//   6 digits  — 8-bit  RRGGBB              (alpha defaults to opaque)
+//   8 digits  — 8-bit  AARRGGBB            (alpha-first)
+//  12 digits  — 16-bit RRRRGGGGBBBB        (alpha defaults to opaque)
+//  16 digits  — 16-bit AAAARRRRGGGGBBBB    (alpha-first)
+// 8-bit channels are widened by byte replication (0xFF → 0xFFFF);
+// 16-bit channels are used as-is. A missing key, any other length, or
+// a non-hex character yields std::nullopt.
+inline std::optional<RGB16>
+Params::get_hex_color(std::string_view key) const noexcept
+{
+	const auto value = get(key);
+	if (!value)
+		return std::nullopt;
+
+	std::string_view hex = *value;
+	const bool prefixed = hex.size() >= 2 && hex[0] == '0'
+	                      && (hex[1] == 'x' || hex[1] == 'X');
+	if (prefixed)
+		hex.remove_prefix(2);
+
+	// The layout is fully determined by the digit count.
+	const size_t len = hex.size();
+	if (len != 6 && len != 8 && len != 12 && len != 16)
+		return std::nullopt;
+	const bool wide = (len >= 12);
+	const bool alpha_first = (len == 8 || len == 16);
+	const size_t width = wide ? 4 : 2; // hex digits per channel
+
+	// Channels in string order: A, R, G, B. Alpha starts fully opaque
+	// and is only overwritten when the string carries it.
+	unsigned chan[4] = { wide ? 0xFFFFu : 0xFFu, 0u, 0u, 0u };
+	size_t cursor = 0;
+	for (size_t ch = alpha_first ? 0 : 1; ch < 4; ++ch) {
+		unsigned acc = 0;
+		for (const size_t end = cursor + width; cursor < end; ++cursor) {
+			const char c = hex[cursor];
+			unsigned nibble = 0;
+			if (c >= '0' && c <= '9')
+				nibble = unsigned(c - '0');
+			else if (c >= 'a' && c <= 'f')
+				nibble = unsigned(c - 'a' + 10);
+			else if (c >= 'A' && c <= 'F')
+				nibble = unsigned(c - 'A' + 10);
+			else
+				return std::nullopt;
+			acc = (acc << 4) | nibble;
+		}
+		chan[ch] = acc;
+	}
+
+	// Widen 8-bit channels to 16 bits by repeating the byte.
+	if (!wide) {
+		for (unsigned& c : chan)
+			c = (c << 8) | c;
+	}
+	return RGB16{ uint16_t(chan[1]), uint16_t(chan[2]),
+	              uint16_t(chan[3]), uint16_t(chan[0]) };
+}
+
+} // namespace pixpat
diff --git a/subprojects/pixpat/pixpat-native/src/pattern.h b/subprojects/pixpat/pixpat-native/src/pattern.h
new file mode 100644
index 0000000..fbee683
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/pattern.h
@@ -0,0 +1,597 @@
+#pragma once
+
+#include <cmath>
+#include <cstdint>
+
+#include "color.h"
+#include "layout.h"
+#include "params.h"
+
+namespace pixpat::patterns
+{
+
+// Patterns emit opaque pixels (a=kNormMax) unless they encode their
+// own alpha (e.g. `plain`'s ARGB form). Alpha-bearing sinks
+// (ARGB8888 etc) therefore see the pattern's chosen alpha; convert
+// paths propagate the source's actual `a` instead (a=0 for X-only
+// sources).
+//
+// A pattern is a type providing:
+//   using Pixel = RGB16 | YUV16;
+//   explicit Pat(const Params&) noexcept;
+//       (VBarYUV / HBarYUV take an additional ColorSpec argument)
+//   Pixel sample(size_t x, size_t y, size_t W, size_t H) const noexcept;
+//   bool ready() const noexcept;   // optional, default true
+// Patterns that don't read params ignore the constructor argument.
+
+namespace detail
+{
+// Widens an 8-bit RGB triple to normalized 16-bit by repeating each
+// byte in both halves (255 -> 0xFFFF, 1 -> 0x0101). Alpha is kNormMax.
+constexpr RGB16 rgb8(uint8_t r, uint8_t g, uint8_t b) noexcept
+{
+	const auto widen = [](uint8_t byte) noexcept {
+				   return uint16_t(byte * 0x0101u);
+			   };
+	return RGB16{ widen(r), widen(g), widen(b), kNormMax };
+}
+
+// Widens a 12-bit YUV triple to normalized 16-bit: the sample moves up
+// by 4 bits and its 4 most significant bits are repeated in the low
+// nibble. Alpha is kNormMax.
+constexpr YUV16 yuv12(uint16_t y, uint16_t u, uint16_t v) noexcept
+{
+	const auto widen = [](uint16_t s) noexcept {
+				   return uint16_t((s << 4) | (s >> 8));
+			   };
+	return YUV16{ widen(y), widen(u), widen(v), kNormMax };
+}
+} // namespace detail
+
+// "kmstest" default pattern: white frame lines 20 px in from each edge
+// plus white diagonals; blue rails in the top/left margins; red rails in
+// the bottom/right margins; an 8-column color gradient in the center.
+struct Kmstest {
+	using Pixel = RGB16;
+
+	explicit Kmstest(const Params&) noexcept { // reads no parameters
+	}
+
+	RGB16 sample(size_t x, size_t y, size_t W, size_t H) const noexcept
+	{
+		using detail::rgb8;
+		const size_t mw = 20; // margin width in pixels
+		const size_t xm1 = mw; // column of the left frame line
+		const size_t xm2 = W - mw - 1; // column of the right frame line; NOTE(review): wraps (size_t) when W < 21
+		const size_t ym1 = mw; // row of the top frame line
+		const size_t ym2 = H - mw - 1; // row of the bottom frame line; NOTE(review): wraps (size_t) when H < 21
+
+		if (x == xm1 || x == xm2 || y == ym1 || y == ym2) // on a frame line
+			return rgb8(255, 255, 255);
+		if (x < xm1 && y < ym1) // top-left corner square is solid white
+			return rgb8(255, 255, 255);
+		if ((x == 0 || x == W - 1) && (y < ym1 || y > ym2)) // outermost columns, corner rows only
+			return rgb8(255, 255, 255);
+		if ((y == 0 || y == H - 1) && (x < xm1 || x > xm2)) // outermost rows, corner columns only
+			return rgb8(255, 255, 255);
+		if (x < xm1 && (y > ym1 && y < ym2)) // left margin: blue rail
+			return rgb8(0, 0, 255);
+		if (y < ym1 && (x > xm1 && x < xm2)) // top margin: blue rail
+			return rgb8(0, 0, 255);
+		if (x > xm2 && (y > ym1 && y < ym2)) // right margin: red rail
+			return rgb8(255, 0, 0);
+		if (y > ym2 && (x > xm1 && x < xm2)) // bottom margin: red rail
+			return rgb8(255, 0, 0);
+		if (x > xm1 && x < xm2 && y > ym1 && y < ym2) { // strictly inside the frame
+			if (x == y || W - x == H - y) // diagonals anchored at top-left / bottom-right
+				return rgb8(255, 255, 255);
+			if (W - x - 1 == y || x == H - y - 1) // diagonals anchored at top-right / bottom-left
+				return rgb8(255, 255, 255);
+			const int t = int((x - xm1 - 1) * 8 / (xm2 - xm1 - 1)); // gradient column 0..7
+			const unsigned c = unsigned((y - ym1 - 1) % 256); // intensity from the row, repeating every 256 rows
+			unsigned r = 0, g = 0, b = 0;
+			switch (t) { // which channels receive the intensity
+			case 0: r = c; break;
+			case 1: g = c; break;
+			case 2: b = c; break;
+			case 3: g = b = c; break;
+			case 4: r = b = c; break;
+			case 5: r = g = c; break;
+			case 6: r = g = b = c; break;
+			case 7: break; // last column stays black
+			}
+			return rgb8(uint8_t(r), uint8_t(g), uint8_t(b));
+		}
+		return rgb8(0, 0, 0); // everything not matched above
+	}
+};
+
+// SMPTE RP 219-1:2014 color bar pattern. Emits YUV directly with
+// pixel values defined by the spec in BT.709 / Limited range. Pass
+// `rec=BT709, range=Limited` for spec-correct output; other ColorSpec
+// settings produce visibly-wrong colors when the sink crosses to RGB
+// (the matrix the caller picked is applied to BT.709-encoded values).
+// Callers are trusted — pixpat does not override the spec for them.
+struct Smpte {
+	using Pixel = YUV16;
+
+	explicit Smpte(const Params&) noexcept { // reads no parameters
+	}
+
+	YUV16 sample(size_t x, size_t y, size_t W, size_t H) const noexcept
+	{
+		using detail::yuv12;
+		constexpr YUV16 gray40    = yuv12(1658, 2048, 2048); // 12-bit (Y, U, V) code values
+		constexpr YUV16 white75   = yuv12(2884, 2048, 2048);
+		constexpr YUV16 yellow75  = yuv12(2694,  704, 2171);
+		constexpr YUV16 cyan75    = yuv12(2325, 2356,  704);
+		constexpr YUV16 green75   = yuv12(2136, 1012,  827);
+		constexpr YUV16 magenta75 = yuv12(1004, 3084, 3269);
+		constexpr YUV16 red75     = yuv12( 815, 1740, 3392);
+		constexpr YUV16 blue75    = yuv12( 446, 3392, 1925);
+		constexpr YUV16 cyan100   = yuv12(3015, 2459,  256);
+		constexpr YUV16 blue100   = yuv12( 509, 3840, 1884);
+		constexpr YUV16 yellow100 = yuv12(3507,  256, 2212);
+		constexpr YUV16 black     = yuv12( 256, 2048, 2048);
+		constexpr YUV16 white100  = yuv12(3760, 2048, 2048);
+		constexpr YUV16 red100    = yuv12(1001, 1637, 3840);
+		constexpr YUV16 gray15    = yuv12( 782, 2048, 2048);
+
+		constexpr YUV16 black_m2  = yuv12( 186, 2048, 2048); // PLUGE steps around black
+		constexpr YUV16 black_p2  = yuv12( 326, 2048, 2048);
+		constexpr YUV16 black_p4  = yuv12( 396, 2048, 2048);
+
+		constexpr size_t M = 1024; // fixed-point scale for horizontal positions
+		const size_t xs = x * M; // x in 1/M pixel units
+		const size_t a  = W * M; // full width, scaled
+		const size_t c  = (a * 3 / 4) / 7; // width of one of the seven bars
+		const size_t d  = a / 8; // width of each side column
+
+		const size_t pattern1_height = (H * 7) / 12; // end row of pattern 1
+		const size_t pattern2_height = pattern1_height + (H / 12); // end row of pattern 2
+		const size_t pattern3_height = pattern2_height + (H / 12); // end row of pattern 3
+
+		if (y < pattern1_height) { // pattern 1: seven 75% bars between gray side columns
+			if (xs < d || xs >= (a - d))
+				return gray40;
+			const size_t bar = (xs - d) / c; // bar index counted from the left
+			switch (bar) {
+			case 0: return white75;
+			case 1: return yellow75;
+			case 2: return cyan75;
+			case 3: return green75;
+			case 4: return magenta75;
+			case 5: return red75;
+			default: return blue75; // bar 6 and any rounding remainder
+			}
+		}
+
+		if (y < pattern2_height) { // pattern 2: cyan | 75% white | blue
+			if (xs < d)         return cyan100;
+			if (xs >= (a - d))  return blue100;
+			return white75;
+		}
+
+		if (y < pattern3_height) { // pattern 3: yellow | luma ramp | red
+			if (xs < d)         return yellow100;
+			if (xs >= (a - d))  return red100;
+			const size_t ramp_w = a - 2 * d; // ramp width, scaled
+			const size_t ramp_x = xs - d; // position inside the ramp, scaled
+			const uint16_t y_val = uint16_t(256 + (3760 - 256) * ramp_x / ramp_w); // 12-bit luma rising from 256 toward 3760
+			return yuv12(y_val, 2048, 2048);
+		}
+
+		// pattern4 (PLUGE)
+		const size_t c0 = d; // end of the left gray column
+		const size_t c1 = c0 + c * 3 / 2; // end of the first black section
+		const size_t c2 = c1 + 2 * c; // end of the 100% white section
+		const size_t c3 = c2 + c * 5 / 6; // end of the second black section
+
+		if (xs < c0)            return gray15;
+		if (xs < c1)            return black;
+		if (xs < c2)            return white100;
+		if (xs < c3)            return black;
+		if (xs >= a - d)        return gray15; // right gray column
+		if (xs >= a - d - c)    return black; // black bar left of the right column
+
+		const size_t step = (xs - c3) / (c / 3); // PLUGE sub-step, each c/3 wide
+		switch (step) {
+		case 0: return black_m2;
+		case 1: return black;
+		case 2: return black_p2;
+		case 3: return black;
+		default: return black_p4;
+		}
+	}
+};
+
+// Solid fill. The color comes from the `color=<hex>` parameter, parsed
+// by Params::get_hex_color (8/16 bits per channel, alpha-first if
+// present, optional `0x` prefix). When `color` is missing or malformed
+// ready() stays false and the dispatcher fails the call.
+struct Plain {
+	using Pixel = RGB16;
+
+	explicit Plain(const Params& p) noexcept
+	{
+		const auto parsed = p.get_hex_color("color");
+		ready_ = parsed.has_value();
+		if (ready_)
+			color_ = *parsed;
+	}
+
+	bool ready() const noexcept {
+		return ready_;
+	}
+
+	// Every pixel gets the same color; position and size are unused.
+	RGB16 sample(size_t, size_t, size_t, size_t) const noexcept
+	{
+		return color_;
+	}
+
+private:
+	RGB16 color_{};
+	bool ready_{ false };
+};
+
+namespace detail
+{
+// Maps pos in [0, span-1] linearly onto [0, kNormMax]. A span of 0 or 1
+// has nothing to interpolate over and yields kNormMax.
+constexpr uint16_t ramp16(size_t pos, size_t span) noexcept
+{
+	if (span < 2)
+		return kNormMax;
+	const uint64_t scaled = uint64_t(pos) * kNormMax;
+	return uint16_t(scaled / (span - 1));
+}
+} // namespace detail
+
+// Black/white checkerboard. The optional `cell=<N>` parameter sets the
+// cell size in pixels (default 8); a value that is not a positive
+// integer leaves ready()=false.
+struct Checker {
+	using Pixel = RGB16;
+
+	explicit Checker(const Params& p) noexcept
+	{
+		if (!p.get("cell"))
+			return; // parameter absent: keep the default
+		const auto n = p.get_int("cell");
+		if (n && *n > 0)
+			cell_ = size_t(*n);
+		else
+			ready_ = false;
+	}
+
+	bool ready() const noexcept {
+		return ready_;
+	}
+
+	RGB16 sample(size_t x, size_t y, size_t, size_t) const noexcept
+	{
+		// A cell is dark when its column and row indices differ in parity.
+		const size_t col = x / cell_;
+		const size_t row = y / cell_;
+		if ((col & 1u) != (row & 1u))
+			return RGB16{ 0, 0, 0, kNormMax };
+		return RGB16{ kNormMax, kNormMax, kNormMax, kNormMax };
+	}
+
+private:
+	size_t cell_{ 8 };
+	bool ready_{ true };
+};
+
+namespace detail
+{
+// Colors a ramp value `v` by stripe: 0 = red only, 1 = green only,
+// 2 = blue only, anything else = gray (all three channels). Used by
+// hramp/vramp.
+constexpr RGB16 rgb_gray_stripe(size_t stripe, uint16_t v) noexcept
+{
+	const bool gray = stripe > 2;
+	const uint16_t r = (gray || stripe == 0) ? v : uint16_t(0);
+	const uint16_t g = (gray || stripe == 1) ? v : uint16_t(0);
+	const uint16_t b = (gray || stripe == 2) ? v : uint16_t(0);
+	return RGB16{ r, g, b, kNormMax };
+}
+} // namespace detail
+
+// Four horizontal stripes stacked top to bottom — R, G, B, gray — each
+// ramping 0..max along x. Shows per-channel and luma quantization in
+// one pattern.
+struct Hramp {
+	using Pixel = RGB16;
+
+	explicit Hramp(const Params&) noexcept {
+	}
+
+	RGB16 sample(size_t x, size_t y, size_t W, size_t H) const noexcept
+	{
+		size_t stripe = 0;
+		if (H != 0)
+			stripe = (y * 4) / H;
+		const uint16_t level = detail::ramp16(x, W);
+		return detail::rgb_gray_stripe(stripe, level);
+	}
+};
+
+// Four vertical columns side by side — R, G, B, gray — each ramping
+// 0..max along y. Same coverage as hramp, rotated 90°.
+struct Vramp {
+	using Pixel = RGB16;
+
+	explicit Vramp(const Params&) noexcept {
+	}
+
+	RGB16 sample(size_t x, size_t y, size_t W, size_t H) const noexcept
+	{
+		size_t column = 0;
+		if (W != 0)
+			column = (x * 4) / W;
+		const uint16_t level = detail::ramp16(y, H);
+		return detail::rgb_gray_stripe(column, level);
+	}
+};
+
+// Diagonal RGB ramp: red follows x, green follows y, blue follows x+y.
+struct Dramp {
+	using Pixel = RGB16;
+
+	explicit Dramp(const Params&) noexcept {
+	}
+
+	RGB16 sample(size_t x, size_t y, size_t W, size_t H) const noexcept
+	{
+		// x + y ranges over W + H - 1 distinct values; fall back to a
+		// span of 1 when the image is too small for that to be positive.
+		const size_t extent = W + H;
+		const size_t diag_span = (extent < 2) ? 1 : extent - 1;
+		return RGB16{
+		        detail::ramp16(x, W),
+		        detail::ramp16(y, H),
+		        detail::ramp16(x + y, diag_span),
+		        kNormMax,
+		};
+	}
+};
+
+namespace detail
+{
+// Color of band `band` in the seven-region sequence used by hbar/vbar:
+// white, red, white, green, white, blue, white. Bands 1, 3 and 5 are
+// the primaries; every other index is white. The white separators make
+// per-channel offsets at the band boundaries visible.
+constexpr RGB16 bar_color7(size_t band) noexcept
+{
+	if (band == 1)
+		return rgb8(255,   0,   0);
+	if (band == 3)
+		return rgb8(  0, 255,   0);
+	if (band == 5)
+		return rgb8(  0,   0, 255);
+	return rgb8(255, 255, 255);
+}
+} // namespace detail
+
+// Vertical bar (full image height, narrow along x) on a black
+// background. `pos` (required) is the signed left edge in pixels, so a
+// negative value clips the bar at the left edge; `width` (optional,
+// positive, default 32) is the thickness in pixels. The bar is split
+// into 7 equal-height regions colored
+// white/red/white/green/white/blue/white. A missing or invalid `pos`,
+// or an invalid `width`, leaves ready()=false.
+struct VBarRGB {
+	using Pixel = RGB16;
+
+	explicit VBarRGB(const Params& p) noexcept
+	{
+		const auto left = p.get_int("pos");
+		ready_ = left.has_value();
+		if (!ready_)
+			return;
+		pos_ = *left;
+
+		if (!p.get("width"))
+			return; // keep the default thickness
+		const auto thick = p.get_int("width");
+		ready_ = thick && *thick > 0;
+		if (ready_)
+			width_ = size_t(*thick);
+	}
+
+	bool ready() const noexcept {
+		return ready_;
+	}
+
+	RGB16 sample(size_t x, size_t y, size_t, size_t H) const noexcept
+	{
+		// Compare in a signed type so a negative `pos` works.
+		const long long px = static_cast<long long>(x);
+		const long long left = pos_;
+		const long long right = left + static_cast<long long>(width_);
+		const bool inside = (px >= left) && (px < right);
+		if (!inside)
+			return detail::rgb8(0, 0, 0);
+		return detail::bar_color7((H != 0) ? (y * 7) / H : 0);
+	}
+
+private:
+	int pos_{};
+	size_t width_{ 32 };
+	bool ready_{ true };
+};
+
+// Horizontal bar: vbar rotated 90°. `pos` (required) is the signed top
+// edge in pixels; `width` (optional, positive, default 32) is the
+// thickness in pixels. The bar spans the full image width and is split
+// into 7 equal-width regions colored
+// white/red/white/green/white/blue/white. A missing or invalid `pos`,
+// or an invalid `width`, leaves ready()=false.
+struct HBarRGB {
+	using Pixel = RGB16;
+
+	explicit HBarRGB(const Params& p) noexcept
+	{
+		const auto top = p.get_int("pos");
+		ready_ = top.has_value();
+		if (!ready_)
+			return;
+		pos_ = *top;
+
+		if (!p.get("width"))
+			return; // keep the default thickness
+		const auto thick = p.get_int("width");
+		ready_ = thick && *thick > 0;
+		if (ready_)
+			width_ = size_t(*thick);
+	}
+
+	bool ready() const noexcept {
+		return ready_;
+	}
+
+	RGB16 sample(size_t x, size_t y, size_t W, size_t) const noexcept
+	{
+		// Compare in a signed type so a negative `pos` works.
+		const long long py = static_cast<long long>(y);
+		const long long top = pos_;
+		const long long bottom = top + static_cast<long long>(width_);
+		const bool inside = (py >= top) && (py < bottom);
+		if (!inside)
+			return detail::rgb8(0, 0, 0);
+		return detail::bar_color7((W != 0) ? (x * 7) / W : 0);
+	}
+
+private:
+	int pos_{};
+	size_t width_{ 32 };
+	bool ready_{ true };
+};
+
+// Same shape as VBarRGB but emits YUV16 directly. The five unique colors
+// (black bg + white/red/green/blue bar regions) are precomputed from
+// `spec` at construction so the cross-kind pass is a no-op when the
+// sink is YUV. Use the RGB-native `VBarRGB` for RGB sinks instead — it
+// avoids the YUV→RGB pass that this variant would incur there.
+// Parameters: `pos` (required, signed left edge in pixels) and `width`
+// (optional, positive, default 32). A missing or invalid `pos`, or an
+// invalid `width`, leaves ready()=false and the color table zeroed.
+struct VBarYUV {
+	using Pixel = YUV16;
+
+	explicit VBarYUV(const Params& p, ColorSpec spec) noexcept
+	{
+		auto pp = p.get_int("pos");
+		if (!pp) {
+			ready_ = false;
+			return;
+		}
+		pos_ = *pp;
+		if (p.get("width")) {
+			auto w = p.get_int("width");
+			if (!w || *w <= 0) {
+				ready_ = false;
+				return;
+			}
+			width_ = size_t(*w);
+		}
+		// Convert the palette once; white fills the even bands.
+		const ColorCoeffs c = coeffs_for(spec);
+		using X = ColorXfm<RGB16, YUV16>;
+		bg_       = X::apply(detail::rgb8(  0,   0,   0), c);
+		bands_[0] = X::apply(detail::rgb8(255, 255, 255), c);
+		bands_[1] = X::apply(detail::rgb8(255,   0,   0), c);
+		bands_[2] = bands_[0];
+		bands_[3] = X::apply(detail::rgb8(  0, 255,   0), c);
+		bands_[4] = bands_[0];
+		bands_[5] = X::apply(detail::rgb8(  0,   0, 255), c);
+		bands_[6] = bands_[0];
+	}
+
+	bool ready() const noexcept {
+		return ready_;
+	}
+
+	YUV16 sample(size_t x, size_t y, size_t, size_t H) const noexcept
+	{
+		// Compare in a signed type so a negative `pos` works.
+		const long long sx = static_cast<long long>(x);
+		const long long lo = pos_;
+		const long long hi = lo + static_cast<long long>(width_);
+		if (sx < lo || sx >= hi)
+			return bg_;
+		// (y * 7) / H reaches 7 or more once y >= H, which would index
+		// past bands_. Clamp to the last (white) band, the same color
+		// VBarRGB yields for out-of-range bands.
+		// NOTE(review): whether any caller samples rows past H is not
+		// visible from this file — confirm against the pattern pipeline.
+		constexpr size_t kLastBand = 6;
+		const size_t raw = (H == 0) ? 0 : (y * 7) / H;
+		const size_t band = (raw < kLastBand) ? raw : kLastBand;
+		return bands_[band];
+	}
+
+private:
+	YUV16 bg_{};
+	YUV16 bands_[7]{};
+	int pos_{};
+	size_t width_{ 32 };
+	bool ready_{ true };
+};
+
+// YUV-native counterpart to HBarRGB: a horizontal bar whose five unique
+// colors (black bg + white/red/green/blue bar regions) are converted
+// from RGB with the coefficients for `spec` once at construction.
+// Parameters: `pos` (required, signed top edge in pixels) and `width`
+// (optional, positive, default 32). A missing or invalid `pos`, or an
+// invalid `width`, leaves ready()=false and the color table zeroed.
+struct HBarYUV {
+	using Pixel = YUV16;
+
+	explicit HBarYUV(const Params& p, ColorSpec spec) noexcept
+	{
+		auto pp = p.get_int("pos");
+		if (!pp) {
+			ready_ = false;
+			return;
+		}
+		pos_ = *pp;
+		if (p.get("width")) {
+			auto w = p.get_int("width");
+			if (!w || *w <= 0) {
+				ready_ = false;
+				return;
+			}
+			width_ = size_t(*w);
+		}
+		// Convert the palette once; white fills the even bands.
+		const ColorCoeffs c = coeffs_for(spec);
+		using X = ColorXfm<RGB16, YUV16>;
+		bg_       = X::apply(detail::rgb8(  0,   0,   0), c);
+		bands_[0] = X::apply(detail::rgb8(255, 255, 255), c);
+		bands_[1] = X::apply(detail::rgb8(255,   0,   0), c);
+		bands_[2] = bands_[0];
+		bands_[3] = X::apply(detail::rgb8(  0, 255,   0), c);
+		bands_[4] = bands_[0];
+		bands_[5] = X::apply(detail::rgb8(  0,   0, 255), c);
+		bands_[6] = bands_[0];
+	}
+
+	bool ready() const noexcept {
+		return ready_;
+	}
+
+	YUV16 sample(size_t x, size_t y, size_t W, size_t) const noexcept
+	{
+		// Compare in a signed type so a negative `pos` works.
+		const long long sy = static_cast<long long>(y);
+		const long long lo = pos_;
+		const long long hi = lo + static_cast<long long>(width_);
+		if (sy < lo || sy >= hi)
+			return bg_;
+		// (x * 7) / W reaches 7 or more once x >= W, which would index
+		// past bands_. Clamp to the last (white) band, the same color
+		// HBarRGB yields for out-of-range bands.
+		// NOTE(review): whether any caller samples columns past W is not
+		// visible from this file — confirm against the pattern pipeline.
+		constexpr size_t kLastBand = 6;
+		const size_t raw = (W == 0) ? 0 : (x * 7) / W;
+		const size_t band = (raw < kLastBand) ? raw : kLastBand;
+		return bands_[band];
+	}
+
+private:
+	YUV16 bg_{};
+	YUV16 bands_[7]{};
+	int pos_{};
+	size_t width_{ 32 };
+	bool ready_{ true };
+};
+
+// Centered radial cosine zone plate: 0.5 + 0.5 * cos(k * (cx² + cy²)),
+// where (cx, cy) is the offset from the image center and
+// k = π / max(W, H), so the local frequency reaches Nyquist at the
+// longer edge — the pattern uses every spatial frequency the grid can
+// resolve. The result is written to all three color channels.
+struct Zoneplate {
+	using Pixel = RGB16;
+
+	explicit Zoneplate(const Params&) noexcept {
+	}
+
+	RGB16 sample(size_t x, size_t y, size_t W, size_t H) const noexcept
+	{
+		constexpr double kPi = 3.14159265358979323846;
+		const double max_dim = double(H < W ? W : H);
+		// The local frequency d(k r²)/dr = 2 k r equals π per pixel
+		// (Nyquist) at r = max_dim / 2 when k = π / max_dim. A
+		// zero-sized image falls back to a divisor of 1.
+		const double k = kPi / (max_dim > 0 ? max_dim : 1.0);
+		const double cx = double(x) - 0.5 * double(W);
+		const double cy = double(y) - 0.5 * double(H);
+		const double phase = k * (cx * cx + cy * cy);
+		const double v = 0.5 + 0.5 * std::cos(phase);
+		const double scaled = v * 65535.0;
+
+		// Clamp to the 16-bit range, rounding to nearest inside it.
+		uint16_t g = kNormMax;
+		if (scaled < 0.0)
+			g = uint16_t(0);
+		else if (!(scaled > 65535.0))
+			g = uint16_t(scaled + 0.5);
+		return RGB16{ g, g, g, kNormMax };
+	}
+};
+
+} // namespace pixpat::patterns
diff --git a/subprojects/pixpat/pixpat-native/src/pattern_catalog.h b/subprojects/pixpat/pixpat-native/src/pattern_catalog.h
new file mode 100644
index 0000000..6576b2b
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/pattern_catalog.h
@@ -0,0 +1,64 @@
+#pragma once
+
+// Catalog of every named pattern the C++ side knows. Mirrors the
+// shape of format_catalog.h. The X-macro is a list of
+// (Label, RgbType, YuvType, "name") rows:
+//
+//   X(Label, RgbType, YuvType, "name")
+//
+// `Label` is the C++ identifier doubling as the PatternId enum value
+// and the s_pattern_caps[] index. `RgbType` and `YuvType` resolve to
+// classes in `pixpat::patterns::` (defined in pattern.h) that satisfy
+// the pattern interface (sample(), Pixel) — one per color kind. Use
+// `void` if the pattern has no variant in that kind. At least one
+// must be non-void. When both are present, dispatch_draw_pattern
+// picks the variant matching the sink's color kind so the cross-kind
+// pass is a no-op; when only one is present, the pipeline runs the
+// cross-kind pass for the opposite-kind sinks. `name` is the
+// lowercase identifier exposed via the C ABI.
+//
+// Adding a pattern = a row here AND its class(es) in pattern.h. The
+// codegen (pixpat-native/codegen/gen_pixpat.py) parses this X-macro
+// to learn the pattern set; pixpat_pattern.cpp re-expands it to build
+// the dispatch arms and the default-pattern fallback.
+
+#include <cstddef>
+#include <cstdint>
+
+namespace pixpat
+{
+// Pattern catalog rows: X(Label, RgbType, YuvType, "name"); `void` marks a color kind with no variant.
+#define PIXPAT_PATTERN_LIST(X)                        \
+	X(Kmstest,   Kmstest,   void,      "kmstest") \
+	X(Smpte,     void,      Smpte,     "smpte")   \
+	X(Plain,     Plain,     void,      "plain")   \
+	X(Checker,   Checker,   void,      "checker") \
+	X(Hramp,     Hramp,     void,      "hramp")   \
+	X(Vramp,     Vramp,     void,      "vramp")   \
+	X(HBar,      HBarRGB,   HBarYUV,   "hbar")    \
+	X(VBar,      VBarRGB,   VBarYUV,   "vbar")    \
+	X(Dramp,     Dramp,     void,      "dramp")   \
+	X(Zoneplate, Zoneplate, void,      "zoneplate")
+
+enum class PatternId : uint8_t { // one enumerator per catalog row, in row order
+#define X(label, rgb, yuv, name) label,
+	PIXPAT_PATTERN_LIST(X)
+#undef X
+	Unknown, // follows the last pattern, so its value equals the number of rows
+};
+
+struct PatternEntry { // one name-to-id row of s_pattern_table
+	const char* name; // pattern name string from the catalog row
+	PatternId id; // enumerator generated from the same row
+};
+
+inline constexpr PatternEntry s_pattern_table[] = { // built by expanding the catalog, so rows follow catalog order
+#define X(label, rgb, yuv, name) { name, PatternId::label },
+	PIXPAT_PATTERN_LIST(X)
+#undef X
+};
+
+inline constexpr size_t s_pattern_catalog_count =
+	sizeof(s_pattern_table) / sizeof(s_pattern_table[0]); // number of rows in s_pattern_table
+
+} // namespace pixpat
diff --git a/subprojects/pixpat/pixpat-native/src/pipeline.h b/subprojects/pixpat/pixpat-native/src/pipeline.h
new file mode 100644
index 0000000..09e13bc
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/pipeline.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <cstddef>
+
+#include "color.h"
+#include "layout.h"
+
+// Inlined source → color → sink composition. The intermediate Pixel
+// values stay in registers across stages; there is no normalized RGB16
+// or YUV16 buffer between source and sink. Block size is dictated by
+// the sink: 1x1 for non-subsampled formats, h_sub × v_sub for chroma-
+// subsampled ones.
+
+namespace pixpat
+{
+
+// Converts an image from Source's format to Sink's format one sink
+// block (bh rows × bw columns) at a time: every pixel of the block is
+// read from the source, passed through the color transform, collected
+// in a local array, and handed to the sink in a single write_block call.
+template <typename Source, typename Sink>
+struct Converter {
+	using Xfm = ColorXfm<typename Source::Pixel, typename Sink::Pixel>;
+	static constexpr size_t bh = Sink::block_h;
+	static constexpr size_t bw = Sink::block_w;
+
+	// Processes block rows by_start (inclusive) to by_end (exclusive),
+	// stepping by bh, across the full width W. The coefficients for
+	// `spec` are looked up once, outside the loops.
+	static void run(const Buffer<Source::Layout::num_planes>& src,
+	                Buffer<Sink::Layout::num_planes>& dst,
+	                size_t W, size_t H,
+	                size_t by_start, size_t by_end,
+	                ColorSpec spec) noexcept
+	{
+		using OutPixel = typename Sink::Pixel;
+		const ColorCoeffs coeffs = coeffs_for(spec);
+
+		for (size_t row0 = by_start; row0 < by_end; row0 += bh) {
+			for (size_t col0 = 0; col0 < W; col0 += bw) {
+				OutPixel tile[bh][bw];
+				for (size_t r = 0; r < bh; ++r) {
+					for (size_t c = 0; c < bw; ++c) {
+						tile[r][c] = Xfm::apply(
+							Source::read(src, col0 + c, row0 + r,
+							             W, H), coeffs);
+					}
+				}
+				Sink::write_block(dst, col0, row0, tile);
+			}
+		}
+	}
+};
+
+} // namespace pixpat
diff --git a/subprojects/pixpat/pixpat-native/src/pixpat.cpp b/subprojects/pixpat/pixpat-native/src/pixpat.cpp
new file mode 100644
index 0000000..ac21fac
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/pixpat.cpp
@@ -0,0 +1,355 @@
+// pixpat: extern "C" entry points + runtime format dispatch.
+//
+// The format catalog (X-macro PIXPAT_FORMAT_LIST + FormatId enum +
+// s_format_table) is hand-written in format_catalog.h. The generator
+// (pixpat-native/codegen/gen_pixpat.py) reads the same X-macro and
+// the user TOML and emits the per-config bits:
+//
+//   pixpat_config.h — PIXPAT_FEATURE_PATTERN / _CONVERT
+//   pixpat_caps.inc — s_format_caps[] (per-format readable / writable /
+//                     hot_src / hot_dst, indexed by FormatId) and
+//                     s_pattern_caps[] (per-pattern enabled flag).
+//
+// The convert and pattern dispatch (dispatch_convert in
+// pixpat_convert.cpp, dispatch_draw_pattern in pixpat_pattern.cpp) is
+// hand-written and consumes s_format_caps / s_pattern_caps via
+// `if constexpr` on the per-row constexpr fields.
+//
+// s_format_info is built here, once, by re-expanding the catalog
+// X-macro through unpack_for / pack_for; the sink block dimensions
+// are read straight from Sink::block_h / Sink::block_w. The two
+// constexpr helpers use `if constexpr` on the per-format readable /
+// writable flags from s_format_caps to either take the address of
+// unpack_to_norm / pack_from_norm or fall back to nullptr. Because
+// they're function templates, the discarded branch is never
+// instantiated, so disabled-direction templates produce no code.
+//
+// Feature gating is meson-side: pixpat_pattern.cpp / pixpat_convert.cpp
+// are added to the source list only when their feature is enabled. This
+// file's entry points always exist; they call the bridge functions
+// dispatch_draw_pattern / dispatch_convert under `if constexpr
+// (kFeatureXxx)`. The discarded if-constexpr branch produces no symbol
+// reference, so when the matching TU is absent the link still succeeds
+// and the entry point returns -1 instead.
+
+#include <pixpat/pixpat.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+
+#include "pixpat_config.h"
+
+#include "color.h"
+#include "error.h"
+#include "format_catalog.h"
+#include "formats.h"
+#include "io.h"
+#include "layout.h"
+#include "params.h"
+#include "pattern.h"
+#include "pixpat_internal.h"
+#include "threading.h"
+
+namespace pixpat
+{
+
+inline constexpr bool kFeaturePattern = PIXPAT_FEATURE_PATTERN;
+inline constexpr bool kFeatureConvert = PIXPAT_FEATURE_CONVERT;
+
+// Exact-match name -> FormatId; NULL or unknown names give FormatId::Unknown.
+static FormatId lookup_format(const char* name) noexcept
+{
+	if (name != nullptr) {
+		for (const auto& row : s_format_table)
+			if (!std::strcmp(row.name, name)) return row.id;
+	}
+	return FormatId::Unknown;
+}
+
+// Source-side half of the normalized pivot: reads rows [by, by + bh) of
+// `src` through Src::read and stores them row-major, W pixels per row,
+// into `norm`. Only instantiated where unpack_for<true, Src>() takes
+// its address, so formats that are not readable emit no body.
+template <typename Src>
+static void unpack_to_norm(uint8_t* norm, const pixpat_buffer* src,
+                           size_t by, size_t bh, size_t W) noexcept
+{
+	using Pixel = typename Src::Pixel;
+	const size_t H = src->height;
+	const auto in = make_buffer<typename Src::Layout>(src);
+	Pixel* out = reinterpret_cast<Pixel*>(norm);
+	for (size_t row = 0; row < bh; ++row)
+		for (size_t col = 0; col < W; ++col)
+			*out++ = Src::read(in, col, by + row, W, H);
+}
+
+// Sink-side half of the normalized pivot: regroups one band of
+// Snk::block_h rows x W normalized pixels into block_h x block_w
+// tiles and hands each tile to Snk::write_block. The caller must have
+// filled that many rows; convert's cold path and pattern both use it.
+template <typename Snk>
+static void pack_from_norm(const pixpat_buffer* dst,
+                           const uint8_t* norm,
+                           size_t by, size_t W) noexcept
+{
+	using Pixel = typename Snk::Pixel;
+	constexpr size_t kRows = Snk::block_h;
+	constexpr size_t kCols = Snk::block_w;
+	const Pixel* const pixels = reinterpret_cast<const Pixel*>(norm);
+	auto out = make_buffer<typename Snk::Layout>(dst);
+	for (size_t x0 = 0; x0 < W; x0 += kCols) {
+		Pixel tile[kRows][kCols];
+		// Row r of the tile is kCols consecutive pixels of band row r.
+		for (size_t r = 0; r < kRows; ++r)
+			std::copy_n(pixels + r * W + x0, kCols, tile[r]);
+		Snk::write_block(out, x0, by, tile);
+	}
+}
+
+// Generated: s_format_caps[] indexed by FormatId, plus s_pattern_* /
+// DefaultPattern (used only by pixpat_pattern.cpp; harmless here).
+#include "pixpat_caps.inc"
+
+static_assert(sizeof(s_format_caps) / sizeof(s_format_caps[0]) == s_format_catalog_count,
+              "s_format_caps must cover the full catalog");
+
+// `if constexpr` keeps disabled-direction function-template bodies
+// uninstantiated. Taking `&unpack_to_norm<Src>` / `&pack_from_norm<Snk>`
+// forces the function body to be emitted; without the gate every
+// catalog format would carry unpack and pack code regardless of its
+// readable / writable bit. Snk::block_h / Snk::block_w are constexpr
+// scalars — no body, no emission — so they're inlined directly in the
+// initializer below, without a helper.
+template <bool Read, typename Src>
+static constexpr UnpackFn unpack_for() noexcept
+{
+	// nullptr unless Read; only then is unpack_to_norm<Src> instantiated.
+	UnpackFn fn = nullptr;
+	if constexpr (Read) fn = &unpack_to_norm<Src>;
+	return fn;
+}
+
+template <bool Write, typename Snk>
+static constexpr PackFn pack_for() noexcept
+{
+	// nullptr unless Write; only then is pack_from_norm<Snk> instantiated.
+	PackFn fn = nullptr;
+	if constexpr (Write) fn = &pack_from_norm<Snk>;
+	return fn;
+}
+
+const FormatInfo s_format_info[] = { // one row per PIXPAT_FORMAT_LIST entry, looked up by size_t(FormatId)
+#define CAPS(name) s_format_caps[size_t(FormatId::name)] // build capabilities of format `name`
+#define X(name)                                                           \
+	{                                                                 \
+		unpack_for<CAPS(name).readable, formats::name::Source>(), \
+		pack_for<CAPS(name).writable, formats::name::Sink>(),     \
+		formats::name::kind,                                      \
+		uint8_t(formats::name::h_sub),                            \
+		uint8_t(formats::name::v_sub),                            \
+		uint8_t(formats::name::Sink::block_h),                    \
+		uint8_t(formats::name::Sink::block_w),                    \
+	}, // FormatInfo field order: unpack, pack, kind, h_sub, v_sub, snk_block_h, snk_block_w
+	PIXPAT_FORMAT_LIST(X) // expands X once per catalog format
+#undef X
+#undef CAPS
+};
+static_assert(sizeof(s_format_info) / sizeof(s_format_info[0]) == s_format_catalog_count,
+              "s_format_info must cover the full catalog");
+
+// validate_* / parse_spec are only reached from inside the entry points'
+// `if constexpr (kFeatureXxx)` true branches. With a feature disabled,
+// its caller's branch is discarded and the helper becomes unreferenced;
+// require_readable is convert-only. [[maybe_unused]] keeps
+// -Wunused-function (and clang's -Wunneeded-internal-declaration) quiet.
+[[maybe_unused]] static void validate_buffer(const pixpat_buffer* b)
+{
+	// Throws invalid_argument for a null descriptor or an empty image.
+	if (b == nullptr) throw invalid_argument("null buffer");
+	const bool has_area = b->width != 0 && b->height != 0;
+	if (!has_area) throw invalid_argument("zero-sized buffer");
+}
+
+[[maybe_unused]] static FormatId validate_format(const char* name)
+{
+	// Like lookup_format(), but a failed lookup throws invalid_argument.
+	const FormatId found = lookup_format(name);
+	if (found != FormatId::Unknown)
+		return found;
+	throw invalid_argument("unknown format");
+}
+
+[[maybe_unused]] static void require_writable(FormatId id)
+{
+	const PackFn pack = s_format_info[size_t(id)].pack;   // nullptr: pack_for<false, ...>
+	if (!pack) throw invalid_argument("format not enabled as a sink in this build");
+}
+
+[[maybe_unused]] static void require_readable(FormatId id)
+{
+	const UnpackFn unpack = s_format_info[size_t(id)].unpack;   // nullptr: unpack_for<false, ...>
+	if (!unpack) throw invalid_argument("format not enabled as a source in this build");
+}
+
+[[maybe_unused]] static unsigned validate_thread_count(int n)
+{
+	// >0: use as given; 0: default_thread_count(); <0: invalid_argument.
+	if (n > 0) return static_cast<unsigned>(n);
+	if (n == 0) return default_thread_count();
+	throw invalid_argument("negative num_threads");
+}
+
+// Translate the C-side pixpat_rec / pixpat_range values into the
+// internal ColorSpec. Only PIXPAT_REC_BT709, PIXPAT_REC_BT2020 and
+// PIXPAT_RANGE_FULL are recognised; every other value — including
+// out-of-range ones — selects BT.601 / Limited, the same pair
+// kDefaultColorSpec holds.
+[[maybe_unused]] static ColorSpec parse_spec(int rec_in, int range_in) noexcept
+{
+	ColorSpec out{ Rec::BT601, Range::Limited };
+	if (rec_in == PIXPAT_REC_BT709)
+		out.rec = Rec::BT709;
+	else if (rec_in == PIXPAT_REC_BT2020)
+		out.rec = Rec::BT2020;
+	if (range_in == PIXPAT_RANGE_FULL)
+		out.range = Range::Full;
+	return out;
+}
+
+} // namespace pixpat
+
+// Marks the C entry points as part of the public ABI: restores default
+// visibility against the build-wide -fvisibility=hidden, so they are
+// exported from libpixpat.so.
+#define PIXPAT_API __attribute__((visibility("default")))
+
+extern "C" {
+
+PIXPAT_API int pixpat_draw_pattern(const pixpat_buffer* dst,
+                                   const char* pattern,
+                                   const pixpat_pattern_opts* opts)
+{
+	// Returns 0 on success, -1 if validation or a worker throws a
+	// std::exception, and always -1 when pattern is compiled out.
+	namespace pp = pixpat;
+	if constexpr (!pp::kFeaturePattern) {
+		(void)dst, (void)pattern, (void)opts;
+		return -1;
+	} else {
+		try {
+			pp::validate_buffer(dst);
+			const pp::FormatId fmt = pp::validate_format(dst->format);
+			pp::require_writable(fmt);
+			const pp::FormatInfo& sink = pp::s_format_info[size_t(fmt)];
+			const bool aligned = dst->width % sink.snk_block_w == 0 &&
+			                     dst->height % sink.snk_block_h == 0;
+			if (!aligned)
+				throw pp::invalid_argument(
+					      "dimensions not aligned to format block");
+
+			const unsigned n_threads = opts ? pp::validate_thread_count(opts->num_threads)
+			                                : pp::default_thread_count();
+			const pp::ColorSpec spec = opts ? pp::parse_spec(opts->rec, opts->range)
+			                                : pp::kDefaultColorSpec;
+			const pp::Params params(opts ? opts->params : nullptr);
+			if (!params.ok())
+				throw pp::invalid_argument("malformed opts->params");
+
+			// Each call of the lambda covers rows [y0, y1) of the destination.
+			pp::run_stripes(dst->height, sink.snk_block_h, n_threads, [&](size_t y0, size_t y1) {
+				pp::dispatch_draw_pattern(fmt, pattern, params, dst,
+				                          dst->width, dst->height, y0, y1, spec);
+			});
+			return 0;
+		} catch (const std::exception&) {
+			return -1;
+		}
+	}
+}
+
+PIXPAT_API int pixpat_convert(const pixpat_buffer* dst,
+                              const pixpat_buffer* src,
+                              const pixpat_convert_opts* opts)
+{
+	// Returns 0 on success, -1 if validation or a worker throws a
+	// std::exception, and always -1 when convert is compiled out.
+	namespace pp = pixpat;
+	if constexpr (!pp::kFeatureConvert) {
+		(void)dst, (void)src, (void)opts;
+		return -1;
+	} else {
+		try {
+			pp::validate_buffer(dst);
+			pp::validate_buffer(src);
+			const bool same_size = src->width == dst->width &&
+			                       src->height == dst->height;
+			if (!same_size)
+				throw pp::invalid_argument("src/dst dimensions differ");
+
+			const pp::FormatId src_id = pp::validate_format(src->format);
+			const pp::FormatId dst_id = pp::validate_format(dst->format);
+			pp::require_readable(src_id);
+			pp::require_writable(dst_id);
+
+			const pp::FormatInfo& from = pp::s_format_info[size_t(src_id)];
+			const pp::FormatInfo& to = pp::s_format_info[size_t(dst_id)];
+			// Test every divisor on its own: a max()-only check would
+			// let through e.g. h_sub=2 with snk_block_w=3 at W=3.
+			const bool w_ok = src->width % from.h_sub == 0 &&
+			                  src->width % to.h_sub == 0 &&
+			                  src->width % to.snk_block_w == 0;
+			const bool h_ok = src->height % from.v_sub == 0 &&
+			                  src->height % to.v_sub == 0 &&
+			                  src->height % to.snk_block_h == 0;
+			if (!w_ok || !h_ok)
+				throw pp::invalid_argument(
+					      "dimensions not aligned to format subsampling");
+			// Stripes only need vertical alignment; the catalog's values
+			// are powers of two, so the largest one is also their LCM.
+			const unsigned stripe_align = std::max({ unsigned(from.v_sub), unsigned(to.v_sub),
+			                                         unsigned(to.snk_block_h) });
+			const unsigned n_threads = opts ? pp::validate_thread_count(opts->num_threads)
+			                                : pp::default_thread_count();
+			const pp::ColorSpec spec = opts ? pp::parse_spec(opts->rec, opts->range)
+			                                : pp::kDefaultColorSpec;
+
+			pp::run_stripes(src->height, stripe_align, n_threads, [&](size_t y0, size_t y1) {
+				pp::dispatch_convert(src_id, dst_id, src, dst,
+				                     src->width, src->height, y0, y1, spec);
+			});
+			return 0;
+		} catch (const std::exception&) {
+			return -1;
+		}
+	}
+}
+
+PIXPAT_API int pixpat_format_supported(const char* format)
+{
+	// 1 when the name is in the catalog and readable or writable here, else 0.
+	const pixpat::FormatId fmt = pixpat::lookup_format(format);
+	const bool known = fmt != pixpat::FormatId::Unknown;
+	return (known && pixpat::s_format_caps[size_t(fmt)].enabled()) ? 1 : 0;
+}
+
+PIXPAT_API size_t pixpat_format_count(void)
+{
+	// Counts the catalog formats enabled in at least one direction.
+	size_t enabled = 0;
+	for (size_t i = 0; i < pixpat::s_format_catalog_count; ++i)
+		enabled += pixpat::s_format_caps[i].enabled() ? 1u : 0u;
+	return enabled;
+}
+
+PIXPAT_API const char* pixpat_format_name(size_t idx)
+{
+	// idx counts enabled formats only, in catalog order; nullptr once
+	// idx is past the last enabled one.
+	size_t remaining = idx;
+	for (size_t i = 0; i < pixpat::s_format_catalog_count; ++i) {
+		if (pixpat::s_format_caps[i].enabled() && remaining-- == 0)
+			return pixpat::s_format_table[i].name;
+	}
+	return nullptr;
+}
+
+} // extern "C"
diff --git a/subprojects/pixpat/pixpat-native/src/pixpat_convert.cpp b/subprojects/pixpat/pixpat-native/src/pixpat_convert.cpp
new file mode 100644
index 0000000..63461d8
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/pixpat_convert.cpp
@@ -0,0 +1,201 @@
+// Convert-feature TU: built only when PIXPAT_FEATURE_CONVERT is on
+// (controlled by the meson source list). pixpat.cpp's pixpat_convert
+// entry calls into dispatch_convert() below via if-constexpr; when the
+// feature is off this file isn't compiled, the discarded if-constexpr
+// branch emits no symbol reference, and the .so simply lacks these
+// symbols.
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+#include "color.h"
+#include "error.h"
+#include "format_catalog.h"
+#include "formats.h"
+#include "io.h"
+#include "layout.h"
+#include "pattern.h"
+#include "pipeline.h"
+#include "pixpat_internal.h"
+
+namespace pixpat
+{
+
+template <typename Src, typename Snk>
+static void run_convert_impl(const pixpat_buffer* src, const pixpat_buffer* dst,
+                             size_t W, size_t H,
+                             size_t by_start, size_t by_end,
+                             ColorSpec spec)
+{
+	using SL = typename Src::Layout;
+	using DL = typename Snk::Layout;
+	// pixpat_convert validates W/H against each layout's h_sub / v_sub and
+	// the sink's block dims (the step Converter::run uses); mirror all of it.
+	assert(W % SL::h_sub == 0 && W % DL::h_sub == 0 && W % Snk::block_w == 0);
+	assert(H % SL::v_sub == 0 && H % DL::v_sub == 0 && H % Snk::block_h == 0);
+
+	auto sb = make_buffer<SL>(src);
+	auto db = make_buffer<DL>(dst);
+	Converter<Src, Snk>::run(sb, db, W, H, by_start, by_end, spec);
+}
+
+static void run_norm(FormatId src_id, FormatId dst_id,
+                     const pixpat_buffer* src, const pixpat_buffer* dst,
+                     size_t W, size_t H,
+                     size_t by_start, size_t by_end,
+                     ColorSpec spec)
+{
+	// Normalized-pivot path: unpack a band of source rows, convert between
+	// RGB and YUV when the two kinds differ, then pack into the sink.
+	const FormatInfo& from = s_format_info[size_t(src_id)];
+	const FormatInfo& to = s_format_info[size_t(dst_id)];
+	const size_t rows = to.snk_block_h;   // band height = sink block height
+	const size_t count = rows * W;        // pixels per band
+
+	// pixpat_convert has already checked W/H against from.h_sub /
+	// from.v_sub and the sink block dims; these only re-state that.
+	assert(W % from.h_sub == 0 && W % to.snk_block_w == 0);
+	assert(H % from.v_sub == 0 && H % rows == 0);
+
+	// One scratch band per thread (thread_local). RGB16 and YUV16 are
+	// both 8 bytes, so sizing for RGB16 serves either kind.
+	thread_local std::vector<uint8_t> norm;
+	norm.resize(count * sizeof(RGB16));
+	uint8_t* const band = norm.data();
+	const bool cross_kind = from.kind != to.kind;
+	const ColorCoeffs coeffs = coeffs_for(spec);
+	for (size_t y = by_start; y < by_end; y += rows) {
+		from.unpack(band, src, y, rows, W);
+		if (cross_kind && from.kind == ColorKind::RGB)
+			norm_rgb_to_yuv(band, count, coeffs);
+		else if (cross_kind)
+			norm_yuv_to_rgb(band, count, coeffs);
+		to.pack(dst, band, y, W);
+	}
+}
+
+// Generated: FormatCaps + s_format_caps[] (per-format readable/writable
+// + hot_src/hot_dst), plus s_pattern_* / DefaultPattern.
+#include "pixpat_caps.inc"
+
+// Per-Src dispatch: map the runtime `dst_id` to its Sink type and call
+// run_convert_impl<Src, Sink>. The X-macro emits one case per catalog
+// format; `if constexpr (...writable)` discards the call for formats
+// that are not writable, so those cases break out to the trailing throw.
+template <typename Src>
+static void dispatch_dst_convert(FormatId dst_id,
+                                 const pixpat_buffer* src, const pixpat_buffer* dst,
+                                 size_t W, size_t H,
+                                 size_t by_start, size_t by_end,
+                                 ColorSpec spec)
+{
+	switch (dst_id) {
+#define CAPS(name) s_format_caps[size_t(FormatId::name)] // build capabilities of format `name`
+#define X(name)                                                          \
+	case FormatId::name:                                             \
+		if constexpr (CAPS(name).writable) {                     \
+			run_convert_impl<Src, formats::name::Sink>(      \
+				src, dst, W, H, by_start, by_end, spec); \
+			return;                                          \
+		}                                                        \
+		break;
+	PIXPAT_FORMAT_LIST(X) // one `case` per catalog format
+#undef X
+#undef CAPS
+	default: // any id without a catalog case
+		break;
+	}
+	throw invalid_argument("destination format not enabled in this build"); // no writable arm ran
+}
+
+// Per-Snk dispatch: map `src_id` to its Source type; readable-gated mirror of dispatch_dst_convert.
+template <typename Snk>
+static void dispatch_src_convert(FormatId src_id,
+                                 const pixpat_buffer* src, const pixpat_buffer* dst,
+                                 size_t W, size_t H,
+                                 size_t by_start, size_t by_end,
+                                 ColorSpec spec)
+{
+	switch (src_id) {
+#define CAPS(name) s_format_caps[size_t(FormatId::name)] // build capabilities of format `name`
+#define X(name)                                                          \
+	case FormatId::name:                                             \
+		if constexpr (CAPS(name).readable) {                     \
+			run_convert_impl<formats::name::Source, Snk>(    \
+				src, dst, W, H, by_start, by_end, spec); \
+			return;                                          \
+		}                                                        \
+		break;
+	PIXPAT_FORMAT_LIST(X) // one `case` per catalog format
+#undef X
+#undef CAPS
+	default: // any id without a catalog case
+		break;
+	}
+	throw invalid_argument("source format not enabled in this build"); // no readable arm ran
+}
+
+// Hot-pivot probe for a source format. It is a template on purpose:
+// when HotSrc is false the `if constexpr` branch below is discarded
+// without being instantiated, so dispatch_dst_convert<Source> only
+// exists for hot source pivots instead of every catalog format.
+template <bool HotSrc, FormatId Id, typename Source>
+static bool try_hot_src(FormatId src_id, FormatId dst_id,
+                        const pixpat_buffer* src, const pixpat_buffer* dst,
+                        size_t W, size_t H,
+                        size_t by_start, size_t by_end,
+                        ColorSpec spec)
+{
+	if constexpr (!HotSrc) {
+		return false;
+	} else {
+		// Returns true only when this arm handled the conversion.
+		if (src_id != Id) return false;
+		dispatch_dst_convert<Source>(dst_id, src, dst, W, H, by_start, by_end, spec);
+		return true;
+	}
+}
+
+template <bool HotDst, FormatId Id, typename Sink>
+static bool try_hot_dst(FormatId src_id, FormatId dst_id,
+                        const pixpat_buffer* src, const pixpat_buffer* dst,
+                        size_t W, size_t H,
+                        size_t by_start, size_t by_end,
+                        ColorSpec spec)
+{
+	if constexpr (!HotDst) {
+		return false;
+	} else {
+		// Sink-side probe: returns true only when this arm handled the conversion.
+		if (dst_id != Id) return false;
+		dispatch_src_convert<Sink>(src_id, src, dst, W, H, by_start, by_end, spec);
+		return true;
+	}
+}
+
+void dispatch_convert(FormatId src_id, FormatId dst_id,
+                      const pixpat_buffer* src, const pixpat_buffer* dst,
+                      size_t W, size_t H,
+                      size_t by_start, size_t by_end,
+                      ColorSpec spec)
+{ // Converts rows [by_start, by_end): first matching hot pivot wins, otherwise run_norm.
+#define CAPS(name) s_format_caps[size_t(FormatId::name)] // build capabilities of format `name`
+#define X(name)                                                              \
+	if (try_hot_src<CAPS(name).hot_src, FormatId::name,                  \
+			formats::name::Source>(                              \
+		    src_id, dst_id, src, dst, W, H, by_start, by_end, spec)) \
+	return;                                                              \
+	if (try_hot_dst<CAPS(name).hot_dst, FormatId::name,                  \
+			formats::name::Sink>(                                \
+		    src_id, dst_id, src, dst, W, H, by_start, by_end, spec)) \
+	return;
+	PIXPAT_FORMAT_LIST(X) // per format, in catalog order: hot-source probe, then hot-sink probe
+#undef X
+#undef CAPS
+
+	run_norm(src_id, dst_id, src, dst, W, H, by_start, by_end, spec); // no hot pivot matched
+}
+
+} // namespace pixpat
diff --git a/subprojects/pixpat/pixpat-native/src/pixpat_internal.h b/subprojects/pixpat/pixpat-native/src/pixpat_internal.h
new file mode 100644
index 0000000..50d3405
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/pixpat_internal.h
@@ -0,0 +1,89 @@
+#pragma once
+
+// Internal interface shared between the always-built pixpat.cpp and the
+// optional pixpat_pattern.cpp / pixpat_convert.cpp TUs. The feature
+// gate is meson-side: pixpat_pattern.cpp is in the source list iff
+// PIXPAT_FEATURE_PATTERN, and likewise for convert. The bridge
+// declarations below are unconditional; pixpat.cpp's entry points call
+// them inside `if constexpr (kFeatureXxx)`, and the discarded branch
+// emits no symbol reference, so absent definitions don't cause link
+// failures.
+
+#include <cstddef>
+#include <cstdint>
+
+#include <pixpat/pixpat.h>
+
+#include "color.h"
+#include "format_catalog.h"
+#include "layout.h"
+#include "pattern_catalog.h"
+
+namespace pixpat
+{
+
+// Build a value-initialised Buffer from `b`, copying its first
+// Layout::num_planes plane pointers and strides.
+template <typename Layout>
+inline Buffer<Layout::num_planes> make_buffer(const pixpat_buffer* b) noexcept
+{
+	Buffer<Layout::num_planes> view{};
+	for (size_t p = 0; p != Layout::num_planes; ++p) view.data[p] = static_cast<uint8_t*>(b->planes[p]);
+	for (size_t p = 0; p != Layout::num_planes; ++p) view.stride[p] = b->strides[p];
+	return view;
+}
+
+using UnpackFn = void (*)(uint8_t*, const pixpat_buffer*, size_t, size_t, size_t);
+using PackFn   = void (*)(const pixpat_buffer*, const uint8_t*, size_t, size_t);
+
+struct FormatInfo { // one s_format_info[] row; field order must match the initializer in pixpat.cpp
+	UnpackFn unpack;       // format -> normalized rows; nullptr when the format is not readable
+	PackFn pack;           // normalized rows -> format; nullptr when the format is not writable
+	ColorKind kind;        // formats::<name>::kind
+	uint8_t h_sub;         // formats::<name>::h_sub
+	uint8_t v_sub;         // formats::<name>::v_sub
+	uint8_t snk_block_h;   // formats::<name>::Sink::block_h
+	uint8_t snk_block_w;   // formats::<name>::Sink::block_w
+};
+
+extern const FormatInfo s_format_info[];
+
+// Per-format build capabilities. Defined once per build by the
+// generator into s_format_caps[] (in pixpat_caps.inc); the schema is
+// here so that file is pure data.
+struct FormatCaps {
+	bool readable;   // gates unpack_for<> and the Source arms of dispatch_src_convert
+	bool writable;   // gates pack_for<> and the Sink arms of dispatch_dst_convert
+	bool hot_src;    // dispatch_convert probes this format through try_hot_src
+	bool hot_dst;    // dispatch_convert probes this format through try_hot_dst
+
+	constexpr bool enabled() const noexcept   // true when usable in at least one direction
+	{
+		return readable || writable;
+	}
+};
+
+// Per-pattern build capabilities. Generator emits s_pattern_caps[]
+// indexed by PatternId, plus a separate s_default_pattern_id singleton
+// (the fallback when pattern_name doesn't match any enabled arm).
+// Used only when PIXPAT_FEATURE_PATTERN — pixpat_pattern.cpp consumes
+// both.
+struct PatternCaps {
+	bool enabled;   // passed as try_pattern<Enabled, ...> by dispatch_draw_pattern
+};
+
+class Params;
+
+// Bridge into pixpat_pattern.cpp (defined there iff PIXPAT_FEATURE_PATTERN).
+void dispatch_draw_pattern(FormatId id, const char* pattern_name,
+                           const Params& params,
+                           const pixpat_buffer* dst, size_t W, size_t H,
+                           size_t by_start, size_t by_end, ColorSpec spec);
+
+// Bridge into pixpat_convert.cpp (defined there iff PIXPAT_FEATURE_CONVERT).
+void dispatch_convert(FormatId src_id, FormatId dst_id,
+                      const pixpat_buffer* src, const pixpat_buffer* dst,
+                      size_t W, size_t H,
+                      size_t by_start, size_t by_end, ColorSpec spec);
+
+} // namespace pixpat
diff --git a/subprojects/pixpat/pixpat-native/src/pixpat_pattern.cpp b/subprojects/pixpat/pixpat-native/src/pixpat_pattern.cpp
new file mode 100644
index 0000000..e8ac780
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/pixpat_pattern.cpp
@@ -0,0 +1,168 @@
+// Pattern-feature TU: built only when PIXPAT_FEATURE_PATTERN is on
+// (controlled by the meson source list). pixpat.cpp's pixpat_draw_pattern
+// entry calls into dispatch_draw_pattern() below via if-constexpr; when
+// the feature is off this file isn't compiled, the discarded if-constexpr
+// branch emits no symbol reference, and the .so simply lacks these
+// symbols.
+
+#include <cassert>
+#include <cstdint>
+#include <string_view>
+#include <type_traits>
+#include <vector>
+
+#include "color.h"
+#include "error.h"
+#include "params.h"
+#include "pattern.h"
+#include "pattern_catalog.h"
+#include "pipeline.h"
+#include "pixpat_internal.h"
+
+namespace pixpat
+{
+
+// Generated: s_pattern_* enable flags + DefaultPattern alias. Included
+// inside namespace pixpat so the unqualified FormatId / s_format_catalog_count
+// references resolve.
+#include "pixpat_caps.inc"
+
+// Cold pattern path. For each band of sink-block-height rows, sample
+// the Pattern into a per-thread normalized buffer in the pattern's own
+// color kind, convert the band in place when the sink is of the other
+// kind, then pack it through the destination's s_format_info entry.
+// Structured like run_norm in pixpat_convert.cpp.
+template <typename Pattern>
+static void run_pattern_norm(const Pattern& pat,
+                             FormatId dst_id, const pixpat_buffer* dst,
+                             size_t W, size_t H,
+                             size_t by_start, size_t by_end,
+                             ColorSpec spec)
+{
+	using Pixel = typename Pattern::Pixel;
+	constexpr bool kRgbPattern = std::is_same_v<Pixel, RGB16>;
+	constexpr ColorKind kOtherKind = kRgbPattern ? ColorKind::YUV : ColorKind::RGB;
+
+	const FormatInfo& sink = s_format_info[size_t(dst_id)];
+	const size_t rows = sink.snk_block_h;
+	const size_t count = rows * W;
+	// pixpat_draw_pattern has already checked both alignments.
+	assert(W % sink.snk_block_w == 0 && H % rows == 0);
+
+	thread_local std::vector<uint8_t> norm;
+	norm.resize(count * sizeof(RGB16));   // RGB16 / YUV16 same size
+	uint8_t* const band = norm.data();
+
+	const bool need_xfm = sink.kind == kOtherKind;
+	const ColorCoeffs coeffs = coeffs_for(spec);
+	for (size_t y = by_start; y < by_end; y += rows) {
+		Pixel* out = reinterpret_cast<Pixel*>(band);
+		for (size_t r = 0; r < rows; ++r)
+			for (size_t x = 0; x < W; ++x)
+				*out++ = pat.sample(x, y + r, W, H);
+		if (need_xfm) {
+			if constexpr (kRgbPattern)
+				norm_rgb_to_yuv(band, count, coeffs);
+			else
+				norm_yuv_to_rgb(band, count, coeffs);
+		}
+		sink.pack(dst, band, y, W);
+	}
+}
+
+// Build one Pattern, reject it if it exposes ready() and that returns
+// false, then run it over rows [by_start, by_end). A Pattern that is
+// constructible from (Params, ColorSpec) also receives the call's
+// ColorSpec; any other Pattern is constructed from Params alone.
+template <typename Pattern>
+static void run_one_pattern(const Params& params,
+                            FormatId id, const pixpat_buffer* dst,
+                            size_t W, size_t H,
+                            size_t by_start, size_t by_end,
+                            ColorSpec spec)
+{
+	constexpr bool kWantsSpec = std::is_constructible_v<Pattern, const Params&, ColorSpec>;
+	Pattern pat = [&]() -> Pattern {
+		if constexpr (kWantsSpec)
+			return Pattern(params, spec);
+		else
+			return Pattern(params);
+	}();
+	if constexpr (requires { pat.ready(); }) {
+		if (!pat.ready())
+			throw invalid_argument("pattern parameters not accepted");
+	}
+	run_pattern_norm(pat, id, dst, W, H, by_start, by_end, spec);
+}
+
+// Per-pattern dispatch arm. Templated on the catalog row's RGB and
+// YUV variants (either may be `void` if the pattern has no variant
+// in that kind). When both are present, the sink kind picks the
+// matching variant so the cross-kind pass is a no-op; when only one
+// is present, the pipeline runs the cross-kind pass for opposite-
+// kind sinks.
+//
+// Wrapping in a templated helper is what keeps the binary size down:
+// with `Enabled == false`, `if constexpr` discards the whole body,
+// and because try_pattern is itself a template, the discarded
+// branch is *not instantiated* — so disabled patterns emit no
+// code, and the `void` arms of partial patterns never instantiate
+// `Pattern::Pixel` or run_pattern_norm<void>.
+template <bool Enabled, typename Rgb, typename Yuv>
+static bool try_pattern(std::string_view name, std::string_view want,
+                        const Params& params,
+                        FormatId id, ColorKind sink_kind,
+                        const pixpat_buffer* dst,
+                        size_t W, size_t H,
+                        size_t by_start, size_t by_end,
+                        ColorSpec spec)
+{
+	if constexpr (!Enabled) {
+		return false;
+	} else {
+		constexpr bool kRgb = !std::is_void_v<Rgb>;
+		constexpr bool kYuv = !std::is_void_v<Yuv>;
+		static_assert(kRgb || kYuv, "pattern needs at least one variant");
+		if (name != want)
+			return false;
+		if constexpr (kRgb && kYuv) {
+			if (sink_kind == ColorKind::YUV)
+				run_one_pattern<Yuv>(params, id, dst, W, H,
+				                     by_start, by_end, spec);
+			else
+				run_one_pattern<Rgb>(params, id, dst, W, H,
+				                     by_start, by_end, spec);
+		} else if constexpr (kRgb) {
+			run_one_pattern<Rgb>(params, id, dst, W, H,
+			                     by_start, by_end, spec);
+		} else {
+			run_one_pattern<Yuv>(params, id, dst, W, H,
+			                     by_start, by_end, spec);
+		}
+		return true;
+	}
+}
+
+void dispatch_draw_pattern(FormatId id, const char* pattern_name,
+                           const Params& params,
+                           const pixpat_buffer* dst,
+                           size_t W, size_t H,
+                           size_t by_start, size_t by_end,
+                           ColorSpec spec)
+{ // Draws rows [by_start, by_end) with the first enabled catalog pattern whose name matches.
+	using namespace patterns; // presumably so the catalog's rgb / yuv type names resolve — confirm
+	// A NULL pattern_name is treated as "kmstest" (documented in pixpat.h per the original note).
+	const std::string_view name = pattern_name ? pattern_name : "kmstest";
+	const ColorKind kind = s_format_info[size_t(id)].kind; // sink kind picks the Rgb or Yuv variant
+
+#define X(label, rgb, yuv, str)                                                      \
+	if (try_pattern<s_pattern_caps[size_t(PatternId::label)].enabled, rgb, yuv>( \
+		    name, str, params, id, kind, dst, W, H, by_start, by_end, spec)) \
+	return;
+	PIXPAT_PATTERN_LIST(X) // one probe per catalog pattern; the first match returns
+#undef X
+
+	throw invalid_argument("unknown or disabled pattern name"); // no arm matched `name`
+}
+
+} // namespace pixpat
diff --git a/subprojects/pixpat/pixpat-native/src/threading.h b/subprojects/pixpat/pixpat-native/src/threading.h
new file mode 100644
index 0000000..5e7fc01
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/threading.h
@@ -0,0 +1,95 @@
+#pragma once
+
+#include <unistd.h>
+
+#include <cassert>
+#include <cstddef>
+#include <exception>
+#include <thread>
+#include <vector>
+
+namespace pixpat
+{
+
+inline unsigned default_thread_count()
+{
+	// Upper bound on workers: keeps per-stripe work meaningful and
+	// avoids heavy oversubscription on large NUMA hosts.
+	constexpr long kMaxThreads = 16;
+	const long online = sysconf(_SC_NPROCESSORS_ONLN);
+	// Fall back to a single thread if no positive CPU count is reported.
+	if (online < 1)
+		return 1;
+	return static_cast<unsigned>(online < kMaxThreads ? online : kMaxThreads);
+}
+
+/*
+ * Run `fn(start_y, end_y)` over `[0, height)` partitioned into stripes
+ * aligned to `v_sub`. Half-open ranges, matching the `for (by = 0;
+ * by < H; by += bh)` block-loop style.
+ *
+ * `fn` must be callable as `void(size_t start_y, size_t end_y)` and is
+ * invoked concurrently from multiple threads — it must be safe to call
+ * with disjoint Y-ranges in parallel. Exceptions from a worker, or from
+ * failing to start one, are captured and the first (by stripe index)
+ * is rethrown after all started workers join.
+ *
+ * When at most one stripe would be used, `fn` is called inline on the
+ * calling thread — no `std::thread` is spawned, no allocation occurs.
+ */
+template<typename F>
+void run_stripes(size_t height, unsigned v_sub, unsigned n_threads, F&& fn)
+{
+	if (height == 0 || v_sub == 0)
+		return;
+
+	// Callers (pixpat_convert / pixpat_draw_pattern) validate divisibility.
+	assert(height % v_sub == 0);
+
+	const size_t max_useful = height / v_sub;
+	if (static_cast<size_t>(n_threads) > max_useful)
+		n_threads = static_cast<unsigned>(max_useful);
+	if (n_threads <= 1) {	// 0 is treated like 1: run inline
+		fn(size_t{ 0 }, height);
+		return;
+	}
+
+	// Stripe height rounded up to v_sub; last stripe absorbs the remainder.
+	size_t part_height = (height + n_threads - 1) / n_threads;
+	part_height = (part_height + v_sub - 1) / v_sub * v_sub;
+
+	std::vector<std::exception_ptr> errors(n_threads);
+	std::vector<std::thread> workers;
+	workers.reserve(n_threads);
+
+	for (unsigned i = 0; i < n_threads; i++) {
+		const size_t start = i * part_height;
+		if (start >= height)
+			break;
+		const bool is_last = (i == n_threads - 1) || (height - start < part_height);
+		const size_t end = is_last ? height : start + part_height;
+		// std::thread's constructor throws if the thread cannot be started. Record
+		// that and stop spawning: unwinding past joinable `workers` would terminate.
+		try {
+			workers.emplace_back([&, i, start, end] {
+					try {
+						fn(start, end);
+					} catch (...) {
+						errors[i] = std::current_exception();
+					}
+				});
+		} catch (...) {
+			errors[i] = std::current_exception();
+			break;
+		}
+	}
+
+	for (auto& t : workers)
+		t.join();
+
+	for (auto& e : errors)
+		if (e)
+			std::rethrow_exception(e);
+}
+
+} // namespace pixpat