1 files changed, 242 insertions, 0 deletions
diff --git a/subprojects/pixpat/pixpat-native/src/io/semiplanar.h b/subprojects/pixpat/pixpat-native/src/io/semiplanar.h
new file mode 100644
index 0000000..00e7731
--- /dev/null
+++ b/subprojects/pixpat/pixpat-native/src/io/semiplanar.h
@@ -0,0 +1,242 @@
+#pragma once
+
+// 2-plane semiplanar YUV. Two flavours:
+//
+//   SemiplanarSource / SemiplanarSink — NV12/NV21/NV16/NV61, single
+//     pixel per Y storage word, single chroma pair per chroma word.
+//
+//   MultiPixelSemiplanarSource / MultiPixelSemiplanarSink — P030/P230,
+//     multiple Y pixels per Y word and multiple chroma pairs per
+//     chroma word. The Y plane has `ppw_y = component_count<Y>()` Y
+//     samples per storage word; the chroma plane has `pairs =
+//     component_count<U>()` U/V pairs per storage word. block_w =
+//     pairs × h_sub, block_h = v_sub — each block exactly fills one
+//     chroma word.
+
+#include <array>
+
+#include "../layout.h"
+#include "detail.h"
+
+namespace pixpat
+{
+
+template <typename L>
+struct SemiplanarSource {
+	using Layout = L;
+	using Pixel  = YUV16;
+
+	static_assert(L::kind == ColorKind::YUV);
+	static_assert(L::num_planes == 2);
+
+	using YP = typename L::template plane<0>;
+	using CP = typename L::template plane<1>;
+	static constexpr size_t y_idx = YP::template find_pos<C::Y>();
+	static constexpr size_t u_idx = CP::template find_pos<C::U>();
+	static constexpr size_t v_idx = CP::template find_pos<C::V>();
+
+	static YUV16 read(const Buffer<2>& buf, size_t x, size_t y,
+	                  [[maybe_unused]] size_t W,
+	                  [[maybe_unused]] size_t H) noexcept
+	{
+		const uint8_t* yp = buf.data[0] + y * buf.stride[0] + x * YP::bytes_per_pixel;
+		const auto y_vals = YP::unpack(detail::load_word<YP>(yp));
+
+		const size_t cx = x / L::h_sub;
+		const size_t cy = y / L::v_sub;
+		const uint8_t* cp = buf.data[1] + cy * buf.stride[1] + cx * CP::bytes_per_pixel;
+		const auto c_vals = CP::unpack(detail::load_word<CP>(cp));
+
+		return YUV16{
+		        detail::decode_norm(YP::comps[y_idx].bits, y_vals[y_idx]),
+		        detail::decode_norm(CP::comps[u_idx].bits, c_vals[u_idx]),
+		        detail::decode_norm(CP::comps[v_idx].bits, c_vals[v_idx]),
+		        uint16_t(0),
+		};
+	}
+};
+
+template <typename L>
+struct SemiplanarSink {
+	using Layout = L;
+	using Pixel  = YUV16;
+
+	static_assert(L::kind == ColorKind::YUV);
+	static_assert(L::num_planes == 2);
+
+	using YP = typename L::template plane<0>;
+	using CP = typename L::template plane<1>;
+	static constexpr size_t y_idx = YP::template find_pos<C::Y>();
+	static constexpr size_t u_idx = CP::template find_pos<C::U>();
+	static constexpr size_t v_idx = CP::template find_pos<C::V>();
+
+	static constexpr size_t block_h = L::v_sub;
+	static constexpr size_t block_w = L::h_sub;
+
+	static void write_block(Buffer<2>& buf, size_t bx, size_t by,
+	                        const YUV16 (&block)[block_h][block_w]) noexcept
+	{
+		// Y per pixel.
+		for (size_t dy = 0; dy < block_h; ++dy) {
+			uint8_t* y_row = buf.data[0] + (by + dy) * buf.stride[0];
+			for (size_t dx = 0; dx < block_w; ++dx) {
+				std::array<uint16_t, YP::num_comps> v{};
+				v[y_idx] = detail::encode_norm(YP::comps[y_idx].bits,
+				                               block[dy][dx].y);
+				detail::store_word<YP>(
+					y_row + (bx + dx) * YP::bytes_per_pixel,
+					YP::pack(v));
+			}
+		}
+
+		// One averaged UV pair for the whole block. Integer truncation
+		// (no round-half-up).
+		uint32_t u_sum = 0, v_sum = 0;
+		for (size_t dy = 0; dy < block_h; ++dy) {
+			for (size_t dx = 0; dx < block_w; ++dx) {
+				u_sum += block[dy][dx].u;
+				v_sum += block[dy][dx].v;
+			}
+		}
+		constexpr uint32_t n = block_h * block_w;
+		const uint16_t u_avg = uint16_t(u_sum / n);
+		const uint16_t v_avg = uint16_t(v_sum / n);
+
+		std::array<uint16_t, CP::num_comps> uv{};
+		uv[u_idx] = detail::encode_norm(CP::comps[u_idx].bits, u_avg);
+		uv[v_idx] = detail::encode_norm(CP::comps[v_idx].bits, v_avg);
+
+		const size_t cx = bx / L::h_sub;
+		const size_t cy = by / L::v_sub;
+		uint8_t* cp = buf.data[1] + cy * buf.stride[1] + cx * CP::bytes_per_pixel;
+		detail::store_word<CP>(cp, CP::pack(uv));
+	}
+};
+
+// Multi-pixel-per-word semiplanar (P030: 4:2:0, P230: 4:2:2). All Y
+// components share the same bit width; same for U and V.
+template <typename L>
+struct MultiPixelSemiplanarSource {
+	using Layout = L;
+	using Pixel  = YUV16;
+
+	static_assert(L::kind == ColorKind::YUV);
+	static_assert(L::num_planes == 2);
+
+	using YP = typename L::template plane<0>;
+	using CP = typename L::template plane<1>;
+	static constexpr size_t ppw_y = YP::template component_count<C::Y>();
+	static constexpr size_t pairs = CP::template component_count<C::U>();
+	static_assert(ppw_y >= 1 && pairs >= 1);
+	static_assert(pairs == CP::template component_count<C::V>());
+
+	// All same-tag positions share the same bit width.
+	static constexpr unsigned y_bits = YP::comps[YP::template find_pos<C::Y>(0)].bits;
+	static constexpr unsigned u_bits = CP::comps[CP::template find_pos<C::U>(0)].bits;
+	static constexpr unsigned v_bits = CP::comps[CP::template find_pos<C::V>(0)].bits;
+
+	static YUV16 read(const Buffer<2>& buf, size_t x, size_t y,
+	                  [[maybe_unused]] size_t W,
+	                  [[maybe_unused]] size_t H) noexcept
+	{
+		// Y read.
+		const size_t y_gx  = x / ppw_y;
+		const size_t y_off = x % ppw_y;
+		const uint8_t* yp = buf.data[0] + y * buf.stride[0]
+		                    + y_gx * YP::bytes_per_pixel;
+		const auto y_vals = YP::unpack(detail::load_word<YP>(yp));
+
+		// Chroma read.
+		const size_t cx    = x / L::h_sub;
+		const size_t cy    = y / L::v_sub;
+		const size_t c_gx  = cx / pairs;
+		const size_t c_off = cx % pairs;
+		const uint8_t* cp = buf.data[1] + cy * buf.stride[1]
+		                    + c_gx * CP::bytes_per_pixel;
+		const auto c_vals = CP::unpack(detail::load_word<CP>(cp));
+
+		return YUV16{
+		        detail::decode_norm(y_bits, y_vals[YP::template find_pos<C::Y>(y_off)]),
+		        detail::decode_norm(u_bits, c_vals[CP::template find_pos<C::U>(c_off)]),
+		        detail::decode_norm(v_bits, c_vals[CP::template find_pos<C::V>(c_off)]),
+		        uint16_t(0),
+		};
+	}
+};
+
+template <typename L>
+struct MultiPixelSemiplanarSink {
+	using Layout = L;
+	using Pixel  = YUV16;
+
+	static_assert(L::kind == ColorKind::YUV);
+	static_assert(L::num_planes == 2);
+
+	using YP = typename L::template plane<0>;
+	using CP = typename L::template plane<1>;
+	static constexpr size_t ppw_y = YP::template component_count<C::Y>();
+	static constexpr size_t pairs = CP::template component_count<C::U>();
+	static_assert(ppw_y >= 1 && pairs >= 1);
+
+	// One block exactly fills one chroma word: `pairs` chroma pairs,
+	// each covering h_sub luma columns × v_sub rows.
+	static constexpr size_t block_w = pairs * L::h_sub;
+	static constexpr size_t block_h = L::v_sub;
+	static_assert(block_w % ppw_y == 0,
+	              "block width must be a multiple of Y-pixels-per-word");
+	static constexpr size_t y_words_per_row = block_w / ppw_y;
+
+	// All same-tag positions share the same bit width.
+	static constexpr unsigned y_bits = YP::comps[YP::template find_pos<C::Y>(0)].bits;
+	static constexpr unsigned u_bits = CP::comps[CP::template find_pos<C::U>(0)].bits;
+	static constexpr unsigned v_bits = CP::comps[CP::template find_pos<C::V>(0)].bits;
+
+	static void write_block(Buffer<2>& buf, size_t bx, size_t by,
+	                        const YUV16 (&block)[block_h][block_w]) noexcept
+	{
+		// Y plane: y_words_per_row Y-words per row, block_h rows.
+		for (size_t dy = 0; dy < block_h; ++dy) {
+			uint8_t* y_row = buf.data[0]
+			                 + (by + dy) * buf.stride[0];
+			for (size_t w = 0; w < y_words_per_row; ++w) {
+				std::array<uint16_t, YP::num_comps> v{};
+				for (size_t i = 0; i < ppw_y; ++i) {
+					const size_t pos = YP::template find_pos<C::Y>(i);
+					v[pos] = detail::encode_norm(
+						y_bits, block[dy][w * ppw_y + i].y);
+				}
+				detail::store_word<YP>(
+					y_row + (bx / ppw_y + w)
+					* YP::bytes_per_pixel,
+					YP::pack(v));
+			}
+		}
+
+		// One UV-word: `pairs` chroma pairs. Each pair averages h_sub
+		// horizontally × v_sub vertically luma values.
+		std::array<uint16_t, CP::num_comps> uv{};
+		constexpr uint32_t n = L::h_sub * L::v_sub;
+		for (size_t p = 0; p < pairs; ++p) {
+			uint32_t u_sum = 0, v_sum = 0;
+			for (size_t dy = 0; dy < block_h; ++dy) {
+				for (size_t dx = 0; dx < L::h_sub; ++dx) {
+					u_sum += block[dy][p * L::h_sub + dx].u;
+					v_sum += block[dy][p * L::h_sub + dx].v;
+				}
+			}
+			uv[CP::template find_pos<C::U>(p)] =
+				detail::encode_norm(u_bits, uint16_t(u_sum / n));
+			uv[CP::template find_pos<C::V>(p)] =
+				detail::encode_norm(v_bits, uint16_t(v_sum / n));
+		}
+
+		const size_t cy = by / L::v_sub;
+		const size_t uv_word_idx = bx / block_w;
+		detail::store_word<CP>(
+			buf.data[1] + cy * buf.stride[1]
+			+ uv_word_idx * CP::bytes_per_pixel,
+			CP::pack(uv));
+	}
+};
+
+} // namespace pixpat