From ff87b61e5819eeaa4a29a774c1d3d39a83f56adf Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Sat, 4 Jul 2026 23:07:38 +0200 Subject: [PATCH 1/8] PDF: decode reverse-solidus escapes in string literals The literal-string parser appended the raw character after a backslash for non-octal escapes, so `\n \r \t \b \f` yielded the letters 'n' 'r' 't' 'b' 'f' instead of their control bytes (ISO 32000-1 7.3.4.2, Table 3). Any glyph whose byte code had one of those low bytes (common in 2-byte CFF CID codes) decoded to the wrong code, producing .notdef boxes and garbled selectable text. Translate the full escape set and handle `\`-before-EOL line continuation. Covered by new PdfObjectParser string tests. Co-Authored-By: Claude Opus 4.8 --- src/odr/internal/pdf/AGENTS.md | 13 +++--- src/odr/internal/pdf/pdf_object_parser.cpp | 33 +++++++++++++- test/src/internal/pdf/pdf_object_parser.cpp | 48 +++++++++++++++++++++ 3 files changed, 88 insertions(+), 6 deletions(-) diff --git a/src/odr/internal/pdf/AGENTS.md b/src/odr/internal/pdf/AGENTS.md index 3d80eeaa..bb3e3c2c 100644 --- a/src/odr/internal/pdf/AGENTS.md +++ b/src/odr/internal/pdf/AGENTS.md @@ -31,8 +31,10 @@ inline SVG per page. Experimental and not production-quality. (`DecoderEngine::odr`); `is_decodable()` returns `false` and `file_meta()` carries only the file type. All parsing is lazy, on HTML request. - **Object syntax**: null, booleans, integers/reals, names (incl. `#xx` - escapes), literal strings (`\` and `\ooo` escapes), hex strings, arrays, - dictionaries, indirect references (`n g R`) — standalone and nested. + escapes), literal strings (the `\n \r \t \b \f` control escapes, `\ddd` + octal, escaped delimiters, and `\`-before-EOL line continuation — Table 3), + hex strings, arrays, dictionaries, indirect references (`n g R`) — + standalone and nested. - **File structure**: header, `n g obj … endobj`, `stream` payloads (via `/Length`, with a scan-to-`endstream` fallback), classic `xref` tables, `trailer`, `startxref`, `%%EOF`; both sequential reading (`read_entry`) and @@ -565,9 +567,10 @@ the tables land. there is no `/ToUnicode` (with an unreachable glyph staying unmapped) and a `/ToUnicode` CMap taking precedence over the reverse map. -No assertion-based coverage of the tokenizer (escapes, references, hex strings) -or the HTML output itself (the span emission / CSS transform mapping, incl. the -dual-layer glyph/Unicode emission). +The tokenizer's string parsing is covered (`PdfObjectParser`: literal-string +control/octal/delimiter/line-continuation escapes and hex strings); references +and the HTML output itself (the span emission / CSS transform mapping, incl. the +dual-layer glyph/Unicode emission) are not yet asserted. --- diff --git a/src/odr/internal/pdf/pdf_object_parser.cpp b/src/odr/internal/pdf/pdf_object_parser.cpp index 357cf5ea..924ae24b 100644 --- a/src/odr/internal/pdf/pdf_object_parser.cpp +++ b/src/odr/internal/pdf/pdf_object_parser.cpp @@ -368,7 +368,38 @@ std::variant ObjectParser::read_string() { string += three_octet_to_char(octet[0], octet[1], octet[2]); } else { bumpc(); - string += c; + // Reverse-solidus escapes (ISO 32000-1 7.3.4.2, Table 3): the control + // escapes translate to their byte value, a backslash before an + // end-of-line marker is a line continuation (both elided), and any + // other escaped character (including `(`, `)`, `\`) stands for + // itself. + switch (c) { + case 'n': + string += '\n'; + break; + case 'r': + string += '\r'; + break; + case 't': + string += '\t'; + break; + case 'b': + string += '\b'; + break; + case 'f': + string += '\f'; + break; + case '\n': + break; + case '\r': + if (geti() == '\n') { + bumpc(); + } + break; + default: + string += c; + break; + } } continue; } diff --git a/test/src/internal/pdf/pdf_object_parser.cpp b/test/src/internal/pdf/pdf_object_parser.cpp index 8d56fc85..25b7f5e7 100644 --- a/test/src/internal/pdf/pdf_object_parser.cpp +++ b/test/src/internal/pdf/pdf_object_parser.cpp @@ -16,6 +16,12 @@ std::string read_hex_string(const std::string &input) { return std::get(parser.read_string()).string; } +std::string read_standard_string(const std::string &input) { + std::istringstream in(input); + ObjectParser parser(in); + return std::get(parser.read_string()).string; +} + Real read_number(const std::string &input) { std::istringstream in(input); ObjectParser parser(in); @@ -128,6 +134,48 @@ TEST(PdfObjectParser, skip_past) { Result(true, "X")); } +// 7.3.4.2: a literal string is the bytes between balanced parentheses. +TEST(PdfObjectParser, standard_string_basic) { + EXPECT_EQ(read_standard_string("(Hello)"), "Hello"); + EXPECT_EQ(read_standard_string("()"), ""); + EXPECT_EQ(read_standard_string("(a b\tc)"), "a b\tc"); +} + +// 7.3.4.2, Table 3: the control escapes translate to their byte value. A CFF +// CID font's code string carries e.g. `\b` for the byte 0x08, so failing to +// translate it corrupts the code (the encrypted_fontfile3_opentype regression). +TEST(PdfObjectParser, standard_string_control_escapes) { + EXPECT_EQ(read_standard_string("(a\\nb)"), "a\nb"); + EXPECT_EQ(read_standard_string("(a\\rb)"), "a\rb"); + EXPECT_EQ(read_standard_string("(a\\tb)"), "a\tb"); + EXPECT_EQ(read_standard_string("(a\\bb)"), "a\bb"); + EXPECT_EQ(read_standard_string("(a\\fb)"), "a\fb"); + // a two-byte code <00 08> written with the byte 0x08 escaped as `\b` + EXPECT_EQ(read_standard_string(std::string("(\000\\b)", 5)), + std::string("\000\b", 2)); +} + +// 7.3.4.2: an escaped `(`, `)` or `\` stands for itself, and any other escaped +// character keeps the character while dropping the backslash. +TEST(PdfObjectParser, standard_string_literal_escapes) { + EXPECT_EQ(read_standard_string("(a\\(b\\)c)"), "a(b)c"); + EXPECT_EQ(read_standard_string("(a\\\\b)"), "a\\b"); + EXPECT_EQ(read_standard_string("(a\\qb)"), "aqb"); +} + +// 7.3.4.2: a `\ddd` octal escape (1-3 digits) is the byte of that value. +TEST(PdfObjectParser, standard_string_octal_escape) { + EXPECT_EQ(read_standard_string("(\\101)"), "A"); + EXPECT_EQ(read_standard_string("(\\000)"), std::string("\000", 1)); +} + +// 7.3.4.2: a backslash before an end-of-line marker is a line continuation; +// both the backslash and the marker are dropped. +TEST(PdfObjectParser, standard_string_line_continuation) { + EXPECT_EQ(read_standard_string("(a\\\nb)"), "ab"); + EXPECT_EQ(read_standard_string("(a\\\r\nb)"), "ab"); +} + // 7.3.4.3: a hex string is the bytes of its hex digits, with whitespace ignored // and an odd final digit assumed to be followed by a 0. TEST(PdfObjectParser, hex_string_basic) { From 1ac93055b16c0b67c467fac73bcf07c94fee87cb Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Sat, 4 Jul 2026 23:07:58 +0200 Subject: [PATCH 2/8] PDF: parse composite /Encoding CMap for CID mapping An embedded Type0 /Encoding CMap stream is now parsed (cidchar/cidrange -> Font::cid_encoding) so Font::codes() maps char codes to CIDs through it and uses the CMap's authoritative codespace. This fixes producers that mix a 1-byte code (e.g. a space) among 2-byte CIDs: without the CMap the space fell through identity to the wrong glyph and advance. CMap gains cid_for_code / add_cid_range / has_cid_map; the parser gains read_cidchar / read_cidrange. Co-Authored-By: Claude Opus 4.8 --- src/odr/internal/pdf/AGENTS.md | 6 +- src/odr/internal/pdf/pdf_cmap.cpp | 38 +++++- src/odr/internal/pdf/pdf_cmap.hpp | 51 ++++++++ src/odr/internal/pdf/pdf_cmap_parser.cpp | 56 +++++++++ src/odr/internal/pdf/pdf_cmap_parser.hpp | 2 + src/odr/internal/pdf/pdf_document_element.hpp | 119 ++++++++++++++---- src/odr/internal/pdf/pdf_document_parser.cpp | 19 ++- 7 files changed, 262 insertions(+), 29 deletions(-) diff --git a/src/odr/internal/pdf/AGENTS.md b/src/odr/internal/pdf/AGENTS.md index bb3e3c2c..08e90aef 100644 --- a/src/odr/internal/pdf/AGENTS.md +++ b/src/odr/internal/pdf/AGENTS.md @@ -105,7 +105,11 @@ inline SVG per page. Experimental and not production-quality. (Type0) fonts** are recognized: the descendant CIDFont's `/CIDSystemInfo` `/Registry`/`/Ordering` is recorded on the `Font`, and the Type0 `/Encoding` (a code → CID CMap such as `Identity-H`) is kept out of the - simple-font encoding path. Extraction is driven by the `/ToUnicode` CMap (the + simple-font encoding path. An *embedded* `/Encoding` CMap stream is parsed + (`cidchar`/`cidrange` → `Font::cid_encoding`) so `Font::codes()` yields CIDs + through it — this also carries the authoritative codespace, so a producer that + mixes a 1-byte code (e.g. a space) among 2-byte CIDs stays aligned and selects + the right glyph/advance. Extraction is driven by the `/ToUnicode` CMap (the common case — every Type0 font in the corpus carries one). When a composite font has no `/ToUnicode`, a **predefined Unicode `/Encoding`** — the `Uni*-UCS2/UTF16/UTF32` CMaps — is decoded directly (`pdf_cid`), since those diff --git a/src/odr/internal/pdf/pdf_cmap.cpp b/src/odr/internal/pdf/pdf_cmap.cpp index af03e080..aa14c682 100644 --- a/src/odr/internal/pdf/pdf_cmap.cpp +++ b/src/odr/internal/pdf/pdf_cmap.cpp @@ -15,9 +15,36 @@ void CMap::map_single(std::string code, std::u16string unicode) { m_map[std::move(code)] = std::move(unicode); } -std::size_t CMap::code_length(const std::string &codes, - const std::size_t pos) const { - const auto first = static_cast(codes[pos]); +void CMap::map_cid_char(std::string code, const std::uint32_t cid) { + m_cid_chars[std::move(code)] = cid; +} + +void CMap::add_cid_range(const std::uint32_t low, const std::uint32_t high, + const std::uint32_t base_cid, + const std::size_t width) { + m_cid_ranges.push_back({low, high, base_cid, width}); +} + +std::optional +CMap::cid_for_code(const std::string_view code) const { + if (const auto it = m_cid_chars.find(std::string(code)); + it != m_cid_chars.end()) { + return it->second; + } + std::uint32_t value = 0; + for (const char c : code) { + value = (value << 8) | static_cast(c); + } + for (const CidRange &range : m_cid_ranges) { + if (range.width == code.size() && value >= range.low && + value <= range.high) { + return range.base_cid + (value - range.low); + } + } + return std::nullopt; +} + +std::size_t CMap::code_width(const std::uint8_t first) const { for (const CodespaceRange &range : m_codespace_ranges) { if (first >= static_cast(range.low.front()) && first <= static_cast(range.high.front())) { @@ -30,6 +57,11 @@ std::size_t CMap::code_length(const std::string &codes, return 1; } +std::size_t CMap::code_length(const std::string &codes, + const std::size_t pos) const { + return code_width(static_cast(codes[pos])); +} + std::string CMap::translate_string(const std::string &codes) const { std::u16string result; diff --git a/src/odr/internal/pdf/pdf_cmap.hpp b/src/odr/internal/pdf/pdf_cmap.hpp index 65ee351d..2dbc62bd 100644 --- a/src/odr/internal/pdf/pdf_cmap.hpp +++ b/src/odr/internal/pdf/pdf_cmap.hpp @@ -1,7 +1,10 @@ #pragma once #include +#include +#include #include +#include #include #include @@ -18,12 +21,51 @@ class CMap { void add_codespace_range(std::string low_code, std::string high_code); void map_single(std::string code, std::u16string unicode); + /// CID mapping (a composite font's `/Encoding` CMap stream, ISO 32000-1 + /// 9.7.5.3): `cidchar` maps a single code, `cidrange` maps a contiguous block + /// (`base_cid + (code - low)`). Codes are keyed by their raw big-endian bytes + /// / width so a 1-byte and a 2-byte code with the same numeric value stay + /// distinct (`<20>` -> CID 229 vs `<0020>` -> CID 32), the mixed-width case. + void map_cid_char(std::string code, std::uint32_t cid); + void add_cid_range(std::uint32_t low, std::uint32_t high, + std::uint32_t base_cid, std::size_t width); + /// True when no code -> Unicode mapping was parsed (e.g. the font carries no /// `ToUnicode` CMap); the caller then falls back to the `/Encoding`. [[nodiscard]] bool empty() const { return m_map.empty(); } + /// True when at least one codespace range was declared. When true, the code + /// widths this CMap implies are authoritative for splitting a code string + /// (see `code_width`); when false, callers fall back to a fixed width. + [[nodiscard]] bool has_codespace() const { + return !m_codespace_ranges.empty(); + } + + /// Byte width of a code whose first byte is `first`, decided by the codespace + /// ranges (matched on the first byte, ISO 32000-1 9.7.6.2). Falls back to a + /// single byte when no range declares/matches it. This is the variable-width + /// split that `translate_string` uses; exposing it lets the glyph/advance + /// paths split codes identically, so a mixed 1-/2-byte codespace (e.g. a + /// 1-byte space among 2-byte CIDs) stays aligned across both. + [[nodiscard]] std::size_t code_width(std::uint8_t first) const; + [[nodiscard]] std::string translate_string(const std::string &codes) const; + /// True when at least one `cidchar`/`cidrange` mapping was parsed (an + /// embedded CID `/Encoding` CMap). When false the composite code -> CID is + /// identity + /// (`Identity-H/V`). + [[nodiscard]] bool has_cid_map() const { + return !m_cid_chars.empty() || !m_cid_ranges.empty(); + } + + /// The CID a code (raw big-endian bytes) selects, or `nullopt` when no + /// `cidchar`/`cidrange` covers it; the caller then falls back to identity + /// (CID = code). The width of `code` is matched, so a 1-byte code never hits + /// a 2-byte range. + [[nodiscard]] std::optional + cid_for_code(std::string_view code) const; + private: struct CodespaceRange { // `low` and `high` share the same length; that length is the code width in @@ -32,8 +74,17 @@ class CMap { std::string high; }; + struct CidRange { + std::uint32_t low; // numeric value of the low code + std::uint32_t high; // numeric value of the high code + std::uint32_t base_cid; // CID of the low code + std::size_t width; // byte width of the codes (to disambiguate widths) + }; + std::vector m_codespace_ranges; std::unordered_map m_map; + std::unordered_map m_cid_chars; + std::vector m_cid_ranges; /// Byte width of the code starting at `pos`, decided by the codespace ranges; /// falls back to a single byte when no range declares/matches it. diff --git a/src/odr/internal/pdf/pdf_cmap_parser.cpp b/src/odr/internal/pdf/pdf_cmap_parser.cpp index 008d6768..24c01721 100644 --- a/src/odr/internal/pdf/pdf_cmap_parser.cpp +++ b/src/odr/internal/pdf/pdf_cmap_parser.cpp @@ -228,6 +228,58 @@ void CMapParser::read_bfrange(const std::uint32_t n, CMap &cmap) { } } +void CMapParser::read_cidchar(const std::uint32_t n, CMap &cmap) { + m_parser.skip_whitespace(); + for (std::uint32_t i = 0; i < n; ++i) { + std::string code = m_parser.read_object().as_string(); + m_parser.skip_whitespace(); + const Object cid = m_parser.read_object(); + m_parser.skip_whitespace(); + + if (!valid_code_width(code.size()) || !cid.is_integer()) { + ODR_WARNING(*m_logger, "pdf: skipping malformed cidchar entry (code " + << code.size() << " bytes)"); + continue; // skip a malformed mapping, keep the rest of the CMap + } + cmap.map_cid_char(std::move(code), + static_cast(cid.as_integer())); + } +} + +void CMapParser::read_cidrange(const std::uint32_t n, CMap &cmap) { + m_parser.skip_whitespace(); + for (std::uint32_t i = 0; i < n; ++i) { + std::string low = m_parser.read_object().as_string(); + m_parser.skip_whitespace(); + std::string high = m_parser.read_object().as_string(); + m_parser.skip_whitespace(); + const Object cid = m_parser.read_object(); + m_parser.skip_whitespace(); + + if (low.size() != high.size() || !valid_code_width(low.size()) || + !cid.is_integer()) { + ODR_WARNING(*m_logger, "pdf: skipping out-of-spec cidrange (low " + << low.size() << " bytes, high " << high.size() + << " bytes)"); + continue; // skip an out-of-spec range, keep parsing the rest + } + const std::uint32_t low_code = code_to_uint(low); + const std::uint32_t high_code = code_to_uint(high); + if (high_code < low_code) { + ODR_WARNING(*m_logger, "pdf: skipping reversed cidrange (low 0x" + << std::hex << low_code << ", high 0x" + << high_code << ")"); + continue; // a reversed range would map codes to nonsense CIDs + } + // Unlike a `bfrange`, a `cidrange` commonly spans the whole 2-byte code + // space (the identity block), so it is stored as a range rather than + // materialized code-by-code; no per-range span cap is needed. + cmap.add_cid_range(low_code, high_code, + static_cast(cid.as_integer()), + low.size()); + } +} + CMap CMapParser::parse_cmap() { CMap cmap; @@ -253,6 +305,10 @@ CMap CMapParser::parse_cmap() { read_bfchar(last_int, cmap); } else if (command == "beginbfrange") { read_bfrange(last_int, cmap); + } else if (command == "begincidchar") { + read_cidchar(last_int, cmap); + } else if (command == "begincidrange") { + read_cidrange(last_int, cmap); } } } diff --git a/src/odr/internal/pdf/pdf_cmap_parser.hpp b/src/odr/internal/pdf/pdf_cmap_parser.hpp index b19f47bd..aa9e5289 100644 --- a/src/odr/internal/pdf/pdf_cmap_parser.hpp +++ b/src/odr/internal/pdf/pdf_cmap_parser.hpp @@ -32,6 +32,8 @@ class CMapParser { void read_codespacerange(std::uint32_t n, CMap &cmap); void read_bfchar(std::uint32_t n, CMap &cmap); void read_bfrange(std::uint32_t n, CMap &cmap); + void read_cidchar(std::uint32_t n, CMap &cmap); + void read_cidrange(std::uint32_t n, CMap &cmap); }; } // namespace odr::internal::pdf diff --git a/src/odr/internal/pdf/pdf_document_element.hpp b/src/odr/internal/pdf/pdf_document_element.hpp index a2d95182..5a04d029 100644 --- a/src/odr/internal/pdf/pdf_document_element.hpp +++ b/src/odr/internal/pdf/pdf_document_element.hpp @@ -243,11 +243,17 @@ struct Pattern final : Element { }; /// A non-owning view over a string of PDF character codes, splitting it into -/// fixed-width (`Font::code_byte_width()`) big-endian codes on iteration. Holds -/// only a `string_view`, so it must not outlive the underlying bytes; iterate -/// it directly (`for (std::uint32_t code : font.codes(...))`). Trailing bytes -/// shorter than a full code are dropped, matching the PDF text-showing -/// operators. +/// big-endian codes on iteration. Holds only a `string_view`, so it must not +/// outlive the underlying bytes; iterate it directly +/// (`for (std::uint32_t code : font.codes(...))`). Trailing bytes shorter than +/// a full code are dropped, matching the PDF text-showing operators. +/// +/// Code width is variable when a `codespace` CMap is supplied (its codespace +/// ranges pick each code's width from its first byte, ISO 32000-1 9.7.6.2) — so +/// a mixed 1-/2-byte encoding (a 1-byte space among 2-byte CIDs, as PDFClown +/// and other producers emit) stays aligned. Without a codespace (or when the +/// CMap declares none) it falls back to a fixed `Font::code_byte_width()`, +/// splitting the same way the fixed-width iterator historically did. class CodeRange { public: class Iterator { @@ -259,10 +265,24 @@ class CodeRange { using reference = std::uint32_t; Iterator() = default; - Iterator(const char *position, const std::size_t width) - : m_position{position}, m_width{width} {} + Iterator(const char *position, const char *end, const std::size_t width, + const CMap *codespace, const CMap *cid_map) + : m_position{position}, m_end{end}, m_fixed_width{width}, + m_codespace{codespace}, m_cid_map{cid_map} { + settle(); + } std::uint32_t operator*() const { + // A composite font's embedded CID `/Encoding` CMap maps the code to its + // CID (which is what the glyph/advance paths expect); without one the CID + // is the code itself (`Identity-H/V`), as it is for simple fonts. + if (m_cid_map != nullptr) { + if (const std::optional cid = + m_cid_map->cid_for_code(std::string_view(m_position, m_width)); + cid.has_value()) { + return *cid; + } + } std::uint32_t code = 0; for (std::size_t k = 0; k < m_width; ++k) { code = (code << 8) | static_cast(m_position[k]); @@ -272,6 +292,7 @@ class CodeRange { Iterator &operator++() { m_position += m_width; + settle(); return *this; } Iterator operator++(int) { @@ -285,32 +306,59 @@ class CodeRange { } private: + // Fix the width of the code at `m_position`, dropping a trailing partial + // code (fewer bytes than its declared width) by clamping to `m_end` so the + // iterator lands exactly on the end sentinel. + void settle() { + if (m_position >= m_end) { + m_position = m_end; + m_width = 0; + return; + } + const auto remaining = static_cast(m_end - m_position); + std::size_t width = m_fixed_width; + if (m_codespace != nullptr && m_codespace->has_codespace()) { + width = m_codespace->code_width(static_cast(*m_position)); + } + if (width == 0 || width > remaining) { + m_position = m_end; // drop the trailing partial code + m_width = 0; + return; + } + m_width = width; + } + const char *m_position{nullptr}; - std::size_t m_width{1}; + const char *m_end{nullptr}; + std::size_t m_fixed_width{1}; + const CMap *m_codespace{nullptr}; + const CMap *m_cid_map{nullptr}; + std::size_t m_width{0}; }; - CodeRange(const std::string_view codes, const std::size_t width) - : m_codes{codes}, m_width{width} {} + CodeRange(const std::string_view codes, const std::size_t width, + const CMap *codespace, const CMap *cid_map) + : m_codes{codes}, m_width{width}, m_codespace{codespace}, + m_cid_map{cid_map} {} // `data()` is used as a bounded byte range delimited by `end()`, not as a // null-terminated string; the suspicious-data-usage check does not apply. [[nodiscard]] Iterator begin() const { - return {m_codes.data(), - m_width}; // NOLINT(bugprone-suspicious-stringview-data-usage) + return {m_codes.data(), // NOLINT(bugprone-suspicious-stringview-data-usage) + m_codes.data() + m_codes.size(), m_width, m_codespace, m_cid_map}; } [[nodiscard]] Iterator end() const { - // round down to a whole number of codes - return { - m_codes.data() + - (m_codes.size() - - m_codes.size() % - m_width), // NOLINT(bugprone-suspicious-stringview-data-usage) - m_width}; + const char *stop = + m_codes.data() + // NOLINT(bugprone-suspicious-stringview-data-usage) + m_codes.size(); + return {stop, stop, m_width, m_codespace, m_cid_map}; } private: std::string_view m_codes; std::size_t m_width{}; + const CMap *m_codespace{nullptr}; + const CMap *m_cid_map{nullptr}; }; struct Font final : Element { @@ -336,6 +384,14 @@ struct Font final : Element { /// `Identity-H`, `UniGB-UCS2-H`); empty for an embedded CMap stream. Drives /// the predefined Unicode-CMap extraction path. std::string cid_encoding_name; + /// The composite font's `/Encoding` when it is an *embedded* CMap stream + /// (code -> CID plus codespace, ISO 32000-1 9.7.5.3); empty for a predefined + /// name (above) or a simple font. When it declares a CID map, `codes()` + /// yields CIDs through it — so a producer that mixes a 1-byte code (e.g. a + /// space) among 2-byte CIDs selects the right glyph and advance; when it + /// declares a codespace, that codespace (authoritative over `/ToUnicode`'s) + /// splits the code string. + CMap cid_encoding; /// Simple-font glyph metrics (ISO 32000-1 9.2.4): `/Widths`, in glyph space /// (1/1000 text-space units), indexed by `code - first_char`; codes outside @@ -362,10 +418,29 @@ struct Font final : Element { /// `Identity-H/V` and common CID case), 1 for simple fonts. [[nodiscard]] int code_byte_width() const { return composite ? 2 : 1; } - /// View `codes` as a sequence of character codes split per - /// `code_byte_width()`. The result borrows `codes`; do not outlive it. + /// View `codes` as a sequence of character codes, each yielded as the *CID* + /// it selects (identity when there is no CID map — the common case, and what + /// the glyph/advance paths expect). The result borrows `codes`; do not + /// outlive it. + /// + /// A composite font splits (and maps) by its embedded CID `/Encoding` CMap + /// when present — that CMap declares both the codespace and the code -> CID + /// map, so a producer mixing a 1-byte code among 2-byte CIDs (PDFClown et + /// al.) stays aligned and selects the right glyph. Failing that it splits by + /// the `ToUnicode` codespace (CIDs identity — the same split `to_unicode` + /// uses, keeping glyph/advance/Unicode consistent), and failing that by a + /// fixed 2 bytes (the common `Identity-H` case). Simple fonts split by a + /// fixed 1 byte with identity CIDs. [[nodiscard]] CodeRange codes(std::string_view codes) const { - return {codes, static_cast(code_byte_width())}; + const auto width = static_cast(code_byte_width()); + if (!composite) { + return {codes, width, nullptr, nullptr}; + } + const CMap *cid_map = cid_encoding.has_cid_map() ? &cid_encoding : nullptr; + const CMap *codespace = cid_encoding.has_codespace() ? &cid_encoding + : cmap.has_codespace() ? &cmap + : nullptr; + return {codes, width, codespace, cid_map}; } /// Embedded font: the SFNT parsed from `/FontFile2` (a simple TrueType font, diff --git a/src/odr/internal/pdf/pdf_document_parser.cpp b/src/odr/internal/pdf/pdf_document_parser.cpp index c5239ea8..f31fd64e 100644 --- a/src/odr/internal/pdf/pdf_document_parser.cpp +++ b/src/odr/internal/pdf/pdf_document_parser.cpp @@ -460,13 +460,26 @@ void parse_composite_font(DocumentParser &parser, const Dictionary &dictionary, font.composite = true; // The Type0 `/Encoding`: a predefined CMap name (`Identity-H`, - // `UniGB-UCS2-H`, …) or an embedded CMap stream. Record the name; the stream - // case is left empty (deferred). Drives the predefined Unicode-CMap path in - // `Font::to_unicode`. + // `UniGB-UCS2-H`, …) or an embedded CMap stream. A name drives the predefined + // Unicode-CMap path in `Font::to_unicode`; a stream is parsed into a + // code -> CID CMap (`Font::cid_encoding`) so `codes()` yields the right CIDs + // for a mixed-width encoding (ISO 32000-1 9.7.5.3). if (dictionary.has_key("Encoding")) { const Object encoding = parser.resolve_object_copy(dictionary["Encoding"]); if (encoding.is_name()) { font.cid_encoding_name = encoding.as_name(); + } else if (dictionary["Encoding"].is_reference()) { + try { + const std::string stream = + parser.read_decoded_stream(dictionary["Encoding"].as_reference()); + util::stream::ViewStream ss(stream); + CMapParser cmap_parser(ss, parser.logger()); + font.cid_encoding = cmap_parser.parse_cmap(); + } catch (const std::exception &e) { + ODR_WARNING( + parser.logger(), + "pdf: failed to read composite /Encoding CMap: " << e.what()); + } } } From 156258fde633a9c6ce8894881fcc4b18b2bf9860 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Sat, 4 Jul 2026 23:30:55 +0200 Subject: [PATCH 3/8] PDF: pin substitute-font baseline with @font-face metric overrides MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A non-embedded font renders in a local system font whose hhea/OS-2 metrics we don't control, so under line-height:1 the browser placed the glyph baseline at that font's ascent rather than the ascent_em we use to compute `top` — dropping e.g. the 120pt Times title well below the page bottom. Route each substitute through a generated @font-face ('odr-sN', src: local(...) of the family stack) carrying ascent-override/descent-override (summing to 1em) and line-gap-override:0, so the baseline is positioned from our metric. Faces are deduped by (family, style, ascent); the family stack is kept as a fallback. Co-Authored-By: Claude Opus 4.8 --- src/odr/internal/html/pdf_file.cpp | 116 +++++++++++++++++++++++++++-- src/odr/internal/pdf/AGENTS.md | 10 +++ 2 files changed, 121 insertions(+), 5 deletions(-) diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index 2622d01e..4089769c 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -142,6 +142,105 @@ std::string font_substitute_declaration(const pdf::FontSubstitute &substitute) { return declaration; } +/// The `local(...)` sources of a CSS `font-family` stack, dropping the generic +/// keywords an `@font-face src` cannot name. Returns e.g. +/// "local('Times New Roman'),local(Times)" for "'Times New Roman',Times,serif", +/// or "" when the stack names no concrete font (generic-only). +std::string local_font_sources(const std::string_view css_family) { + static constexpr std::array generics = { + "serif", "sans-serif", "monospace", "cursive", "fantasy", "system-ui"}; + std::string src; + std::size_t start = 0; + while (start <= css_family.size()) { + const std::size_t comma = css_family.find(',', start); + std::string_view name = css_family.substr( + start, comma == std::string_view::npos ? css_family.size() - start + : comma - start); + while (!name.empty() && name.front() == ' ') { + name.remove_prefix(1); + } + while (!name.empty() && name.back() == ' ') { + name.remove_suffix(1); + } + const bool generic = + std::find(generics.begin(), generics.end(), name) != generics.end(); + if (!name.empty() && !generic) { + if (!src.empty()) { + src += ','; + } + src += "local("; + src += name; + src += ')'; + } + if (comma == std::string_view::npos) { + break; + } + start = comma + 1; + } + return src; +} + +/// Registers one `@font-face` per (substitute family, style, ascent) that +/// overrides the face's ascent/descent so a glyph's baseline lands exactly at +/// the `top` `add_position_classes` derives from `ascent_em` — independent of +/// the metrics of whichever local font actually resolves. Without the override +/// the browser positions the baseline using the resolved font's own ascent, +/// which for a large non-embedded run (e.g. a 120pt Times title) drops it well +/// below the intended baseline. +class SubstituteFontFaces { +public: + /// The `font-family:...` (plus weight/style) declaration for `substitute`, + /// routed through a generated metric-overriding face `'odr-sN'`. Falls back + /// to the plain family stack when the stack names no concrete font. + std::string declaration(const pdf::FontSubstitute &substitute, + const double ascent_em) { + const std::string src = local_font_sources(substitute.css_family); + if (src.empty()) { + return font_substitute_declaration(substitute); + } + // ascent-override + descent-override sum to one em, so `line-height:1` + // leaves no leading and the baseline sits at exactly `ascent_em` of the em + // box. `ascent_em` is clamped to [0.5, 1.2]; the `max` keeps descent + // non-negative for the rare ascent > 1 (a slight baseline approximation). + const double ascent = ascent_em; + const double descent = std::max(0.0, 1.0 - ascent_em); + std::ostringstream key; + key << src << '|' << substitute.bold << '|' << substitute.italic << '|' + << std::llround(ascent * 1000.0); + const auto [it, inserted] = m_index_by_key.try_emplace( + std::move(key).str(), static_cast(m_faces.size()) + 1); + if (inserted) { + std::ostringstream face; + face << "@font-face{font-family:'odr-s" << it->second << "';src:" << src + << ";ascent-override:" << round2(ascent * 100.0) + << "%;descent-override:" << round2(descent * 100.0) + << "%;line-gap-override:0%}"; + m_faces.push_back(std::move(face).str()); + } + std::string declaration = "font-family:'odr-s" + + std::to_string(it->second) + "'," + + substitute.css_family; + if (substitute.bold) { + declaration += ";font-weight:bold"; + } + if (substitute.italic) { + declaration += ";font-style:italic"; + } + return declaration; + } + + /// Appends the collected `@font-face` rules to `out`. + void append_faces(std::string &out) const { + for (const std::string &face : m_faces) { + out += face; + } + } + +private: + std::map m_index_by_key; + std::vector m_faces; +}; + /// Build an SVG `d` attribute from a path's subpaths, each point mapped through /// `to_box` (PDF user space -> the page box, y-down). Lines become `L`, cubic /// Béziers `C`, and an explicitly closed subpath ends with `Z`. @@ -926,6 +1025,7 @@ class HtmlServiceImpl final : public HtmlService { std::uint32_t family_count = 0; std::string font_faces; std::string font_styles; // per-font `.fvN` (visible) / `.fnN` (invisible) + SubstituteFontFaces substitute_faces; // metric-overriding `.odr-sN` faces std::vector accepted_fonts; // Which classes are used: [0]=fv (visible), [1]=fn (invisible). std::vector> font_class_used; @@ -1072,9 +1172,12 @@ class HtmlServiceImpl final : public HtmlService { run_classes += font_class(font_class_used, font, invisible); } else if (text.font != nullptr && text.font->substitute) { // Non-embedded font: render the real Unicode in the substitute - // family (embedded fonts carry the family in `font_class`). - add_class(run_classes, "ff", - font_substitute_declaration(*text.font->substitute)); + // family (embedded fonts carry the family in `font_class`). The + // metric-overriding face pins the baseline to `asc` (see + // `SubstituteFontFaces`). + add_class( + run_classes, "ff", + substitute_faces.declaration(*text.font->substitute, asc)); } if (vis_margin_pt != 0) { add_class(run_classes, "ml", pt_decl("margin-left", vis_margin_pt)); @@ -1242,6 +1345,7 @@ class HtmlServiceImpl final : public HtmlService { write_font_face(*accepted_fonts[i], i, {}, font_class_used[i], font_faces, font_styles); } + substitute_faces.append_faces(font_faces); // Write HTML. write_header_common(out, font_faces, font_styles, styles, [&] { @@ -1448,7 +1552,8 @@ class HtmlServiceImpl final : public HtmlService { std::uint32_t family_count = 0; std::string font_faces; - std::string font_styles; // ".fvN{...}" / ".fnN{...}" + std::string font_styles; // ".fvN{...}" / ".fnN{...}" + SubstituteFontFaces substitute_faces; // metric-overriding `.odr-sN` faces std::vector accepted_fonts; // Per-font, per-uchar, per-glyph occurrence count (pre-pass). // Indexed by font_index - 1. @@ -1661,7 +1766,7 @@ class HtmlServiceImpl final : public HtmlService { const std::string substitute_declaration = (font == 0 && !invisible && text.font != nullptr && text.font->substitute) - ? font_substitute_declaration(*text.font->substitute) + ? substitute_faces.declaration(*text.font->substitute, asc) : std::string(); std::ostringstream fk; fk << font << '|' << invisible << '|' << font_size_pt << '|' << cs_pt @@ -1742,6 +1847,7 @@ class HtmlServiceImpl final : public HtmlService { write_font_face(*accepted_fonts[i], i, used_unicode[i], font_class_used[i], font_faces, font_styles); } + substitute_faces.append_faces(font_faces); // ---- Pass 2: write HTML --------------------------------------------- write_header_common(out, font_faces, font_styles, styles, [&] { diff --git a/src/odr/internal/pdf/AGENTS.md b/src/odr/internal/pdf/AGENTS.md index 08e90aef..57747121 100644 --- a/src/odr/internal/pdf/AGENTS.md +++ b/src/odr/internal/pdf/AGENTS.md @@ -236,6 +236,16 @@ inline SVG per page. Experimental and not production-quality. `OS/2`/`hhea` (`font/cff_transform.cpp` `serialize_os2`/`serialize_hhea`; the SFNT path passes the originals through), so our ascent can match the browser's. + - **Non-embedded substitutes** render in a *local* system font whose + `hhea`/`OS/2` we do **not** control, so the box-top→baseline distance would + be that font's ascent, not our `ascent_em` — dropping e.g. a 120pt Times + title well below its intended baseline. `SubstituteFontFaces` (in + `pdf_file.cpp`) closes this by routing each substitute through a generated + `@font-face` (`'odr-sN'`, `src: local(...)` of the family stack) carrying + `ascent-override:ascent_em`, `descent-override:1−ascent_em`, + `line-gap-override:0` — so the browser positions the baseline from *our* + metric. Faces are deduped by (family, style, ascent); the family stack is + kept after `'odr-sN'` as a fallback for the rare unresolved local. - **`ascent_em`** (in `pdf_file.cpp`): FontDescriptor `/Ascent`, else the embedded font's `bounding_box().y_max / units_per_em()`, else `0.8` em (which matches `serialize_os2`'s degenerate 0.8/0.2 fallback, so the fallback font From 716995dea737e2eb12f729ab74d663133750fa91 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Sat, 4 Jul 2026 23:41:36 +0200 Subject: [PATCH 4/8] PDF: log unknown graphics operators via Logger instead of std::cerr Thread a Logger into GraphicsOperatorParser (mirroring CMapParser) and emit ODR_DEBUG for unrecognized operators rather than writing to std::cerr. Co-Authored-By: Claude Opus 4.8 --- src/odr/internal/pdf/pdf_graphics_operator_parser.cpp | 11 +++++++---- src/odr/internal/pdf/pdf_graphics_operator_parser.hpp | 6 +++++- src/odr/internal/pdf/pdf_page_extractor.cpp | 2 +- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/odr/internal/pdf/pdf_graphics_operator_parser.cpp b/src/odr/internal/pdf/pdf_graphics_operator_parser.cpp index 65eab8aa..12be34af 100644 --- a/src/odr/internal/pdf/pdf_graphics_operator_parser.cpp +++ b/src/odr/internal/pdf/pdf_graphics_operator_parser.cpp @@ -3,8 +3,9 @@ #include #include +#include + #include -#include #include #include @@ -193,8 +194,9 @@ using char_type = std::streambuf::char_type; using int_type = std::streambuf::int_type; static constexpr int_type eof = std::streambuf::traits_type::eof(); -GraphicsOperatorParser::GraphicsOperatorParser(std::istream &in) - : m_parser(in) {} +GraphicsOperatorParser::GraphicsOperatorParser(std::istream &in, + const Logger &logger) + : m_parser(in), m_logger{&logger} {} std::istream &GraphicsOperatorParser::in() { return m_parser.in(); } @@ -255,7 +257,8 @@ GraphicsOperator GraphicsOperatorParser::read_operator() { result.type = operator_name_to_type(operator_name); if (result.type == GraphicsOperatorType::unknown) { - std::cerr << "unknown operator: " << operator_name << '\n'; + ODR_DEBUG(*m_logger, + "pdf: unknown graphics operator '" + operator_name + "'"); } // `BI ID EI` is one inline image (8.9.7). The key/value diff --git a/src/odr/internal/pdf/pdf_graphics_operator_parser.hpp b/src/odr/internal/pdf/pdf_graphics_operator_parser.hpp index e8a905d5..200445d9 100644 --- a/src/odr/internal/pdf/pdf_graphics_operator_parser.hpp +++ b/src/odr/internal/pdf/pdf_graphics_operator_parser.hpp @@ -2,6 +2,8 @@ #include +#include + #include #include @@ -13,7 +15,8 @@ struct GraphicsOperator; class GraphicsOperatorParser { public: - explicit GraphicsOperatorParser(std::istream &); + explicit GraphicsOperatorParser(std::istream &, + const Logger &logger = Logger::null()); [[nodiscard]] std::istream &in(); [[nodiscard]] std::streambuf &sb(); @@ -36,6 +39,7 @@ class GraphicsOperatorParser { read_inline_image_data(const Dictionary &dictionary); ObjectParser m_parser; + const Logger *m_logger{nullptr}; }; } // namespace odr::internal::pdf diff --git a/src/odr/internal/pdf/pdf_page_extractor.cpp b/src/odr/internal/pdf/pdf_page_extractor.cpp index 8541b388..333157cc 100644 --- a/src/odr/internal/pdf/pdf_page_extractor.cpp +++ b/src/odr/internal/pdf/pdf_page_extractor.cpp @@ -1005,7 +1005,7 @@ void run_content(const std::string &content, const Resources &resources, ActiveForms &active, MarkedContentStack &marked, std::optional &pen) { std::istringstream ss(content); - GraphicsOperatorParser parser(ss); + GraphicsOperatorParser parser(ss, logger); // Route a shown string through the Type3 char-proc renderer or the normal // text path, by the font kind. From 78c68aacfead6ab77f10861b68bd10bd4f5250ad Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Sat, 4 Jul 2026 23:51:55 +0200 Subject: [PATCH 5/8] PDF: don't trust an embedded CMap's codespace when it uses usecmap An embedded Type0 /Encoding CMap that inherits a base via `usecmap` (e.g. `/Identity-H usecmap`) and then declares only a partial local codespacerange would have its partial codespace treated as authoritative by Font::codes(), so inherited 2-byte codes fell through code_width() to the 1-byte fallback and were mis-split. We don't resolve the inherited base CMap, so flag the `usecmap` operator at parse time and exclude an inherited codespace from has_codespace(). Callers then fall back to the ToUnicode codespace / fixed 2-byte width (the pre-CID-encoding behavior). Local cidchar/cidrange overrides are still applied; unmapped codes fall through to identity, correct for an Identity-H base. Co-Authored-By: Claude Opus 4.8 --- src/odr/internal/pdf/pdf_cmap.hpp | 25 ++++++++++++++++--- src/odr/internal/pdf/pdf_cmap_parser.cpp | 11 +++++++++ test/src/internal/pdf/pdf_cmap.cpp | 31 ++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 4 deletions(-) diff --git a/src/odr/internal/pdf/pdf_cmap.hpp b/src/odr/internal/pdf/pdf_cmap.hpp index 2dbc62bd..5131ce1b 100644 --- a/src/odr/internal/pdf/pdf_cmap.hpp +++ b/src/odr/internal/pdf/pdf_cmap.hpp @@ -34,11 +34,27 @@ class CMap { /// `ToUnicode` CMap); the caller then falls back to the `/Encoding`. [[nodiscard]] bool empty() const { return m_map.empty(); } - /// True when at least one codespace range was declared. When true, the code - /// widths this CMap implies are authoritative for splitting a code string - /// (see `code_width`); when false, callers fall back to a fixed width. + /// Records that the CMap stream referenced another CMap via `usecmap` (ISO + /// 32000-1 9.7.5.3). We do not resolve the inherited base, so whatever + /// codespace this stream declares locally is (potentially) incomplete and is + /// no longer treated as authoritative (see `has_codespace`). + void mark_inherits_external_cmap() { m_inherits_external_cmap = true; } + + /// True when this stream referenced an unresolved base CMap via `usecmap`. + [[nodiscard]] bool inherits_external_cmap() const { + return m_inherits_external_cmap; + } + + /// True when this CMap declares an authoritative codespace: at least one + /// range, and it does not inherit an unresolved base CMap via `usecmap`. When + /// true, the code widths this CMap implies are authoritative for splitting a + /// code string (see `code_width`); when false, callers fall back to another + /// CMap's codespace or a fixed width. An inherited (`usecmap`) codespace is + /// deliberately excluded — its local ranges may cover only an override + /// subset, so trusting them would mis-split the inherited (e.g. 2-byte) + /// codes. [[nodiscard]] bool has_codespace() const { - return !m_codespace_ranges.empty(); + return !m_codespace_ranges.empty() && !m_inherits_external_cmap; } /// Byte width of a code whose first byte is `first`, decided by the codespace @@ -81,6 +97,7 @@ class CMap { std::size_t width; // byte width of the codes (to disambiguate widths) }; + bool m_inherits_external_cmap{false}; std::vector m_codespace_ranges; std::unordered_map m_map; std::unordered_map m_cid_chars; diff --git a/src/odr/internal/pdf/pdf_cmap_parser.cpp b/src/odr/internal/pdf/pdf_cmap_parser.cpp index 24c01721..a34ee9e5 100644 --- a/src/odr/internal/pdf/pdf_cmap_parser.cpp +++ b/src/odr/internal/pdf/pdf_cmap_parser.cpp @@ -309,6 +309,17 @@ CMap CMapParser::parse_cmap() { read_cidchar(last_int, cmap); } else if (command == "begincidrange") { read_cidrange(last_int, cmap); + } else if (command == "usecmap") { + // `/BaseCMap usecmap`: this stream inherits another CMap's codespace + // and mappings (ISO 32000-1 9.7.5.3). We do not resolve the base, so + // any codespace declared locally is (potentially) an override-only + // subset; flag it so it is no longer treated as authoritative and + // callers fall back to the ToUnicode codespace / fixed width instead of + // mis-splitting the inherited (e.g. 2-byte) codes. + cmap.mark_inherits_external_cmap(); + ODR_WARNING(*m_logger, + "pdf: unresolved 'usecmap'; local codespace not treated as " + "authoritative"); } } } diff --git a/test/src/internal/pdf/pdf_cmap.cpp b/test/src/internal/pdf/pdf_cmap.cpp index b9a71406..137c6cec 100644 --- a/test/src/internal/pdf/pdf_cmap.cpp +++ b/test/src/internal/pdf/pdf_cmap.cpp @@ -4,6 +4,7 @@ #include #include +#include #include @@ -168,3 +169,33 @@ TEST(PdfCMap, mixed_code_widths) { EXPECT_EQ(cmap.translate_string(std::string("\x01\x81\x41", 3)), "AB"); } + +TEST(PdfCMap, codespace_is_authoritative_without_usecmap) { + CMap cmap = parse("1 begincodespacerange\n" + "<0000> \n" + "endcodespacerange\n"); + + EXPECT_FALSE(cmap.inherits_external_cmap()); + EXPECT_TRUE(cmap.has_codespace()); + EXPECT_EQ(cmap.code_width(0x00), 2); +} + +TEST(PdfCMap, usecmap_disables_local_codespace_authority) { + // A stream that inherits `Identity-H` (2-byte codespace) via `usecmap` and + // only declares a 1-byte override codespace must not be trusted to split + // codes, or the inherited 2-byte codes would be mis-split as single bytes. + CMap cmap = parse("/Identity-H usecmap\n" + "1 begincodespacerange\n" + "<20> <20>\n" + "endcodespacerange\n" + "1 begincidchar\n" + "<20> 1\n" + "endcidchar\n"); + + EXPECT_TRUE(cmap.inherits_external_cmap()); + // The partial local codespace is no longer authoritative. + EXPECT_FALSE(cmap.has_codespace()); + // The local CID override is still applied. + EXPECT_TRUE(cmap.has_cid_map()); + EXPECT_EQ(cmap.cid_for_code(std::string_view("\x20", 1)), 1u); +} From 495eb2b6226e4059713001503383f730fd40ff30 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Sat, 4 Jul 2026 23:52:30 +0200 Subject: [PATCH 6/8] update refs --- test/data/reference-output/odr-private | 2 +- test/data/reference-output/odr-public | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/data/reference-output/odr-private b/test/data/reference-output/odr-private index 3fc6efc6..429ca6ac 160000 --- a/test/data/reference-output/odr-private +++ b/test/data/reference-output/odr-private @@ -1 +1 @@ -Subproject commit 3fc6efc6a2ded86e4ea9baab5bac2c59086bdfcd +Subproject commit 429ca6ac35880a35c138e5bad4505e8b79be685d diff --git a/test/data/reference-output/odr-public b/test/data/reference-output/odr-public index 2f03cf25..51f96987 160000 --- a/test/data/reference-output/odr-public +++ b/test/data/reference-output/odr-public @@ -1 +1 @@ -Subproject commit 2f03cf258064e92ea38b8f0811d5ce97dc2faafe +Subproject commit 51f969870b99b58ff807471b60f01f7668b43578 From 2cbe09292f07ba071097bfe40a511a1b47a7cf01 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Sun, 5 Jul 2026 00:10:40 +0200 Subject: [PATCH 7/8] PDF: use doc-comment style for CidRange field comments Co-Authored-By: Claude Opus 4.8 --- src/odr/internal/pdf/pdf_cmap.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/odr/internal/pdf/pdf_cmap.hpp b/src/odr/internal/pdf/pdf_cmap.hpp index 5131ce1b..5117aad1 100644 --- a/src/odr/internal/pdf/pdf_cmap.hpp +++ b/src/odr/internal/pdf/pdf_cmap.hpp @@ -91,10 +91,10 @@ class CMap { }; struct CidRange { - std::uint32_t low; // numeric value of the low code - std::uint32_t high; // numeric value of the high code - std::uint32_t base_cid; // CID of the low code - std::size_t width; // byte width of the codes (to disambiguate widths) + std::uint32_t low; ///< numeric value of the low code + std::uint32_t high; ///< numeric value of the high code + std::uint32_t base_cid; ///< CID of the low code + std::size_t width; ///< byte width of the codes (to disambiguate widths) }; bool m_inherits_external_cmap{false}; From ff613ef7517c98f6980c60166c65386d2e5d9eff Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Sun, 5 Jul 2026 00:10:40 +0200 Subject: [PATCH 8/8] PDF: represent CodeRange::Iterator cursor as a shrinking string_view Replace the paired `const char *position, const char *end` cursor with a single `string_view` that shrinks as codes are consumed. Bundling the two pointers removes the mismatched-position/end footgun and lets `settle()` use `size()` directly instead of pointer subtraction. `begin()`/`end()` now pass the view (and an empty view at its tail) rather than reaching into `data()`, dropping both suspicious-stringview-data-usage suppressions. Co-Authored-By: Claude Opus 4.8 --- src/odr/internal/pdf/pdf_document_element.hpp | 47 +++++++++---------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/src/odr/internal/pdf/pdf_document_element.hpp b/src/odr/internal/pdf/pdf_document_element.hpp index 5a04d029..3bc97ca8 100644 --- a/src/odr/internal/pdf/pdf_document_element.hpp +++ b/src/odr/internal/pdf/pdf_document_element.hpp @@ -265,10 +265,10 @@ class CodeRange { using reference = std::uint32_t; Iterator() = default; - Iterator(const char *position, const char *end, const std::size_t width, + Iterator(const std::string_view range, const std::size_t width, const CMap *codespace, const CMap *cid_map) - : m_position{position}, m_end{end}, m_fixed_width{width}, - m_codespace{codespace}, m_cid_map{cid_map} { + : m_range{range}, m_fixed_width{width}, m_codespace{codespace}, + m_cid_map{cid_map} { settle(); } @@ -278,20 +278,20 @@ class CodeRange { // is the code itself (`Identity-H/V`), as it is for simple fonts. if (m_cid_map != nullptr) { if (const std::optional cid = - m_cid_map->cid_for_code(std::string_view(m_position, m_width)); + m_cid_map->cid_for_code(m_range.substr(0, m_width)); cid.has_value()) { return *cid; } } std::uint32_t code = 0; for (std::size_t k = 0; k < m_width; ++k) { - code = (code << 8) | static_cast(m_position[k]); + code = (code << 8) | static_cast(m_range[k]); } return code; } Iterator &operator++() { - m_position += m_width; + m_range.remove_prefix(m_width); settle(); return *this; } @@ -302,34 +302,32 @@ class CodeRange { } bool operator==(const Iterator &other) const { - return m_position == other.m_position; + return m_range.data() == other.m_range.data(); } private: - // Fix the width of the code at `m_position`, dropping a trailing partial - // code (fewer bytes than its declared width) by clamping to `m_end` so the - // iterator lands exactly on the end sentinel. + /// Fix the width of the code at the front of `m_range`, dropping a trailing + /// partial code (fewer bytes than its declared width) by consuming the rest + /// of the range so the iterator lands exactly on the end sentinel. void settle() { - if (m_position >= m_end) { - m_position = m_end; + if (m_range.empty()) { m_width = 0; return; } - const auto remaining = static_cast(m_end - m_position); std::size_t width = m_fixed_width; if (m_codespace != nullptr && m_codespace->has_codespace()) { - width = m_codespace->code_width(static_cast(*m_position)); + width = + m_codespace->code_width(static_cast(m_range.front())); } - if (width == 0 || width > remaining) { - m_position = m_end; // drop the trailing partial code + if (width == 0 || width > m_range.size()) { + m_range.remove_prefix(m_range.size()); // drop the trailing partial code m_width = 0; return; } m_width = width; } - const char *m_position{nullptr}; - const char *m_end{nullptr}; + std::string_view m_range; std::size_t m_fixed_width{1}; const CMap *m_codespace{nullptr}; const CMap *m_cid_map{nullptr}; @@ -341,17 +339,14 @@ class CodeRange { : m_codes{codes}, m_width{width}, m_codespace{codespace}, m_cid_map{cid_map} {} - // `data()` is used as a bounded byte range delimited by `end()`, not as a - // null-terminated string; the suspicious-data-usage check does not apply. [[nodiscard]] Iterator begin() const { - return {m_codes.data(), // NOLINT(bugprone-suspicious-stringview-data-usage) - m_codes.data() + m_codes.size(), m_width, m_codespace, m_cid_map}; + return {m_codes, m_width, m_codespace, m_cid_map}; } [[nodiscard]] Iterator end() const { - const char *stop = - m_codes.data() + // NOLINT(bugprone-suspicious-stringview-data-usage) - m_codes.size(); - return {stop, stop, m_width, m_codespace, m_cid_map}; + // An empty view positioned at the end of the codes: the exhausted `begin()` + // iterator consumes down to this same one-past-the-end pointer, so the two + // compare equal (`operator==` matches on `data()`). + return {m_codes.substr(m_codes.size()), m_width, m_codespace, m_cid_map}; } private: