From ff87b61e5819eeaa4a29a774c1d3d39a83f56adf Mon Sep 17 00:00:00 2001
From: Andreas Stefl <stefl.andreas@gmail.com>
Date: Sat, 4 Jul 2026 23:07:38 +0200
Subject: [PATCH 1/8] PDF: decode reverse-solidus escapes in string literals

The literal-string parser appended the raw character after a backslash for
non-octal escapes, so `\n \r \t \b \f` yielded the letters 'n' 'r' 't' 'b'
'f' instead of their control bytes (ISO 32000-1 7.3.4.2, Table 3). Any glyph
whose byte code had one of those low bytes (common in 2-byte CFF CID codes)
decoded to the wrong code, producing .notdef boxes and garbled selectable
text. Translate the full escape set and handle `\`-before-EOL line
continuation. Covered by new PdfObjectParser string tests.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/odr/internal/pdf/AGENTS.md              | 13 +++---
 src/odr/internal/pdf/pdf_object_parser.cpp  | 33 +++++++++++++-
 test/src/internal/pdf/pdf_object_parser.cpp | 48 +++++++++++++++++++++
 3 files changed, 88 insertions(+), 6 deletions(-)

diff --git a/src/odr/internal/pdf/AGENTS.md b/src/odr/internal/pdf/AGENTS.md
index 3d80eeaa..bb3e3c2c 100644
--- a/src/odr/internal/pdf/AGENTS.md
+++ b/src/odr/internal/pdf/AGENTS.md
@@ -31,8 +31,10 @@ inline SVG per page. Experimental and not production-quality.
   (`DecoderEngine::odr`); `is_decodable()` returns `false` and `file_meta()`
   carries only the file type. All parsing is lazy, on HTML request.
 - **Object syntax**: null, booleans, integers/reals, names (incl. `#xx`
-  escapes), literal strings (`\` and `\ooo` escapes), hex strings, arrays,
-  dictionaries, indirect references (`n g R`) — standalone and nested.
+  escapes), literal strings (the `\n \r \t \b \f` control escapes, `\ddd`
+  octal, escaped delimiters, and `\`-before-EOL line continuation — Table 3),
+  hex strings, arrays, dictionaries, indirect references (`n g R`) —
+  standalone and nested.
 - **File structure**: header, `n g obj … endobj`, `stream` payloads (via
   `/Length`, with a scan-to-`endstream` fallback), classic `xref` tables,
   `trailer`, `startxref`, `%%EOF`; both sequential reading (`read_entry`) and
@@ -565,9 +567,10 @@ the tables land.
   there is no `/ToUnicode` (with an unreachable glyph staying unmapped) and a
   `/ToUnicode` CMap taking precedence over the reverse map.
 
-No assertion-based coverage of the tokenizer (escapes, references, hex strings)
-or the HTML output itself (the span emission / CSS transform mapping, incl. the
-dual-layer glyph/Unicode emission).
+The tokenizer's string parsing is covered (`PdfObjectParser`: literal-string
+control/octal/delimiter/line-continuation escapes and hex strings); references
+and the HTML output itself (the span emission / CSS transform mapping, incl. the
+dual-layer glyph/Unicode emission) are not yet asserted.
 
 ---
 
diff --git a/src/odr/internal/pdf/pdf_object_parser.cpp b/src/odr/internal/pdf/pdf_object_parser.cpp
index 357cf5ea..924ae24b 100644
--- a/src/odr/internal/pdf/pdf_object_parser.cpp
+++ b/src/odr/internal/pdf/pdf_object_parser.cpp
@@ -368,7 +368,38 @@ std::variant<StandardString, HexString> ObjectParser::read_string() {
           string += three_octet_to_char(octet[0], octet[1], octet[2]);
         } else {
           bumpc();
-          string += c;
+          // Reverse-solidus escapes (ISO 32000-1 7.3.4.2, Table 3): the control
+          // escapes translate to their byte value, a backslash before an
+          // end-of-line marker is a line continuation (both elided), and any
+          // other escaped character (including `(`, `)`, `\`) stands for
+          // itself.
+          switch (c) {
+          case 'n':
+            string += '\n';
+            break;
+          case 'r':
+            string += '\r';
+            break;
+          case 't':
+            string += '\t';
+            break;
+          case 'b':
+            string += '\b';
+            break;
+          case 'f':
+            string += '\f';
+            break;
+          case '\n':
+            break;
+          case '\r':
+            if (geti() == '\n') {
+              bumpc();
+            }
+            break;
+          default:
+            string += c;
+            break;
+          }
         }
         continue;
       }
diff --git a/test/src/internal/pdf/pdf_object_parser.cpp b/test/src/internal/pdf/pdf_object_parser.cpp
index 8d56fc85..25b7f5e7 100644
--- a/test/src/internal/pdf/pdf_object_parser.cpp
+++ b/test/src/internal/pdf/pdf_object_parser.cpp
@@ -16,6 +16,12 @@ std::string read_hex_string(const std::string &input) {
   return std::get<HexString>(parser.read_string()).string;
 }
 
+std::string read_standard_string(const std::string &input) {
+  std::istringstream in(input);
+  ObjectParser parser(in);
+  return std::get<StandardString>(parser.read_string()).string;
+}
+
 Real read_number(const std::string &input) {
   std::istringstream in(input);
   ObjectParser parser(in);
@@ -128,6 +134,48 @@ TEST(PdfObjectParser, skip_past) {
             Result(true, "X"));
 }
 
+// 7.3.4.2: a literal string is the bytes between balanced parentheses.
+TEST(PdfObjectParser, standard_string_basic) {
+  EXPECT_EQ(read_standard_string("(Hello)"), "Hello");
+  EXPECT_EQ(read_standard_string("()"), "");
+  EXPECT_EQ(read_standard_string("(a b\tc)"), "a b\tc");
+}
+
+// 7.3.4.2, Table 3: the control escapes translate to their byte value. A CFF
+// CID font's code string carries e.g. `\b` for the byte 0x08, so failing to
+// translate it corrupts the code (the encrypted_fontfile3_opentype regression).
+TEST(PdfObjectParser, standard_string_control_escapes) {
+  EXPECT_EQ(read_standard_string("(a\\nb)"), "a\nb");
+  EXPECT_EQ(read_standard_string("(a\\rb)"), "a\rb");
+  EXPECT_EQ(read_standard_string("(a\\tb)"), "a\tb");
+  EXPECT_EQ(read_standard_string("(a\\bb)"), "a\bb");
+  EXPECT_EQ(read_standard_string("(a\\fb)"), "a\fb");
+  // a two-byte code <00 08> written with the byte 0x08 escaped as `\b`
+  EXPECT_EQ(read_standard_string(std::string("(\000\\b)", 5)),
+            std::string("\000\b", 2));
+}
+
+// 7.3.4.2: an escaped `(`, `)` or `\` stands for itself, and any other escaped
+// character keeps the character while dropping the backslash.
+TEST(PdfObjectParser, standard_string_literal_escapes) {
+  EXPECT_EQ(read_standard_string("(a\\(b\\)c)"), "a(b)c");
+  EXPECT_EQ(read_standard_string("(a\\\\b)"), "a\\b");
+  EXPECT_EQ(read_standard_string("(a\\qb)"), "aqb");
+}
+
+// 7.3.4.2: a `\ddd` octal escape (1-3 digits) is the byte of that value.
+TEST(PdfObjectParser, standard_string_octal_escape) {
+  EXPECT_EQ(read_standard_string("(\\101)"), "A");
+  EXPECT_EQ(read_standard_string("(\\000)"), std::string("\000", 1));
+}
+
+// 7.3.4.2: a backslash before an end-of-line marker is a line continuation;
+// both the backslash and the marker are dropped.
+TEST(PdfObjectParser, standard_string_line_continuation) {
+  EXPECT_EQ(read_standard_string("(a\\\nb)"), "ab");
+  EXPECT_EQ(read_standard_string("(a\\\r\nb)"), "ab");
+}
+
 // 7.3.4.3: a hex string is the bytes of its hex digits, with whitespace ignored
 // and an odd final digit assumed to be followed by a 0.
 TEST(PdfObjectParser, hex_string_basic) {

From 1ac93055b16c0b67c467fac73bcf07c94fee87cb Mon Sep 17 00:00:00 2001
From: Andreas Stefl <stefl.andreas@gmail.com>
Date: Sat, 4 Jul 2026 23:07:58 +0200
Subject: [PATCH 2/8] PDF: parse composite /Encoding CMap for CID mapping

An embedded Type0 /Encoding CMap stream is now parsed (cidchar/cidrange ->
Font::cid_encoding) so Font::codes() maps char codes to CIDs through it and
uses the CMap's authoritative codespace. This fixes producers that mix a
1-byte code (e.g. a space) among 2-byte CIDs: without the CMap the space fell
through identity to the wrong glyph and advance. CMap gains cid_for_code /
add_cid_range / has_cid_map; the parser gains read_cidchar / read_cidrange.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/odr/internal/pdf/AGENTS.md                |   6 +-
 src/odr/internal/pdf/pdf_cmap.cpp             |  38 +++++-
 src/odr/internal/pdf/pdf_cmap.hpp             |  51 ++++++++
 src/odr/internal/pdf/pdf_cmap_parser.cpp      |  56 +++++++++
 src/odr/internal/pdf/pdf_cmap_parser.hpp      |   2 +
 src/odr/internal/pdf/pdf_document_element.hpp | 119 ++++++++++++++----
 src/odr/internal/pdf/pdf_document_parser.cpp  |  19 ++-
 7 files changed, 262 insertions(+), 29 deletions(-)

diff --git a/src/odr/internal/pdf/AGENTS.md b/src/odr/internal/pdf/AGENTS.md
index bb3e3c2c..08e90aef 100644
--- a/src/odr/internal/pdf/AGENTS.md
+++ b/src/odr/internal/pdf/AGENTS.md
@@ -105,7 +105,11 @@ inline SVG per page. Experimental and not production-quality.
   (Type0) fonts** are recognized: the descendant CIDFont's
   `/CIDSystemInfo` `/Registry`/`/Ordering` is recorded on the `Font`, and the
   Type0 `/Encoding` (a code → CID CMap such as `Identity-H`) is kept out of the
-  simple-font encoding path. Extraction is driven by the `/ToUnicode` CMap (the
+  simple-font encoding path. An *embedded* `/Encoding` CMap stream is parsed
+  (`cidchar`/`cidrange` → `Font::cid_encoding`) so `Font::codes()` yields CIDs
+  through it — this also carries the authoritative codespace, so a producer that
+  mixes a 1-byte code (e.g. a space) among 2-byte CIDs stays aligned and selects
+  the right glyph/advance. Extraction is driven by the `/ToUnicode` CMap (the
   common case — every Type0 font in the corpus carries one). When a composite
   font has no `/ToUnicode`, a **predefined Unicode `/Encoding`** — the
   `Uni*-UCS2/UTF16/UTF32` CMaps — is decoded directly (`pdf_cid`), since those
diff --git a/src/odr/internal/pdf/pdf_cmap.cpp b/src/odr/internal/pdf/pdf_cmap.cpp
index af03e080..aa14c682 100644
--- a/src/odr/internal/pdf/pdf_cmap.cpp
+++ b/src/odr/internal/pdf/pdf_cmap.cpp
@@ -15,9 +15,36 @@ void CMap::map_single(std::string code, std::u16string unicode) {
   m_map[std::move(code)] = std::move(unicode);
 }
 
-std::size_t CMap::code_length(const std::string &codes,
-                              const std::size_t pos) const {
-  const auto first = static_cast<std::uint8_t>(codes[pos]);
+void CMap::map_cid_char(std::string code, const std::uint32_t cid) {
+  m_cid_chars[std::move(code)] = cid;
+}
+
+void CMap::add_cid_range(const std::uint32_t low, const std::uint32_t high,
+                         const std::uint32_t base_cid,
+                         const std::size_t width) {
+  m_cid_ranges.push_back({low, high, base_cid, width});
+}
+
+std::optional<std::uint32_t>
+CMap::cid_for_code(const std::string_view code) const {
+  if (const auto it = m_cid_chars.find(std::string(code));
+      it != m_cid_chars.end()) {
+    return it->second;
+  }
+  std::uint32_t value = 0;
+  for (const char c : code) {
+    value = (value << 8) | static_cast<std::uint8_t>(c);
+  }
+  for (const CidRange &range : m_cid_ranges) {
+    if (range.width == code.size() && value >= range.low &&
+        value <= range.high) {
+      return range.base_cid + (value - range.low);
+    }
+  }
+  return std::nullopt;
+}
+
+std::size_t CMap::code_width(const std::uint8_t first) const {
   for (const CodespaceRange &range : m_codespace_ranges) {
     if (first >= static_cast<std::uint8_t>(range.low.front()) &&
         first <= static_cast<std::uint8_t>(range.high.front())) {
@@ -30,6 +57,11 @@ std::size_t CMap::code_length(const std::string &codes,
   return 1;
 }
 
+std::size_t CMap::code_length(const std::string &codes,
+                              const std::size_t pos) const {
+  return code_width(static_cast<std::uint8_t>(codes[pos]));
+}
+
 std::string CMap::translate_string(const std::string &codes) const {
   std::u16string result;
 
diff --git a/src/odr/internal/pdf/pdf_cmap.hpp b/src/odr/internal/pdf/pdf_cmap.hpp
index 65ee351d..2dbc62bd 100644
--- a/src/odr/internal/pdf/pdf_cmap.hpp
+++ b/src/odr/internal/pdf/pdf_cmap.hpp
@@ -1,7 +1,10 @@
 #pragma once
 
 #include <cstddef>
+#include <cstdint>
+#include <optional>
 #include <string>
+#include <string_view>
 #include <unordered_map>
 #include <vector>
 
@@ -18,12 +21,51 @@ class CMap {
   void add_codespace_range(std::string low_code, std::string high_code);
   void map_single(std::string code, std::u16string unicode);
 
+  /// CID mapping (a composite font's `/Encoding` CMap stream, ISO 32000-1
+  /// 9.7.5.3): `cidchar` maps a single code, `cidrange` maps a contiguous block
+  /// (`base_cid + (code - low)`). Codes are keyed by their raw big-endian bytes
+  /// / width so a 1-byte and a 2-byte code with the same numeric value stay
+  /// distinct (`<20>` -> CID 229 vs `<0020>` -> CID 32), the mixed-width case.
+  void map_cid_char(std::string code, std::uint32_t cid);
+  void add_cid_range(std::uint32_t low, std::uint32_t high,
+                     std::uint32_t base_cid, std::size_t width);
+
   /// True when no code -> Unicode mapping was parsed (e.g. the font carries no
   /// `ToUnicode` CMap); the caller then falls back to the `/Encoding`.
   [[nodiscard]] bool empty() const { return m_map.empty(); }
 
+  /// True when at least one codespace range was declared. When true, the code
+  /// widths this CMap implies are authoritative for splitting a code string
+  /// (see `code_width`); when false, callers fall back to a fixed width.
+  [[nodiscard]] bool has_codespace() const {
+    return !m_codespace_ranges.empty();
+  }
+
+  /// Byte width of a code whose first byte is `first`, decided by the codespace
+  /// ranges (matched on the first byte, ISO 32000-1 9.7.6.2). Falls back to a
+  /// single byte when no range declares/matches it. This is the variable-width
+  /// split that `translate_string` uses; exposing it lets the glyph/advance
+  /// paths split codes identically, so a mixed 1-/2-byte codespace (e.g. a
+  /// 1-byte space among 2-byte CIDs) stays aligned across both.
+  [[nodiscard]] std::size_t code_width(std::uint8_t first) const;
+
   [[nodiscard]] std::string translate_string(const std::string &codes) const;
 
+  /// True when at least one `cidchar`/`cidrange` mapping was parsed (an
+  /// embedded CID `/Encoding` CMap). When false the composite code -> CID is
+  /// identity
+  /// (`Identity-H/V`).
+  [[nodiscard]] bool has_cid_map() const {
+    return !m_cid_chars.empty() || !m_cid_ranges.empty();
+  }
+
+  /// The CID a code (raw big-endian bytes) selects, or `nullopt` when no
+  /// `cidchar`/`cidrange` covers it; the caller then falls back to identity
+  /// (CID = code). The width of `code` is matched, so a 1-byte code never hits
+  /// a 2-byte range.
+  [[nodiscard]] std::optional<std::uint32_t>
+  cid_for_code(std::string_view code) const;
+
 private:
   struct CodespaceRange {
     // `low` and `high` share the same length; that length is the code width in
@@ -32,8 +74,17 @@ class CMap {
     std::string high;
   };
 
+  struct CidRange {
+    std::uint32_t low;      // numeric value of the low code
+    std::uint32_t high;     // numeric value of the high code
+    std::uint32_t base_cid; // CID of the low code
+    std::size_t width;      // byte width of the codes (to disambiguate widths)
+  };
+
   std::vector<CodespaceRange> m_codespace_ranges;
   std::unordered_map<std::string, std::u16string> m_map;
+  std::unordered_map<std::string, std::uint32_t> m_cid_chars;
+  std::vector<CidRange> m_cid_ranges;
 
   /// Byte width of the code starting at `pos`, decided by the codespace ranges;
   /// falls back to a single byte when no range declares/matches it.
diff --git a/src/odr/internal/pdf/pdf_cmap_parser.cpp b/src/odr/internal/pdf/pdf_cmap_parser.cpp
index 008d6768..24c01721 100644
--- a/src/odr/internal/pdf/pdf_cmap_parser.cpp
+++ b/src/odr/internal/pdf/pdf_cmap_parser.cpp
@@ -228,6 +228,58 @@ void CMapParser::read_bfrange(const std::uint32_t n, CMap &cmap) {
   }
 }
 
+void CMapParser::read_cidchar(const std::uint32_t n, CMap &cmap) {
+  m_parser.skip_whitespace();
+  for (std::uint32_t i = 0; i < n; ++i) {
+    std::string code = m_parser.read_object().as_string();
+    m_parser.skip_whitespace();
+    const Object cid = m_parser.read_object();
+    m_parser.skip_whitespace();
+
+    if (!valid_code_width(code.size()) || !cid.is_integer()) {
+      ODR_WARNING(*m_logger, "pdf: skipping malformed cidchar entry (code "
+                                 << code.size() << " bytes)");
+      continue; // skip a malformed mapping, keep the rest of the CMap
+    }
+    cmap.map_cid_char(std::move(code),
+                      static_cast<std::uint32_t>(cid.as_integer()));
+  }
+}
+
+void CMapParser::read_cidrange(const std::uint32_t n, CMap &cmap) {
+  m_parser.skip_whitespace();
+  for (std::uint32_t i = 0; i < n; ++i) {
+    std::string low = m_parser.read_object().as_string();
+    m_parser.skip_whitespace();
+    std::string high = m_parser.read_object().as_string();
+    m_parser.skip_whitespace();
+    const Object cid = m_parser.read_object();
+    m_parser.skip_whitespace();
+
+    if (low.size() != high.size() || !valid_code_width(low.size()) ||
+        !cid.is_integer()) {
+      ODR_WARNING(*m_logger, "pdf: skipping out-of-spec cidrange (low "
+                                 << low.size() << " bytes, high " << high.size()
+                                 << " bytes)");
+      continue; // skip an out-of-spec range, keep parsing the rest
+    }
+    const std::uint32_t low_code = code_to_uint(low);
+    const std::uint32_t high_code = code_to_uint(high);
+    if (high_code < low_code) {
+      ODR_WARNING(*m_logger, "pdf: skipping reversed cidrange (low 0x"
+                                 << std::hex << low_code << ", high 0x"
+                                 << high_code << ")");
+      continue; // a reversed range would map codes to nonsense CIDs
+    }
+    // Unlike a `bfrange`, a `cidrange` commonly spans the whole 2-byte code
+    // space (the identity block), so it is stored as a range rather than
+    // materialized code-by-code; no per-range span cap is needed.
+    cmap.add_cid_range(low_code, high_code,
+                       static_cast<std::uint32_t>(cid.as_integer()),
+                       low.size());
+  }
+}
+
 CMap CMapParser::parse_cmap() {
   CMap cmap;
 
@@ -253,6 +305,10 @@ CMap CMapParser::parse_cmap() {
         read_bfchar(last_int, cmap);
       } else if (command == "beginbfrange") {
         read_bfrange(last_int, cmap);
+      } else if (command == "begincidchar") {
+        read_cidchar(last_int, cmap);
+      } else if (command == "begincidrange") {
+        read_cidrange(last_int, cmap);
       }
     }
   }
diff --git a/src/odr/internal/pdf/pdf_cmap_parser.hpp b/src/odr/internal/pdf/pdf_cmap_parser.hpp
index b19f47bd..aa9e5289 100644
--- a/src/odr/internal/pdf/pdf_cmap_parser.hpp
+++ b/src/odr/internal/pdf/pdf_cmap_parser.hpp
@@ -32,6 +32,8 @@ class CMapParser {
   void read_codespacerange(std::uint32_t n, CMap &cmap);
   void read_bfchar(std::uint32_t n, CMap &cmap);
   void read_bfrange(std::uint32_t n, CMap &cmap);
+  void read_cidchar(std::uint32_t n, CMap &cmap);
+  void read_cidrange(std::uint32_t n, CMap &cmap);
 };
 
 } // namespace odr::internal::pdf
diff --git a/src/odr/internal/pdf/pdf_document_element.hpp b/src/odr/internal/pdf/pdf_document_element.hpp
index a2d95182..5a04d029 100644
--- a/src/odr/internal/pdf/pdf_document_element.hpp
+++ b/src/odr/internal/pdf/pdf_document_element.hpp
@@ -243,11 +243,17 @@ struct Pattern final : Element {
 };
 
 /// A non-owning view over a string of PDF character codes, splitting it into
-/// fixed-width (`Font::code_byte_width()`) big-endian codes on iteration. Holds
-/// only a `string_view`, so it must not outlive the underlying bytes; iterate
-/// it directly (`for (std::uint32_t code : font.codes(...))`). Trailing bytes
-/// shorter than a full code are dropped, matching the PDF text-showing
-/// operators.
+/// big-endian codes on iteration. Holds only a `string_view`, so it must not
+/// outlive the underlying bytes; iterate it directly
+/// (`for (std::uint32_t code : font.codes(...))`). Trailing bytes shorter than
+/// a full code are dropped, matching the PDF text-showing operators.
+///
+/// Code width is variable when a `codespace` CMap is supplied (its codespace
+/// ranges pick each code's width from its first byte, ISO 32000-1 9.7.6.2) — so
+/// a mixed 1-/2-byte encoding (a 1-byte space among 2-byte CIDs, as PDFClown
+/// and other producers emit) stays aligned. Without a codespace (or when the
+/// CMap declares none) it falls back to a fixed `Font::code_byte_width()`,
+/// splitting the same way the fixed-width iterator historically did.
 class CodeRange {
 public:
   class Iterator {
@@ -259,10 +265,24 @@ class CodeRange {
     using reference = std::uint32_t;
 
     Iterator() = default;
-    Iterator(const char *position, const std::size_t width)
-        : m_position{position}, m_width{width} {}
+    Iterator(const char *position, const char *end, const std::size_t width,
+             const CMap *codespace, const CMap *cid_map)
+        : m_position{position}, m_end{end}, m_fixed_width{width},
+          m_codespace{codespace}, m_cid_map{cid_map} {
+      settle();
+    }
 
     std::uint32_t operator*() const {
+      // A composite font's embedded CID `/Encoding` CMap maps the code to its
+      // CID (which is what the glyph/advance paths expect); without one the CID
+      // is the code itself (`Identity-H/V`), as it is for simple fonts.
+      if (m_cid_map != nullptr) {
+        if (const std::optional<std::uint32_t> cid =
+                m_cid_map->cid_for_code(std::string_view(m_position, m_width));
+            cid.has_value()) {
+          return *cid;
+        }
+      }
       std::uint32_t code = 0;
       for (std::size_t k = 0; k < m_width; ++k) {
         code = (code << 8) | static_cast<unsigned char>(m_position[k]);
@@ -272,6 +292,7 @@ class CodeRange {
 
     Iterator &operator++() {
       m_position += m_width;
+      settle();
       return *this;
     }
     Iterator operator++(int) {
@@ -285,32 +306,59 @@ class CodeRange {
     }
 
   private:
+    // Fix the width of the code at `m_position`, dropping a trailing partial
+    // code (fewer bytes than its declared width) by clamping to `m_end` so the
+    // iterator lands exactly on the end sentinel.
+    void settle() {
+      if (m_position >= m_end) {
+        m_position = m_end;
+        m_width = 0;
+        return;
+      }
+      const auto remaining = static_cast<std::size_t>(m_end - m_position);
+      std::size_t width = m_fixed_width;
+      if (m_codespace != nullptr && m_codespace->has_codespace()) {
+        width = m_codespace->code_width(static_cast<std::uint8_t>(*m_position));
+      }
+      if (width == 0 || width > remaining) {
+        m_position = m_end; // drop the trailing partial code
+        m_width = 0;
+        return;
+      }
+      m_width = width;
+    }
+
     const char *m_position{nullptr};
-    std::size_t m_width{1};
+    const char *m_end{nullptr};
+    std::size_t m_fixed_width{1};
+    const CMap *m_codespace{nullptr};
+    const CMap *m_cid_map{nullptr};
+    std::size_t m_width{0};
   };
 
-  CodeRange(const std::string_view codes, const std::size_t width)
-      : m_codes{codes}, m_width{width} {}
+  CodeRange(const std::string_view codes, const std::size_t width,
+            const CMap *codespace, const CMap *cid_map)
+      : m_codes{codes}, m_width{width}, m_codespace{codespace},
+        m_cid_map{cid_map} {}
 
   // `data()` is used as a bounded byte range delimited by `end()`, not as a
   // null-terminated string; the suspicious-data-usage check does not apply.
   [[nodiscard]] Iterator begin() const {
-    return {m_codes.data(),
-            m_width}; // NOLINT(bugprone-suspicious-stringview-data-usage)
+    return {m_codes.data(), // NOLINT(bugprone-suspicious-stringview-data-usage)
+            m_codes.data() + m_codes.size(), m_width, m_codespace, m_cid_map};
   }
   [[nodiscard]] Iterator end() const {
-    // round down to a whole number of codes
-    return {
-        m_codes.data() +
-            (m_codes.size() -
-             m_codes.size() %
-                 m_width), // NOLINT(bugprone-suspicious-stringview-data-usage)
-        m_width};
+    const char *stop =
+        m_codes.data() + // NOLINT(bugprone-suspicious-stringview-data-usage)
+        m_codes.size();
+    return {stop, stop, m_width, m_codespace, m_cid_map};
   }
 
 private:
   std::string_view m_codes;
   std::size_t m_width{};
+  const CMap *m_codespace{nullptr};
+  const CMap *m_cid_map{nullptr};
 };
 
 struct Font final : Element {
@@ -336,6 +384,14 @@ struct Font final : Element {
   /// `Identity-H`, `UniGB-UCS2-H`); empty for an embedded CMap stream. Drives
   /// the predefined Unicode-CMap extraction path.
   std::string cid_encoding_name;
+  /// The composite font's `/Encoding` when it is an *embedded* CMap stream
+  /// (code -> CID plus codespace, ISO 32000-1 9.7.5.3); empty for a predefined
+  /// name (above) or a simple font. When it declares a CID map, `codes()`
+  /// yields CIDs through it — so a producer that mixes a 1-byte code (e.g. a
+  /// space) among 2-byte CIDs selects the right glyph and advance; when it
+  /// declares a codespace, that codespace (authoritative over `/ToUnicode`'s)
+  /// splits the code string.
+  CMap cid_encoding;
 
   /// Simple-font glyph metrics (ISO 32000-1 9.2.4): `/Widths`, in glyph space
   /// (1/1000 text-space units), indexed by `code - first_char`; codes outside
@@ -362,10 +418,29 @@ struct Font final : Element {
   /// `Identity-H/V` and common CID case), 1 for simple fonts.
   [[nodiscard]] int code_byte_width() const { return composite ? 2 : 1; }
 
-  /// View `codes` as a sequence of character codes split per
-  /// `code_byte_width()`. The result borrows `codes`; do not outlive it.
+  /// View `codes` as a sequence of character codes, each yielded as the *CID*
+  /// it selects (identity when there is no CID map — the common case, and what
+  /// the glyph/advance paths expect). The result borrows `codes`; do not
+  /// outlive it.
+  ///
+  /// A composite font splits (and maps) by its embedded CID `/Encoding` CMap
+  /// when present — that CMap declares both the codespace and the code -> CID
+  /// map, so a producer mixing a 1-byte code among 2-byte CIDs (PDFClown et
+  /// al.) stays aligned and selects the right glyph. Failing that it splits by
+  /// the `ToUnicode` codespace (CIDs identity — the same split `to_unicode`
+  /// uses, keeping glyph/advance/Unicode consistent), and failing that by a
+  /// fixed 2 bytes (the common `Identity-H` case). Simple fonts split by a
+  /// fixed 1 byte with identity CIDs.
   [[nodiscard]] CodeRange codes(std::string_view codes) const {
-    return {codes, static_cast<std::size_t>(code_byte_width())};
+    const auto width = static_cast<std::size_t>(code_byte_width());
+    if (!composite) {
+      return {codes, width, nullptr, nullptr};
+    }
+    const CMap *cid_map = cid_encoding.has_cid_map() ? &cid_encoding : nullptr;
+    const CMap *codespace = cid_encoding.has_codespace() ? &cid_encoding
+                            : cmap.has_codespace()       ? &cmap
+                                                         : nullptr;
+    return {codes, width, codespace, cid_map};
   }
 
   /// Embedded font: the SFNT parsed from `/FontFile2` (a simple TrueType font,
diff --git a/src/odr/internal/pdf/pdf_document_parser.cpp b/src/odr/internal/pdf/pdf_document_parser.cpp
index c5239ea8..f31fd64e 100644
--- a/src/odr/internal/pdf/pdf_document_parser.cpp
+++ b/src/odr/internal/pdf/pdf_document_parser.cpp
@@ -460,13 +460,26 @@ void parse_composite_font(DocumentParser &parser, const Dictionary &dictionary,
   font.composite = true;
 
   // The Type0 `/Encoding`: a predefined CMap name (`Identity-H`,
-  // `UniGB-UCS2-H`, …) or an embedded CMap stream. Record the name; the stream
-  // case is left empty (deferred). Drives the predefined Unicode-CMap path in
-  // `Font::to_unicode`.
+  // `UniGB-UCS2-H`, …) or an embedded CMap stream. A name drives the predefined
+  // Unicode-CMap path in `Font::to_unicode`; a stream is parsed into a
+  // code -> CID CMap (`Font::cid_encoding`) so `codes()` yields the right CIDs
+  // for a mixed-width encoding (ISO 32000-1 9.7.5.3).
   if (dictionary.has_key("Encoding")) {
     const Object encoding = parser.resolve_object_copy(dictionary["Encoding"]);
     if (encoding.is_name()) {
       font.cid_encoding_name = encoding.as_name();
+    } else if (dictionary["Encoding"].is_reference()) {
+      try {
+        const std::string stream =
+            parser.read_decoded_stream(dictionary["Encoding"].as_reference());
+        util::stream::ViewStream ss(stream);
+        CMapParser cmap_parser(ss, parser.logger());
+        font.cid_encoding = cmap_parser.parse_cmap();
+      } catch (const std::exception &e) {
+        ODR_WARNING(
+            parser.logger(),
+            "pdf: failed to read composite /Encoding CMap: " << e.what());
+      }
     }
   }
 

From 156258fde633a9c6ce8894881fcc4b18b2bf9860 Mon Sep 17 00:00:00 2001
From: Andreas Stefl <stefl.andreas@gmail.com>
Date: Sat, 4 Jul 2026 23:30:55 +0200
Subject: [PATCH 3/8] PDF: pin substitute-font baseline with @font-face metric
 overrides
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A non-embedded font renders in a local system font whose hhea/OS-2 metrics we
don't control, so under line-height:1 the browser placed the glyph baseline at
that font's ascent rather than the ascent_em we use to compute `top` — dropping
e.g. the 120pt Times title well below the page bottom. Route each substitute
through a generated @font-face ('odr-sN', src: local(...) of the family stack)
carrying ascent-override/descent-override (summing to 1em) and
line-gap-override:0, so the baseline is positioned from our metric. Faces are
deduped by (family, style, ascent); the family stack is kept as a fallback.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/odr/internal/html/pdf_file.cpp | 116 +++++++++++++++++++++++++++--
 src/odr/internal/pdf/AGENTS.md     |  10 +++
 2 files changed, 121 insertions(+), 5 deletions(-)

diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp
index 2622d01e..4089769c 100644
--- a/src/odr/internal/html/pdf_file.cpp
+++ b/src/odr/internal/html/pdf_file.cpp
@@ -142,6 +142,105 @@ std::string font_substitute_declaration(const pdf::FontSubstitute &substitute) {
   return declaration;
 }
 
+/// The `local(...)` sources of a CSS `font-family` stack, dropping the generic
+/// keywords an `@font-face src` cannot name. Returns e.g.
+/// "local('Times New Roman'),local(Times)" for "'Times New Roman',Times,serif",
+/// or "" when the stack names no concrete font (generic-only).
+std::string local_font_sources(const std::string_view css_family) {
+  static constexpr std::array<std::string_view, 6> generics = {
+      "serif", "sans-serif", "monospace", "cursive", "fantasy", "system-ui"};
+  std::string src;
+  std::size_t start = 0;
+  while (start <= css_family.size()) {
+    const std::size_t comma = css_family.find(',', start);
+    std::string_view name = css_family.substr(
+        start, comma == std::string_view::npos ? css_family.size() - start
+                                               : comma - start);
+    while (!name.empty() && name.front() == ' ') {
+      name.remove_prefix(1);
+    }
+    while (!name.empty() && name.back() == ' ') {
+      name.remove_suffix(1);
+    }
+    const bool generic =
+        std::find(generics.begin(), generics.end(), name) != generics.end();
+    if (!name.empty() && !generic) {
+      if (!src.empty()) {
+        src += ',';
+      }
+      src += "local(";
+      src += name;
+      src += ')';
+    }
+    if (comma == std::string_view::npos) {
+      break;
+    }
+    start = comma + 1;
+  }
+  return src;
+}
+
+/// Registers one `@font-face` per (substitute family, style, ascent) that
+/// overrides the face's ascent/descent so a glyph's baseline lands exactly at
+/// the `top` `add_position_classes` derives from `ascent_em` — independent of
+/// the metrics of whichever local font actually resolves. Without the override
+/// the browser positions the baseline using the resolved font's own ascent,
+/// which for a large non-embedded run (e.g. a 120pt Times title) drops it well
+/// below the intended baseline.
+class SubstituteFontFaces {
+public:
+  /// The `font-family:...` (plus weight/style) declaration for `substitute`,
+  /// routed through a generated metric-overriding face `'odr-sN'`. Falls back
+  /// to the plain family stack when the stack names no concrete font.
+  std::string declaration(const pdf::FontSubstitute &substitute,
+                          const double ascent_em) {
+    const std::string src = local_font_sources(substitute.css_family);
+    if (src.empty()) {
+      return font_substitute_declaration(substitute);
+    }
+    // ascent-override + descent-override sum to one em, so `line-height:1`
+    // leaves no leading and the baseline sits at exactly `ascent_em` of the em
+    // box. `ascent_em` is clamped to [0.5, 1.2]; the `max` keeps descent
+    // non-negative for the rare ascent > 1 (a slight baseline approximation).
+    const double ascent = ascent_em;
+    const double descent = std::max(0.0, 1.0 - ascent_em);
+    std::ostringstream key;
+    key << src << '|' << substitute.bold << '|' << substitute.italic << '|'
+        << std::llround(ascent * 1000.0);
+    const auto [it, inserted] = m_index_by_key.try_emplace(
+        std::move(key).str(), static_cast<int>(m_faces.size()) + 1);
+    if (inserted) {
+      std::ostringstream face;
+      face << "@font-face{font-family:'odr-s" << it->second << "';src:" << src
+           << ";ascent-override:" << round2(ascent * 100.0)
+           << "%;descent-override:" << round2(descent * 100.0)
+           << "%;line-gap-override:0%}";
+      m_faces.push_back(std::move(face).str());
+    }
+    std::string declaration = "font-family:'odr-s" +
+                              std::to_string(it->second) + "'," +
+                              substitute.css_family;
+    if (substitute.bold) {
+      declaration += ";font-weight:bold";
+    }
+    if (substitute.italic) {
+      declaration += ";font-style:italic";
+    }
+    return declaration;
+  }
+
+  /// Appends the collected `@font-face` rules to `out`.
+  void append_faces(std::string &out) const {
+    for (const std::string &face : m_faces) {
+      out += face;
+    }
+  }
+
+private:
+  std::map<std::string, int> m_index_by_key;
+  std::vector<std::string> m_faces;
+};
+
 /// Build an SVG `d` attribute from a path's subpaths, each point mapped through
 /// `to_box` (PDF user space -> the page box, y-down). Lines become `L`, cubic
 /// Béziers `C`, and an explicitly closed subpath ends with `Z`.
@@ -926,6 +1025,7 @@ class HtmlServiceImpl final : public HtmlService {
     std::uint32_t family_count = 0;
     std::string font_faces;
     std::string font_styles; // per-font `.fvN` (visible) / `.fnN` (invisible)
+    SubstituteFontFaces substitute_faces; // metric-overriding `.odr-sN` faces
     std::vector<const pdf::Font *> accepted_fonts;
     // Which classes are used: [0]=fv (visible), [1]=fn (invisible).
     std::vector<std::array<bool, 2>> font_class_used;
@@ -1072,9 +1172,12 @@ class HtmlServiceImpl final : public HtmlService {
             run_classes += font_class(font_class_used, font, invisible);
           } else if (text.font != nullptr && text.font->substitute) {
             // Non-embedded font: render the real Unicode in the substitute
-            // family (embedded fonts carry the family in `font_class`).
-            add_class(run_classes, "ff",
-                      font_substitute_declaration(*text.font->substitute));
+            // family (embedded fonts carry the family in `font_class`). The
+            // metric-overriding face pins the baseline to `asc` (see
+            // `SubstituteFontFaces`).
+            add_class(
+                run_classes, "ff",
+                substitute_faces.declaration(*text.font->substitute, asc));
           }
           if (vis_margin_pt != 0) {
             add_class(run_classes, "ml", pt_decl("margin-left", vis_margin_pt));
@@ -1242,6 +1345,7 @@ class HtmlServiceImpl final : public HtmlService {
       write_font_face(*accepted_fonts[i], i, {}, font_class_used[i], font_faces,
                       font_styles);
     }
+    substitute_faces.append_faces(font_faces);
 
     // Write HTML.
     write_header_common(out, font_faces, font_styles, styles, [&] {
@@ -1448,7 +1552,8 @@ class HtmlServiceImpl final : public HtmlService {
 
     std::uint32_t family_count = 0;
     std::string font_faces;
-    std::string font_styles; // ".fvN{...}" / ".fnN{...}"
+    std::string font_styles;              // ".fvN{...}" / ".fnN{...}"
+    SubstituteFontFaces substitute_faces; // metric-overriding `.odr-sN` faces
     std::vector<pdf::Font *> accepted_fonts;
     // Per-font, per-uchar, per-glyph occurrence count (pre-pass).
     // Indexed by font_index - 1.
@@ -1661,7 +1766,7 @@ class HtmlServiceImpl final : public HtmlService {
         const std::string substitute_declaration =
             (font == 0 && !invisible && text.font != nullptr &&
              text.font->substitute)
-                ? font_substitute_declaration(*text.font->substitute)
+                ? substitute_faces.declaration(*text.font->substitute, asc)
                 : std::string();
         std::ostringstream fk;
         fk << font << '|' << invisible << '|' << font_size_pt << '|' << cs_pt
@@ -1742,6 +1847,7 @@ class HtmlServiceImpl final : public HtmlService {
       write_font_face(*accepted_fonts[i], i, used_unicode[i],
                       font_class_used[i], font_faces, font_styles);
     }
+    substitute_faces.append_faces(font_faces);
 
     // ---- Pass 2: write HTML ---------------------------------------------
     write_header_common(out, font_faces, font_styles, styles, [&] {
diff --git a/src/odr/internal/pdf/AGENTS.md b/src/odr/internal/pdf/AGENTS.md
index 08e90aef..57747121 100644
--- a/src/odr/internal/pdf/AGENTS.md
+++ b/src/odr/internal/pdf/AGENTS.md
@@ -236,6 +236,16 @@ inline SVG per page. Experimental and not production-quality.
     `OS/2`/`hhea` (`font/cff_transform.cpp` `serialize_os2`/`serialize_hhea`; the
     SFNT path passes the originals through), so our ascent can match the
     browser's.
+  - **Non-embedded substitutes** render in a *local* system font whose
+    `hhea`/`OS/2` we do **not** control, so the box-top→baseline distance would
+    be that font's ascent, not our `ascent_em` — dropping e.g. a 120pt Times
+    title well below its intended baseline. `SubstituteFontFaces` (in
+    `pdf_file.cpp`) closes this by routing each substitute through a generated
+    `@font-face` (`'odr-sN'`, `src: local(...)` of the family stack) carrying
+    `ascent-override:ascent_em`, `descent-override:1−ascent_em`,
+    `line-gap-override:0` — so the browser positions the baseline from *our*
+    metric. Faces are deduped by (family, style, ascent); the family stack is
+    kept after `'odr-sN'` as a fallback for the rare unresolved local.
   - **`ascent_em`** (in `pdf_file.cpp`): FontDescriptor `/Ascent`, else the
     embedded font's `bounding_box().y_max / units_per_em()`, else `0.8` em (which
     matches `serialize_os2`'s degenerate 0.8/0.2 fallback, so the fallback font

From 716995dea737e2eb12f729ab74d663133750fa91 Mon Sep 17 00:00:00 2001
From: Andreas Stefl <stefl.andreas@gmail.com>
Date: Sat, 4 Jul 2026 23:41:36 +0200
Subject: [PATCH 4/8] PDF: log unknown graphics operators via Logger instead of
 std::cerr

Thread a Logger into GraphicsOperatorParser (mirroring CMapParser) and
emit ODR_DEBUG for unrecognized operators rather than writing to
std::cerr.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/odr/internal/pdf/pdf_graphics_operator_parser.cpp | 11 +++++++----
 src/odr/internal/pdf/pdf_graphics_operator_parser.hpp |  6 +++++-
 src/odr/internal/pdf/pdf_page_extractor.cpp           |  2 +-
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/odr/internal/pdf/pdf_graphics_operator_parser.cpp b/src/odr/internal/pdf/pdf_graphics_operator_parser.cpp
index 65eab8aa..12be34af 100644
--- a/src/odr/internal/pdf/pdf_graphics_operator_parser.cpp
+++ b/src/odr/internal/pdf/pdf_graphics_operator_parser.cpp
@@ -3,8 +3,9 @@
 #include <odr/internal/pdf/pdf_graphics_operator.hpp>
 #include <odr/internal/util/map_util.hpp>
 
+#include <odr/logger.hpp>
+
 #include <cstdint>
-#include <iostream>
 #include <optional>
 #include <unordered_map>
 
@@ -193,8 +194,9 @@ using char_type = std::streambuf::char_type;
 using int_type = std::streambuf::int_type;
 static constexpr int_type eof = std::streambuf::traits_type::eof();
 
-GraphicsOperatorParser::GraphicsOperatorParser(std::istream &in)
-    : m_parser(in) {}
+GraphicsOperatorParser::GraphicsOperatorParser(std::istream &in,
+                                               const Logger &logger)
+    : m_parser(in), m_logger{&logger} {}
 
 std::istream &GraphicsOperatorParser::in() { return m_parser.in(); }
 
@@ -255,7 +257,8 @@ GraphicsOperator GraphicsOperatorParser::read_operator() {
 
   result.type = operator_name_to_type(operator_name);
   if (result.type == GraphicsOperatorType::unknown) {
-    std::cerr << "unknown operator: " << operator_name << '\n';
+    ODR_DEBUG(*m_logger,
+              "pdf: unknown graphics operator '" + operator_name + "'");
   }
 
   // `BI <key val …> ID <bytes> EI` is one inline image (8.9.7). The key/value
diff --git a/src/odr/internal/pdf/pdf_graphics_operator_parser.hpp b/src/odr/internal/pdf/pdf_graphics_operator_parser.hpp
index e8a905d5..200445d9 100644
--- a/src/odr/internal/pdf/pdf_graphics_operator_parser.hpp
+++ b/src/odr/internal/pdf/pdf_graphics_operator_parser.hpp
@@ -2,6 +2,8 @@
 
 #include <odr/internal/pdf/pdf_object_parser.hpp>
 
+#include <odr/logger.hpp>
+
 #include <string>
 #include <vector>
 
@@ -13,7 +15,8 @@ struct GraphicsOperator;
 
 class GraphicsOperatorParser {
 public:
-  explicit GraphicsOperatorParser(std::istream &);
+  explicit GraphicsOperatorParser(std::istream &,
+                                  const Logger &logger = Logger::null());
 
   [[nodiscard]] std::istream &in();
   [[nodiscard]] std::streambuf &sb();
@@ -36,6 +39,7 @@ class GraphicsOperatorParser {
   read_inline_image_data(const Dictionary &dictionary);
 
   ObjectParser m_parser;
+  const Logger *m_logger{nullptr};
 };
 
 } // namespace odr::internal::pdf
diff --git a/src/odr/internal/pdf/pdf_page_extractor.cpp b/src/odr/internal/pdf/pdf_page_extractor.cpp
index 8541b388..333157cc 100644
--- a/src/odr/internal/pdf/pdf_page_extractor.cpp
+++ b/src/odr/internal/pdf/pdf_page_extractor.cpp
@@ -1005,7 +1005,7 @@ void run_content(const std::string &content, const Resources &resources,
                  ActiveForms &active, MarkedContentStack &marked,
                  std::optional<Pen> &pen) {
   std::istringstream ss(content);
-  GraphicsOperatorParser parser(ss);
+  GraphicsOperatorParser parser(ss, logger);
 
   // Route a shown string through the Type3 char-proc renderer or the normal
   // text path, by the font kind.

From 78c68aacfead6ab77f10861b68bd10bd4f5250ad Mon Sep 17 00:00:00 2001
From: Andreas Stefl <stefl.andreas@gmail.com>
Date: Sat, 4 Jul 2026 23:51:55 +0200
Subject: [PATCH 5/8] PDF: don't trust an embedded CMap's codespace when it
 uses usecmap

An embedded Type0 /Encoding CMap that inherits a base via `usecmap`
(e.g. `/Identity-H usecmap`) and then declares only a partial local
codespacerange would have its partial codespace treated as
authoritative by Font::codes(), so inherited 2-byte codes fell through
code_width() to the 1-byte fallback and were mis-split.

We don't resolve the inherited base CMap, so flag the `usecmap` operator
at parse time and exclude an inherited codespace from has_codespace().
Callers then fall back to the ToUnicode codespace / fixed 2-byte width
(the pre-CID-encoding behavior). Local cidchar/cidrange overrides are
still applied; unmapped codes fall through to identity, correct for an
Identity-H base.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/odr/internal/pdf/pdf_cmap.hpp        | 25 ++++++++++++++++---
 src/odr/internal/pdf/pdf_cmap_parser.cpp | 11 +++++++++
 test/src/internal/pdf/pdf_cmap.cpp       | 31 ++++++++++++++++++++++++
 3 files changed, 63 insertions(+), 4 deletions(-)

diff --git a/src/odr/internal/pdf/pdf_cmap.hpp b/src/odr/internal/pdf/pdf_cmap.hpp
index 2dbc62bd..5131ce1b 100644
--- a/src/odr/internal/pdf/pdf_cmap.hpp
+++ b/src/odr/internal/pdf/pdf_cmap.hpp
@@ -34,11 +34,27 @@ class CMap {
   /// `ToUnicode` CMap); the caller then falls back to the `/Encoding`.
   [[nodiscard]] bool empty() const { return m_map.empty(); }
 
-  /// True when at least one codespace range was declared. When true, the code
-  /// widths this CMap implies are authoritative for splitting a code string
-  /// (see `code_width`); when false, callers fall back to a fixed width.
+  /// Records that the CMap stream referenced another CMap via `usecmap` (ISO
+  /// 32000-1 9.7.5.3). We do not resolve the inherited base, so whatever
+  /// codespace this stream declares locally is (potentially) incomplete and is
+  /// no longer treated as authoritative (see `has_codespace`).
+  void mark_inherits_external_cmap() { m_inherits_external_cmap = true; }
+
+  /// True when this stream referenced an unresolved base CMap via `usecmap`.
+  [[nodiscard]] bool inherits_external_cmap() const {
+    return m_inherits_external_cmap;
+  }
+
+  /// True when this CMap declares an authoritative codespace: at least one
+  /// range, and it does not inherit an unresolved base CMap via `usecmap`. When
+  /// true, the code widths this CMap implies are authoritative for splitting a
+  /// code string (see `code_width`); when false, callers fall back to another
+  /// CMap's codespace or a fixed width. An inherited (`usecmap`) codespace is
+  /// deliberately excluded — its local ranges may cover only an override
+  /// subset, so trusting them would mis-split the inherited (e.g. 2-byte)
+  /// codes.
   [[nodiscard]] bool has_codespace() const {
-    return !m_codespace_ranges.empty();
+    return !m_codespace_ranges.empty() && !m_inherits_external_cmap;
   }
 
   /// Byte width of a code whose first byte is `first`, decided by the codespace
@@ -81,6 +97,7 @@ class CMap {
     std::size_t width;      // byte width of the codes (to disambiguate widths)
   };
 
+  bool m_inherits_external_cmap{false};
   std::vector<CodespaceRange> m_codespace_ranges;
   std::unordered_map<std::string, std::u16string> m_map;
   std::unordered_map<std::string, std::uint32_t> m_cid_chars;
diff --git a/src/odr/internal/pdf/pdf_cmap_parser.cpp b/src/odr/internal/pdf/pdf_cmap_parser.cpp
index 24c01721..a34ee9e5 100644
--- a/src/odr/internal/pdf/pdf_cmap_parser.cpp
+++ b/src/odr/internal/pdf/pdf_cmap_parser.cpp
@@ -309,6 +309,17 @@ CMap CMapParser::parse_cmap() {
         read_cidchar(last_int, cmap);
       } else if (command == "begincidrange") {
         read_cidrange(last_int, cmap);
+      } else if (command == "usecmap") {
+        // `/BaseCMap usecmap`: this stream inherits another CMap's codespace
+        // and mappings (ISO 32000-1 9.7.5.3). We do not resolve the base, so
+        // any codespace declared locally is (potentially) an override-only
+        // subset; flag it so it is no longer treated as authoritative and
+        // callers fall back to the ToUnicode codespace / fixed width instead of
+        // mis-splitting the inherited (e.g. 2-byte) codes.
+        cmap.mark_inherits_external_cmap();
+        ODR_WARNING(*m_logger,
+                    "pdf: unresolved 'usecmap'; local codespace not treated as "
+                    "authoritative");
       }
     }
   }
diff --git a/test/src/internal/pdf/pdf_cmap.cpp b/test/src/internal/pdf/pdf_cmap.cpp
index b9a71406..137c6cec 100644
--- a/test/src/internal/pdf/pdf_cmap.cpp
+++ b/test/src/internal/pdf/pdf_cmap.cpp
@@ -4,6 +4,7 @@
 
 #include <sstream>
 #include <string>
+#include <string_view>
 
 #include <gtest/gtest.h>
 
@@ -168,3 +169,33 @@ TEST(PdfCMap, mixed_code_widths) {
 
   EXPECT_EQ(cmap.translate_string(std::string("\x01\x81\x41", 3)), "AB");
 }
+
+TEST(PdfCMap, codespace_is_authoritative_without_usecmap) {
+  CMap cmap = parse("1 begincodespacerange\n"
+                    "<0000> <FFFF>\n"
+                    "endcodespacerange\n");
+
+  EXPECT_FALSE(cmap.inherits_external_cmap());
+  EXPECT_TRUE(cmap.has_codespace());
+  EXPECT_EQ(cmap.code_width(0x00), 2);
+}
+
+TEST(PdfCMap, usecmap_disables_local_codespace_authority) {
+  // A stream that inherits `Identity-H` (2-byte codespace) via `usecmap` and
+  // only declares a 1-byte override codespace must not be trusted to split
+  // codes, or the inherited 2-byte codes would be mis-split as single bytes.
+  CMap cmap = parse("/Identity-H usecmap\n"
+                    "1 begincodespacerange\n"
+                    "<20> <20>\n"
+                    "endcodespacerange\n"
+                    "1 begincidchar\n"
+                    "<20> 1\n"
+                    "endcidchar\n");
+
+  EXPECT_TRUE(cmap.inherits_external_cmap());
+  // The partial local codespace is no longer authoritative.
+  EXPECT_FALSE(cmap.has_codespace());
+  // The local CID override is still applied.
+  EXPECT_TRUE(cmap.has_cid_map());
+  EXPECT_EQ(cmap.cid_for_code(std::string_view("\x20", 1)), 1u);
+}

From 495eb2b6226e4059713001503383f730fd40ff30 Mon Sep 17 00:00:00 2001
From: Andreas Stefl <stefl.andreas@gmail.com>
Date: Sat, 4 Jul 2026 23:52:30 +0200
Subject: [PATCH 6/8] update refs

---
 test/data/reference-output/odr-private | 2 +-
 test/data/reference-output/odr-public  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/data/reference-output/odr-private b/test/data/reference-output/odr-private
index 3fc6efc6..429ca6ac 160000
--- a/test/data/reference-output/odr-private
+++ b/test/data/reference-output/odr-private
@@ -1 +1 @@
-Subproject commit 3fc6efc6a2ded86e4ea9baab5bac2c59086bdfcd
+Subproject commit 429ca6ac35880a35c138e5bad4505e8b79be685d
diff --git a/test/data/reference-output/odr-public b/test/data/reference-output/odr-public
index 2f03cf25..51f96987 160000
--- a/test/data/reference-output/odr-public
+++ b/test/data/reference-output/odr-public
@@ -1 +1 @@
-Subproject commit 2f03cf258064e92ea38b8f0811d5ce97dc2faafe
+Subproject commit 51f969870b99b58ff807471b60f01f7668b43578

From 2cbe09292f07ba071097bfe40a511a1b47a7cf01 Mon Sep 17 00:00:00 2001
From: Andreas Stefl <stefl.andreas@gmail.com>
Date: Sun, 5 Jul 2026 00:10:40 +0200
Subject: [PATCH 7/8] PDF: use doc-comment style for CidRange field comments

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/odr/internal/pdf/pdf_cmap.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/odr/internal/pdf/pdf_cmap.hpp b/src/odr/internal/pdf/pdf_cmap.hpp
index 5131ce1b..5117aad1 100644
--- a/src/odr/internal/pdf/pdf_cmap.hpp
+++ b/src/odr/internal/pdf/pdf_cmap.hpp
@@ -91,10 +91,10 @@ class CMap {
   };
 
   struct CidRange {
-    std::uint32_t low;      // numeric value of the low code
-    std::uint32_t high;     // numeric value of the high code
-    std::uint32_t base_cid; // CID of the low code
-    std::size_t width;      // byte width of the codes (to disambiguate widths)
+    std::uint32_t low;      ///< numeric value of the low code
+    std::uint32_t high;     ///< numeric value of the high code
+    std::uint32_t base_cid; ///< CID of the low code
+    std::size_t width; ///< byte width of the codes (to disambiguate widths)
   };
 
   bool m_inherits_external_cmap{false};

From ff613ef7517c98f6980c60166c65386d2e5d9eff Mon Sep 17 00:00:00 2001
From: Andreas Stefl <stefl.andreas@gmail.com>
Date: Sun, 5 Jul 2026 00:10:40 +0200
Subject: [PATCH 8/8] PDF: represent CodeRange::Iterator cursor as a shrinking
 string_view

Replace the paired `const char *position, const char *end` cursor with a
single `string_view` that shrinks as codes are consumed. Bundling the two
pointers removes the mismatched-position/end footgun and lets `settle()`
use `size()` directly instead of pointer subtraction. `begin()`/`end()` now
pass the view (and an empty view at its tail) rather than reaching into
`data()`, dropping both suspicious-stringview-data-usage suppressions.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/odr/internal/pdf/pdf_document_element.hpp | 47 +++++++++----------
 1 file changed, 21 insertions(+), 26 deletions(-)

diff --git a/src/odr/internal/pdf/pdf_document_element.hpp b/src/odr/internal/pdf/pdf_document_element.hpp
index 5a04d029..3bc97ca8 100644
--- a/src/odr/internal/pdf/pdf_document_element.hpp
+++ b/src/odr/internal/pdf/pdf_document_element.hpp
@@ -265,10 +265,10 @@ class CodeRange {
     using reference = std::uint32_t;
 
     Iterator() = default;
-    Iterator(const char *position, const char *end, const std::size_t width,
+    Iterator(const std::string_view range, const std::size_t width,
              const CMap *codespace, const CMap *cid_map)
-        : m_position{position}, m_end{end}, m_fixed_width{width},
-          m_codespace{codespace}, m_cid_map{cid_map} {
+        : m_range{range}, m_fixed_width{width}, m_codespace{codespace},
+          m_cid_map{cid_map} {
       settle();
     }
 
@@ -278,20 +278,20 @@ class CodeRange {
       // is the code itself (`Identity-H/V`), as it is for simple fonts.
       if (m_cid_map != nullptr) {
         if (const std::optional<std::uint32_t> cid =
-                m_cid_map->cid_for_code(std::string_view(m_position, m_width));
+                m_cid_map->cid_for_code(m_range.substr(0, m_width));
             cid.has_value()) {
           return *cid;
         }
       }
       std::uint32_t code = 0;
       for (std::size_t k = 0; k < m_width; ++k) {
-        code = (code << 8) | static_cast<unsigned char>(m_position[k]);
+        code = (code << 8) | static_cast<unsigned char>(m_range[k]);
       }
       return code;
     }
 
     Iterator &operator++() {
-      m_position += m_width;
+      m_range.remove_prefix(m_width);
       settle();
       return *this;
     }
@@ -302,34 +302,32 @@ class CodeRange {
     }
 
     bool operator==(const Iterator &other) const {
-      return m_position == other.m_position;
+      return m_range.data() == other.m_range.data();
     }
 
   private:
-    // Fix the width of the code at `m_position`, dropping a trailing partial
-    // code (fewer bytes than its declared width) by clamping to `m_end` so the
-    // iterator lands exactly on the end sentinel.
+    /// Fix the width of the code at the front of `m_range`, dropping a trailing
+    /// partial code (fewer bytes than its declared width) by consuming the rest
+    /// of the range so the iterator lands exactly on the end sentinel.
     void settle() {
-      if (m_position >= m_end) {
-        m_position = m_end;
+      if (m_range.empty()) {
         m_width = 0;
         return;
       }
-      const auto remaining = static_cast<std::size_t>(m_end - m_position);
       std::size_t width = m_fixed_width;
       if (m_codespace != nullptr && m_codespace->has_codespace()) {
-        width = m_codespace->code_width(static_cast<std::uint8_t>(*m_position));
+        width =
+            m_codespace->code_width(static_cast<std::uint8_t>(m_range.front()));
       }
-      if (width == 0 || width > remaining) {
-        m_position = m_end; // drop the trailing partial code
+      if (width == 0 || width > m_range.size()) {
+        m_range.remove_prefix(m_range.size()); // drop the trailing partial code
         m_width = 0;
         return;
       }
       m_width = width;
     }
 
-    const char *m_position{nullptr};
-    const char *m_end{nullptr};
+    std::string_view m_range;
     std::size_t m_fixed_width{1};
     const CMap *m_codespace{nullptr};
     const CMap *m_cid_map{nullptr};
@@ -341,17 +339,14 @@ class CodeRange {
       : m_codes{codes}, m_width{width}, m_codespace{codespace},
         m_cid_map{cid_map} {}
 
-  // `data()` is used as a bounded byte range delimited by `end()`, not as a
-  // null-terminated string; the suspicious-data-usage check does not apply.
   [[nodiscard]] Iterator begin() const {
-    return {m_codes.data(), // NOLINT(bugprone-suspicious-stringview-data-usage)
-            m_codes.data() + m_codes.size(), m_width, m_codespace, m_cid_map};
+    return {m_codes, m_width, m_codespace, m_cid_map};
   }
   [[nodiscard]] Iterator end() const {
-    const char *stop =
-        m_codes.data() + // NOLINT(bugprone-suspicious-stringview-data-usage)
-        m_codes.size();
-    return {stop, stop, m_width, m_codespace, m_cid_map};
+    // An empty view positioned at the end of the codes: the exhausted `begin()`
+    // iterator consumes down to this same one-past-the-end pointer, so the two
+    // compare equal (`operator==` matches on `data()`).
+    return {m_codes.substr(m_codes.size()), m_width, m_codespace, m_cid_map};
   }
 
 private: