Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 111 additions & 5 deletions src/odr/internal/html/pdf_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,105 @@ std::string font_substitute_declaration(const pdf::FontSubstitute &substitute) {
return declaration;
}

/// Builds the `local(...)` source list for an `@font-face src` from a CSS
/// `font-family` stack.
///
/// The stack is split on every comma (quotes are not parsed, so a quoted name
/// containing a comma would be split too), each entry is trimmed of
/// surrounding spaces, and empty entries are skipped. Generic family keywords
/// are dropped because an `@font-face src` cannot name them; they are matched
/// ASCII case-insensitively, as CSS keywords are, and only when unquoted — a
/// quoted `'serif'` is an ordinary family name and is kept.
///
/// @param css_family comma-separated family stack, e.g.
///        "'Times New Roman',Times,serif".
/// @return the comma-joined sources, e.g.
///         "local('Times New Roman'),local(Times)", or "" when the stack
///         names no concrete font (generic-only or empty).
std::string local_font_sources(const std::string_view css_family) {
  // Generic families per CSS Fonts Level 4, spelled in lowercase so the
  // comparison below only has to fold the candidate.
  static constexpr std::array<std::string_view, 13> generics = {
      "serif",        "sans-serif", "monospace",     "cursive",
      "fantasy",      "system-ui",  "ui-serif",      "ui-sans-serif",
      "ui-monospace", "ui-rounded", "emoji",         "math",
      "fangsong"};
  const auto is_generic = [](const std::string_view name) {
    return std::any_of(
        generics.begin(), generics.end(),
        [name](const std::string_view keyword) {
          return std::equal(name.begin(), name.end(), keyword.begin(),
                            keyword.end(), [](const char a, const char b) {
                              const char folded =
                                  (a >= 'A' && a <= 'Z')
                                      ? static_cast<char>(a - 'A' + 'a')
                                      : a;
                              return folded == b;
                            });
        });
  };

  std::string src;
  std::size_t start = 0;
  while (true) {
    const std::size_t comma = css_family.find(',', start);
    const bool last = comma == std::string_view::npos;
    // `start` never exceeds `size()` (it is 0 or one past a found comma), so
    // `substr` cannot throw.
    std::string_view name = css_family.substr(
        start, last ? std::string_view::npos : comma - start);
    while (!name.empty() && name.front() == ' ') {
      name.remove_prefix(1);
    }
    while (!name.empty() && name.back() == ' ') {
      name.remove_suffix(1);
    }
    if (!name.empty() && !is_generic(name)) {
      if (!src.empty()) {
        src += ',';
      }
      src += "local(";
      src += name;
      src += ')';
    }
    if (last) {
      break;
    }
    start = comma + 1;
  }
  return src;
}

/// Registers one `@font-face` per (substitute family, style, ascent) that
/// overrides the face's ascent/descent so a glyph's baseline lands exactly at
/// the `top` `add_position_classes` derives from `ascent_em` — independent of
/// the metrics of whichever local font actually resolves. Without the override
/// the browser positions the baseline using the resolved font's own ascent,
/// which for a large non-embedded run (e.g. a 120pt Times title) drops it well
/// below the intended baseline.
class SubstituteFontFaces {
public:
/// The `font-family:...` (plus weight/style) declaration for `substitute`,
/// routed through a generated metric-overriding face `'odr-sN'`. Falls back
/// to the plain family stack when the stack names no concrete font.
std::string declaration(const pdf::FontSubstitute &substitute,
const double ascent_em) {
const std::string src = local_font_sources(substitute.css_family);
if (src.empty()) {
return font_substitute_declaration(substitute);
}
// ascent-override + descent-override sum to one em, so `line-height:1`
// leaves no leading and the baseline sits at exactly `ascent_em` of the em
// box. `ascent_em` is clamped to [0.5, 1.2]; the `max` keeps descent
// non-negative for the rare ascent > 1 (a slight baseline approximation).
const double ascent = ascent_em;
const double descent = std::max(0.0, 1.0 - ascent_em);
std::ostringstream key;
key << src << '|' << substitute.bold << '|' << substitute.italic << '|'
<< std::llround(ascent * 1000.0);
const auto [it, inserted] = m_index_by_key.try_emplace(
std::move(key).str(), static_cast<int>(m_faces.size()) + 1);
if (inserted) {
std::ostringstream face;
face << "@font-face{font-family:'odr-s" << it->second << "';src:" << src
<< ";ascent-override:" << round2(ascent * 100.0)
<< "%;descent-override:" << round2(descent * 100.0)
<< "%;line-gap-override:0%}";
m_faces.push_back(std::move(face).str());
}
std::string declaration = "font-family:'odr-s" +
std::to_string(it->second) + "'," +
substitute.css_family;
if (substitute.bold) {
declaration += ";font-weight:bold";
}
if (substitute.italic) {
declaration += ";font-style:italic";
}
return declaration;
}

/// Appends the collected `@font-face` rules to `out`.
void append_faces(std::string &out) const {
for (const std::string &face : m_faces) {
out += face;
}
}

private:
std::map<std::string, int> m_index_by_key;
std::vector<std::string> m_faces;
};

/// Build an SVG `d` attribute from a path's subpaths, each point mapped through
/// `to_box` (PDF user space -> the page box, y-down). Lines become `L`, cubic
/// Béziers `C`, and an explicitly closed subpath ends with `Z`.
Expand Down Expand Up @@ -926,6 +1025,7 @@ class HtmlServiceImpl final : public HtmlService {
std::uint32_t family_count = 0;
std::string font_faces;
std::string font_styles; // per-font `.fvN` (visible) / `.fnN` (invisible)
SubstituteFontFaces substitute_faces; // metric-overriding `'odr-sN'` faces
std::vector<const pdf::Font *> accepted_fonts;
// Which classes are used: [0]=fv (visible), [1]=fn (invisible).
std::vector<std::array<bool, 2>> font_class_used;
Expand Down Expand Up @@ -1072,9 +1172,12 @@ class HtmlServiceImpl final : public HtmlService {
run_classes += font_class(font_class_used, font, invisible);
} else if (text.font != nullptr && text.font->substitute) {
// Non-embedded font: render the real Unicode in the substitute
// family (embedded fonts carry the family in `font_class`).
add_class(run_classes, "ff",
font_substitute_declaration(*text.font->substitute));
// family (embedded fonts carry the family in `font_class`). The
// metric-overriding face pins the baseline to `asc` (see
// `SubstituteFontFaces`).
add_class(
run_classes, "ff",
substitute_faces.declaration(*text.font->substitute, asc));
}
if (vis_margin_pt != 0) {
add_class(run_classes, "ml", pt_decl("margin-left", vis_margin_pt));
Expand Down Expand Up @@ -1242,6 +1345,7 @@ class HtmlServiceImpl final : public HtmlService {
write_font_face(*accepted_fonts[i], i, {}, font_class_used[i], font_faces,
font_styles);
}
substitute_faces.append_faces(font_faces);

// Write HTML.
write_header_common(out, font_faces, font_styles, styles, [&] {
Expand Down Expand Up @@ -1448,7 +1552,8 @@ class HtmlServiceImpl final : public HtmlService {

std::uint32_t family_count = 0;
std::string font_faces;
std::string font_styles; // ".fvN{...}" / ".fnN{...}"
std::string font_styles; // ".fvN{...}" / ".fnN{...}"
SubstituteFontFaces substitute_faces; // metric-overriding `'odr-sN'` faces
std::vector<pdf::Font *> accepted_fonts;
// Per-font, per-uchar, per-glyph occurrence count (pre-pass).
// Indexed by font_index - 1.
Expand Down Expand Up @@ -1661,7 +1766,7 @@ class HtmlServiceImpl final : public HtmlService {
const std::string substitute_declaration =
(font == 0 && !invisible && text.font != nullptr &&
text.font->substitute)
? font_substitute_declaration(*text.font->substitute)
? substitute_faces.declaration(*text.font->substitute, asc)
: std::string();
std::ostringstream fk;
fk << font << '|' << invisible << '|' << font_size_pt << '|' << cs_pt
Expand Down Expand Up @@ -1742,6 +1847,7 @@ class HtmlServiceImpl final : public HtmlService {
write_font_face(*accepted_fonts[i], i, used_unicode[i],
font_class_used[i], font_faces, font_styles);
}
substitute_faces.append_faces(font_faces);

// ---- Pass 2: write HTML ---------------------------------------------
write_header_common(out, font_faces, font_styles, styles, [&] {
Expand Down
29 changes: 23 additions & 6 deletions src/odr/internal/pdf/AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,10 @@ inline SVG per page. Experimental and not production-quality.
(`DecoderEngine::odr`); `is_decodable()` returns `false` and `file_meta()`
carries only the file type. All parsing is lazy, on HTML request.
- **Object syntax**: null, booleans, integers/reals, names (incl. `#xx`
escapes), literal strings (`\` and `\ooo` escapes), hex strings, arrays,
dictionaries, indirect references (`n g R`) — standalone and nested.
escapes), literal strings (the `\n \r \t \b \f` control escapes, `\ddd`
octal, escaped delimiters, and `\`-before-EOL line continuation — Table 3),
hex strings, arrays, dictionaries, indirect references (`n g R`) —
standalone and nested.
- **File structure**: header, `n g obj … endobj`, `stream` payloads (via
`/Length`, with a scan-to-`endstream` fallback), classic `xref` tables,
`trailer`, `startxref`, `%%EOF`; both sequential reading (`read_entry`) and
Expand Down Expand Up @@ -103,7 +105,11 @@ inline SVG per page. Experimental and not production-quality.
(Type0) fonts** are recognized: the descendant CIDFont's
`/CIDSystemInfo` `/Registry`/`/Ordering` is recorded on the `Font`, and the
Type0 `/Encoding` (a code → CID CMap such as `Identity-H`) is kept out of the
simple-font encoding path. Extraction is driven by the `/ToUnicode` CMap (the
simple-font encoding path. An *embedded* `/Encoding` CMap stream is parsed
(`cidchar`/`cidrange` → `Font::cid_encoding`) so `Font::codes()` yields CIDs
through it — this also carries the authoritative codespace, so a producer that
mixes a 1-byte code (e.g. a space) among 2-byte CIDs stays aligned and selects
the right glyph/advance. Extraction is driven by the `/ToUnicode` CMap (the
common case — every Type0 font in the corpus carries one). When a composite
font has no `/ToUnicode`, a **predefined Unicode `/Encoding`** — the
`Uni*-UCS2/UTF16/UTF32` CMaps — is decoded directly (`pdf_cid`), since those
Expand Down Expand Up @@ -230,6 +236,16 @@ inline SVG per page. Experimental and not production-quality.
`OS/2`/`hhea` (`font/cff_transform.cpp` `serialize_os2`/`serialize_hhea`; the
SFNT path passes the originals through), so our ascent can match the
browser's.
- **Non-embedded substitutes** render in a *local* system font whose
`hhea`/`OS/2` we do **not** control, so the box-top→baseline distance would
be that font's ascent, not our `ascent_em` — dropping e.g. a 120pt Times
title well below its intended baseline. `SubstituteFontFaces` (in
`pdf_file.cpp`) closes this by routing each substitute through a generated
`@font-face` (`'odr-sN'`, `src: local(...)` of the family stack) carrying
`ascent-override:ascent_em`, `descent-override:max(0, 1−ascent_em)`,
`line-gap-override:0` — so the browser positions the baseline from *our*
metric. Faces are deduped by (family, style, ascent); the family stack is
kept after `'odr-sN'` as a fallback for the rare unresolved local.
- **`ascent_em`** (in `pdf_file.cpp`): FontDescriptor `/Ascent`, else the
embedded font's `bounding_box().y_max / units_per_em()`, else `0.8` em (which
matches `serialize_os2`'s degenerate 0.8/0.2 fallback, so the fallback font
Expand Down Expand Up @@ -565,9 +581,10 @@ the tables land.
there is no `/ToUnicode` (with an unreachable glyph staying unmapped) and a
`/ToUnicode` CMap taking precedence over the reverse map.

No assertion-based coverage of the tokenizer (escapes, references, hex strings)
or the HTML output itself (the span emission / CSS transform mapping, incl. the
dual-layer glyph/Unicode emission).
The tokenizer's string parsing is covered (`PdfObjectParser`: literal-string
control/octal/delimiter/line-continuation escapes and hex strings); references
and the HTML output itself (the span emission / CSS transform mapping, incl. the
dual-layer glyph/Unicode emission) are not yet asserted.

---

Expand Down
38 changes: 35 additions & 3 deletions src/odr/internal/pdf/pdf_cmap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,36 @@ void CMap::map_single(std::string code, std::u16string unicode) {
m_map[std::move(code)] = std::move(unicode);
}

std::size_t CMap::code_length(const std::string &codes,
const std::size_t pos) const {
const auto first = static_cast<std::uint8_t>(codes[pos]);
/// Records a `cidchar` entry: the code's raw bytes select `cid`. A later entry
/// for the same bytes replaces the earlier one.
void CMap::map_cid_char(std::string code, const std::uint32_t cid) {
  m_cid_chars.insert_or_assign(std::move(code), cid);
}

void CMap::add_cid_range(const std::uint32_t low, const std::uint32_t high,
const std::uint32_t base_cid,
const std::size_t width) {
m_cid_ranges.push_back({low, high, base_cid, width});
}

/// Looks up the CID selected by `code` (raw big-endian bytes).
///
/// An exact `cidchar` entry is tried first; otherwise the first `cidrange`
/// (in insertion order) whose byte width equals `code.size()` and whose
/// `low..high` interval contains the code's numeric value is used. Returns
/// `std::nullopt` when neither covers the code.
std::optional<std::uint32_t>
CMap::cid_for_code(const std::string_view code) const {
  const auto exact = m_cid_chars.find(std::string(code));
  if (exact != m_cid_chars.end()) {
    return exact->second;
  }

  // Fold the bytes, most significant first, into one number.
  std::uint32_t numeric = 0;
  for (std::size_t i = 0; i < code.size(); ++i) {
    numeric = (numeric << 8) | static_cast<std::uint8_t>(code[i]);
  }

  for (const CidRange &candidate : m_cid_ranges) {
    if (candidate.width != code.size()) {
      continue; // a range only applies to codes of its own width
    }
    if (numeric < candidate.low || numeric > candidate.high) {
      continue;
    }
    return candidate.base_cid + (numeric - candidate.low);
  }
  return std::nullopt;
}

std::size_t CMap::code_width(const std::uint8_t first) const {
for (const CodespaceRange &range : m_codespace_ranges) {
if (first >= static_cast<std::uint8_t>(range.low.front()) &&
first <= static_cast<std::uint8_t>(range.high.front())) {
Expand All @@ -30,6 +57,11 @@ std::size_t CMap::code_length(const std::string &codes,
return 1;
}

/// Byte width of the code that starts at `codes[pos]`, as decided by
/// `code_width` from that first byte. `pos` is not range-checked here.
std::size_t CMap::code_length(const std::string &codes,
                              const std::size_t pos) const {
  const auto lead_byte = static_cast<std::uint8_t>(codes[pos]);
  return code_width(lead_byte);
}

std::string CMap::translate_string(const std::string &codes) const {
std::u16string result;

Expand Down
68 changes: 68 additions & 0 deletions src/odr/internal/pdf/pdf_cmap.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
#pragma once

#include <cstddef>
#include <cstdint>
#include <optional>
#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>

Expand All @@ -18,12 +21,67 @@ class CMap {
void add_codespace_range(std::string low_code, std::string high_code);
void map_single(std::string code, std::u16string unicode);

/// CID mapping (a composite font's `/Encoding` CMap stream, ISO 32000-1
/// 9.7.5.3): `cidchar` maps a single code, `cidrange` maps a contiguous block
/// (`base_cid + (code - low)`). Codes are keyed by their raw big-endian bytes
/// / width so a 1-byte and a 2-byte code with the same numeric value stay
/// distinct (`<20>` -> CID 229 vs `<0020>` -> CID 32), the mixed-width case.
void map_cid_char(std::string code, std::uint32_t cid);
void add_cid_range(std::uint32_t low, std::uint32_t high,
std::uint32_t base_cid, std::size_t width);

/// True when no code -> Unicode mapping was parsed (e.g. the font carries no
/// `ToUnicode` CMap); the caller then falls back to the `/Encoding`.
/// Only the code -> Unicode table is consulted: `cidchar`/`cidrange` entries
/// and codespace ranges do not affect the result.
[[nodiscard]] bool empty() const { return m_map.empty(); }

/// Records that the CMap stream referenced another CMap via `usecmap` (ISO
/// 32000-1 9.7.5.3). We do not resolve the inherited base, so whatever
/// codespace this stream declares locally is (potentially) incomplete and is
/// no longer treated as authoritative (see `has_codespace`).
/// Only sets the flag; calling it more than once has no further effect.
void mark_inherits_external_cmap() { m_inherits_external_cmap = true; }

/// True when this stream referenced an unresolved base CMap via `usecmap`,
/// i.e. once `mark_inherits_external_cmap` has been called.
[[nodiscard]] bool inherits_external_cmap() const {
return m_inherits_external_cmap;
}

/// True when this CMap declares an authoritative codespace: at least one
/// range, and it does not inherit an unresolved base CMap via `usecmap`. When
/// true, the code widths this CMap implies are authoritative for splitting a
/// code string (see `code_width`); when false, callers fall back to another
/// CMap's codespace or a fixed width. An inherited (`usecmap`) codespace is
/// deliberately excluded — its local ranges may cover only an override
/// subset, so trusting them would mis-split the inherited (e.g. 2-byte)
/// codes.
[[nodiscard]] bool has_codespace() const {
  // An unresolved `usecmap` base makes the local ranges non-authoritative.
  if (m_inherits_external_cmap) {
    return false;
  }
  return !m_codespace_ranges.empty();
}

/// Byte width of a code whose first byte is `first`, decided by the codespace
/// ranges (matched on the first byte, ISO 32000-1 9.7.6.2). Falls back to a
/// single byte when no range declares/matches it. This is the variable-width
/// split that `translate_string` uses; exposing it lets the glyph/advance
/// paths split codes identically, so a mixed 1-/2-byte codespace (e.g. a
/// 1-byte space among 2-byte CIDs) stays aligned across both.
[[nodiscard]] std::size_t code_width(std::uint8_t first) const;

[[nodiscard]] std::string translate_string(const std::string &codes) const;

/// True when at least one `cidchar`/`cidrange` mapping was parsed (an
/// embedded CID `/Encoding` CMap). When false, the composite code -> CID
/// mapping is identity (`Identity-H/V`).
[[nodiscard]] bool has_cid_map() const {
  const bool no_entries = m_cid_chars.empty() && m_cid_ranges.empty();
  return !no_entries;
}

/// The CID a code (raw big-endian bytes) selects, or `nullopt` when no
/// `cidchar`/`cidrange` covers it; the caller then falls back to identity
/// (CID = code). The width of `code` is matched, so a 1-byte code never hits
/// a 2-byte range.
[[nodiscard]] std::optional<std::uint32_t>
cid_for_code(std::string_view code) const;

private:
struct CodespaceRange {
// `low` and `high` share the same length; that length is the code width in
Expand All @@ -32,8 +90,18 @@ class CMap {
std::string high;
};

/// One `cidrange` entry: `width`-byte codes whose numeric value lies in
/// `low..high` (both ends included) map to `base_cid + (code - low)`.
struct CidRange {
std::uint32_t low; ///< numeric value of the low code
std::uint32_t high; ///< numeric value of the high code
std::uint32_t base_cid; ///< CID of the low code
std::size_t width; ///< byte width of the codes (to disambiguate widths)
};

bool m_inherits_external_cmap{false};
std::vector<CodespaceRange> m_codespace_ranges;
std::unordered_map<std::string, std::u16string> m_map;
std::unordered_map<std::string, std::uint32_t> m_cid_chars;
std::vector<CidRange> m_cid_ranges;

/// Byte width of the code starting at `pos`, decided by the codespace ranges;
/// falls back to a single byte when no range declares/matches it.
Expand Down
Loading
Loading