Skip to main content

ethos_pdf/
lib.rs

1/*
2 * Copyright 2026 The Ethos maintainers
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17//! # ethos-pdf — WS-ENGINE lane (Milestone A)
18//!
19//! The only crate that loads PDFium (invariant 3). Everything crossing
20//! [`EthosPdfBackend`] is already normalized + quantized (invariant 1: quantize-at-
21//! extraction lives here); public schemas/APIs never see PDFium types.
22//!
23//! This first WS-ENGINE slice uses a small dynamic FFI boundary over the PDFium C API.
24//! Runtime loading is explicit through `ETHOS_PDFIUM_LIBRARY_PATH`, so parser output
25//! cannot accidentally depend on an unknown library from a host search path.
26
27#![deny(unsafe_op_in_unsafe_fn)]
28#![warn(missing_docs)]
29
30use std::collections::{BTreeMap, HashSet};
31use std::env;
32use std::ffi::{c_char, c_int, c_ulong, c_void, CString};
33use std::path::{Path, PathBuf};
34use std::ptr;
35use std::slice;
36use std::sync::{Mutex, OnceLock};
37
38use ethos_core::codes::WarningCode;
39use ethos_core::config::{PageSelection, ParseConfig};
40use ethos_core::error::{ErrorCode, EthosError};
41use ethos_core::geom::{quantize, QRect};
42use ethos_core::ids::{page_id, span_id, warning_id};
43use ethos_core::model::{Page, Span, SpanOriginLocator, Warning};
44use ethos_core::traits::{BackendManifest, EthosPdfBackend, Extraction};
45use serde::{Deserialize, Serialize};
46
/// Environment variable containing the exact PDFium dynamic library path.
pub const PDFIUM_LIBRARY_PATH_ENV: &str = "ETHOS_PDFIUM_LIBRARY_PATH";

/// Optional environment variable carrying the pinned PDFium release/version string.
pub const PDFIUM_VERSION_ENV: &str = "ETHOS_PDFIUM_VERSION";

/// Optional environment variable containing the downloaded Phase 1 release artifact path.
pub const PDFIUM_ARTIFACT_PATH_ENV: &str = "ETHOS_PDFIUM_ARTIFACT_PATH";

/// Profile quantization: 100 quanta per PDF point.
pub const QUANTUM_PER_POINT: u32 = 100;
// Name of the span origin-locator policy.
// NOTE(review): no use of this constant is visible in this part of the file —
// confirm where it is emitted before renaming or bumping the version suffix.
const ORIGIN_LOCATOR_POLICY: &str = "origin-run-locator-v1";

// Deterministic profile JSON, embedded at compile time. It is parsed and
// validated once by `pinned_pdfium_profile`.
const DETERMINISTIC_PROFILE_JSON: &str = include_str!("../assets/ethos-deterministic-v1.json");
// Font substitution table JSON, embedded at compile time.
// NOTE(review): presumably parsed into `FONT_SUBSTITUTION_TABLE`; the parser is
// not visible in this part of the file.
const FONT_SUBSTITUTION_TABLE_JSON: &str = include_str!("../assets/font-substitution-table.json");

/// PDFium has process-global library state; serialize init/load/destroy for now.
///
/// Every public entry point that loads the runtime takes this lock first and
/// holds it until it returns.
static PDFIUM_LOCK: Mutex<()> = Mutex::new(());
// Lazily initialized pinned backend profile; filled by `pinned_pdfium_profile`.
static PINNED_PDFIUM_PROFILE: OnceLock<PinnedPdfiumBackend> = OnceLock::new();
// Lazily initialized font substitution table.
// NOTE(review): the initializer is not visible in this part of the file.
static FONT_SUBSTITUTION_TABLE: OnceLock<FontSubstitutionTable> = OnceLock::new();
67
/// PDFium backend implementation.
///
/// Every field is an optional explicit override. When a field is `None`, the
/// matching `configured_*` accessor falls back to the corresponding
/// `ETHOS_PDFIUM_*` environment variable. The version additionally falls back
/// to the version pinned in the embedded deterministic profile.
#[derive(Debug, Clone, Default)]
pub struct PdfiumBackend {
    // Explicit PDFium dynamic library path (else `PDFIUM_LIBRARY_PATH_ENV`).
    library_path: Option<PathBuf>,
    // Explicit downloaded release artifact path (else `PDFIUM_ARTIFACT_PATH_ENV`).
    artifact_path: Option<PathBuf>,
    // Explicit version string (else `PDFIUM_VERSION_ENV`, else the pinned version).
    version: Option<String>,
}
75
/// Debug-only report of PDFium text geometry signals.
///
/// This is not part of the canonical document contract. It exists so Gate Zero
/// investigations can compare native PDFium geometry sources across platforms
/// before changing parser output or fingerprint policy.
///
/// Produced by [`PdfiumBackend::geometry_probe`]. The type only derives
/// `Serialize`, and serde's derive emits fields in declaration order, so
/// reordering fields changes the serialized report.
#[derive(Debug, Serialize)]
pub struct GeometryProbeReport {
    /// Report schema identifier.
    ///
    /// `geometry_probe` sets this to `"ethos-pdfium-geometry-probe-v1"`.
    pub schema_version: String,
    /// Quantization used for every reported coordinate.
    ///
    /// `geometry_probe` sets this to [`QUANTUM_PER_POINT`].
    pub quantum_per_point: u32,
    /// Backend manifest for the loaded PDFium runtime.
    pub backend: BackendManifest,
    /// Probed pages.
    ///
    /// Only pages selected by the parse configuration are included, in
    /// ascending page order.
    pub pages: Vec<GeometryProbePage>,
}
92
/// Per-page debug geometry signals.
///
/// One entry of [`GeometryProbeReport::pages`]. Serialized field order follows
/// the declaration order below.
#[derive(Debug, Serialize)]
pub struct GeometryProbePage {
    /// Canonical page id.
    pub id: String,
    /// 1-based original page index.
    pub index: u32,
    /// Quantized page width.
    pub width: i64,
    /// Quantized page height.
    pub height: i64,
    /// Page rotation in degrees.
    pub rotation: u16,
    /// PDFium text character count.
    pub char_count: i32,
    /// Optional PDFium text symbols available in this runtime.
    pub symbols: GeometryProbeSymbols,
    /// Per-character geometry records.
    pub chars: Vec<GeometryProbeChar>,
    /// Parser-like text runs with alternative geometry unions.
    pub runs: Vec<GeometryProbeRun>,
}
115
/// Optional PDFium geometry symbols discovered at runtime.
///
/// Each flag records whether the named PDFium entry point(s) were available,
/// which tells a reader why the matching optional probe fields may be absent.
#[derive(Debug, Serialize)]
pub struct GeometryProbeSymbols {
    /// Whether FPDFText_GetCharOrigin is available.
    pub char_origin: bool,
    /// Whether FPDFText_GetLooseCharBox is available.
    pub loose_char_box: bool,
    /// Whether FPDFText_CountRects and FPDFText_GetRect are available.
    pub text_rects: bool,
}
126
/// Per-character geometry probe record.
///
/// One entry of [`GeometryProbePage::chars`]. Geometry fields are `Option`
/// because the underlying PDFium symbol or value may be unavailable.
#[derive(Debug, Serialize)]
pub struct GeometryProbeChar {
    /// Zero-based PDFium character index.
    pub index: i32,
    /// Unicode scalar value reported by PDFium.
    pub unicode: u32,
    /// Character as a string when it is a valid scalar value.
    pub text: Option<String>,
    /// Why this character would break or be skipped by the parser run builder.
    pub parser_action: String,
    /// Current parser-critical FPDFText_GetCharBox geometry.
    pub char_box: Option<QRect>,
    /// FPDFText_GetLooseCharBox geometry when the symbol is present.
    pub loose_char_box: Option<QRect>,
    /// FPDFText_GetCharOrigin point when the symbol is present.
    pub char_origin: Option<[i64; 2]>,
    /// Deterministic font id used by the parser.
    pub font_id: Option<String>,
    /// PDFium font descriptor flags used by the parser.
    pub font_flags: Option<u32>,
    /// Quantized font size used by the parser.
    pub font_size_q: Option<i64>,
}
151
/// Parser-like text run with alternative PDFium geometry sources.
///
/// One entry of [`GeometryProbePage::runs`]. It places the bbox the parser
/// currently uses (`char_box_union`) next to the alternative geometry sources
/// so they can be compared for the same run of characters.
#[derive(Debug, Serialize)]
pub struct GeometryProbeRun {
    /// One-based run index on this page.
    pub index: u32,
    /// Run text after parser skip/break rules.
    pub text: String,
    /// First included PDFium character index.
    pub char_start: i32,
    /// Exclusive end PDFium character index.
    pub char_end: i32,
    /// Included character indices.
    pub char_indices: Vec<i32>,
    /// Current parser span bbox: union of FPDFText_GetCharBox records.
    pub char_box_union: Option<QRect>,
    /// Union of FPDFText_GetLooseCharBox records when available.
    pub loose_char_box_union: Option<QRect>,
    /// Rectangles from FPDFText_CountRects/GetRect for the run range when available.
    pub text_rects: Vec<QRect>,
    /// Union of text_rects when available.
    pub text_rect_union: Option<QRect>,
    /// Origin of first included character when available.
    pub first_origin: Option<[i64; 2]>,
    /// Origin of last included character when available.
    pub last_origin: Option<[i64; 2]>,
    /// Deterministic font id used by the parser.
    pub font_id: Option<String>,
    /// PDFium font descriptor flags used by the parser.
    pub font_flags: Option<u32>,
    /// Quantized font size used by the parser.
    pub font_size_q: Option<i64>,
}
184
/// Raw crop rendered from a PDF page.
///
/// This is the pre-encoding renderer boundary used by `ethos-render` work. It
/// deliberately exposes raw BGRA bytes and a byte hash before PNG/JPEG encoding
/// is added, so callers can test the renderer itself before artifact encoding.
///
/// Returned by [`PdfiumBackend::render_crop_raw`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RawCrop {
    /// 1-based source page index.
    pub page_index: u32,
    /// Source bbox in Ethos quantized top-left coordinates.
    pub bbox: QRect,
    /// Crop width in pixels.
    pub width_px: u32,
    /// Crop height in pixels.
    pub height_px: u32,
    /// Bytes per crop row.
    pub stride: u32,
    /// Pixel format for `bytes`.
    // NOTE(review): presumably a BGRA label, per the type-level docs — confirm
    // the exact string against the renderer.
    pub pixel_format: &'static str,
    /// SHA-256 hex digest of `bytes`.
    pub sha256: String,
    /// Tightly packed crop bytes.
    pub bytes: Vec<u8>,
}
209
210impl PdfiumBackend {
211    /// Construct a backend using an explicit PDFium dynamic library path.
212    pub fn from_library_path(path: impl Into<PathBuf>) -> Self {
213        PdfiumBackend {
214            library_path: Some(path.into()),
215            artifact_path: None,
216            version: None,
217        }
218    }
219
220    /// Add an explicit downloaded PDFium release artifact path for archive-hash verification.
221    pub fn with_artifact_path(mut self, path: impl Into<PathBuf>) -> Self {
222        self.artifact_path = Some(path.into());
223        self
224    }
225
226    /// Construct a backend using an explicit PDFium path and pinned version string.
227    pub fn with_version(mut self, version: impl Into<String>) -> Self {
228        self.version = Some(version.into());
229        self
230    }
231
232    fn configured_library_path(&self) -> Option<PathBuf> {
233        self.library_path
234            .clone()
235            .or_else(|| env::var_os(PDFIUM_LIBRARY_PATH_ENV).map(PathBuf::from))
236    }
237
238    fn configured_artifact_path(&self) -> Option<PathBuf> {
239        self.artifact_path
240            .clone()
241            .or_else(|| env::var_os(PDFIUM_ARTIFACT_PATH_ENV).map(PathBuf::from))
242    }
243
244    fn configured_version_override(&self) -> Option<String> {
245        self.version
246            .clone()
247            .or_else(|| env::var(PDFIUM_VERSION_ENV).ok())
248    }
249
250    fn configured_version(&self) -> String {
251        self.configured_version_override()
252            .unwrap_or_else(|| pinned_pdfium_profile().version.clone())
253    }
254
255    /// Produce a debug-only geometry-source probe from PDFium text APIs.
256    ///
257    /// The returned data is diagnostic evidence only. It is intentionally
258    /// separate from [`EthosPdfBackend::extract`] so parser behavior,
259    /// canonical JSON, and document fingerprints cannot change by accident.
260    pub fn geometry_probe(
261        &self,
262        pdf_bytes: &[u8],
263        config: &ParseConfig,
264    ) -> Result<GeometryProbeReport, EthosError> {
265        validate_pdf_header(pdf_bytes)?;
266        let _guard = PDFIUM_LOCK.lock().unwrap_or_else(|e| e.into_inner());
267        let runtime = PdfiumRuntime::load(self)?;
268        let doc = runtime.load_document(pdf_bytes)?;
269        let page_count = doc.page_count()?;
270        if page_count > config.limits.max_pages {
271            return Err(EthosError::new(
272                ErrorCode::PageLimitExceeded,
273                "page count exceeds configured limit",
274            ));
275        }
276        validate_page_selection(&config.pages, page_count)?;
277
278        let mut pages = Vec::new();
279        for page_index in 0..page_count {
280            let original_page = page_index + 1;
281            if !config.pages.contains(original_page) {
282                continue;
283            }
284            let page = doc.load_page(page_index)?;
285            pages.push(page.geometry_probe_page(original_page)?);
286        }
287
288        Ok(GeometryProbeReport {
289            schema_version: "ethos-pdfium-geometry-probe-v1".to_string(),
290            quantum_per_point: QUANTUM_PER_POINT,
291            backend: self.manifest(),
292            pages,
293        })
294    }
295
296    /// Render a raw BGRA crop for a 1-based page and quantized top-left bbox.
297    ///
298    /// The current boundary renders the page at 1 pixel per PDF point, then
299    /// crops the requested bbox. It is intentionally simple; direct crop-window
300    /// rendering can replace it later without changing the output contract.
301    pub fn render_crop_raw(
302        &self,
303        pdf_bytes: &[u8],
304        page_index: u32,
305        bbox: QRect,
306    ) -> Result<RawCrop, EthosError> {
307        validate_pdf_header(pdf_bytes)?;
308        if page_index == 0 {
309            return Err(EthosError::new(
310                ErrorCode::PageLimitExceeded,
311                "page selection out of document range",
312            ));
313        }
314        let _guard = PDFIUM_LOCK.lock().unwrap_or_else(|e| e.into_inner());
315        let runtime = PdfiumRuntime::load(self)?;
316        let doc = runtime.load_document(pdf_bytes)?;
317        let page_count = doc.page_count()?;
318        if page_index > page_count {
319            return Err(EthosError::new(
320                ErrorCode::PageLimitExceeded,
321                "page selection out of document range",
322            ));
323        }
324        let page = doc.load_page(page_index - 1)?;
325        page.render_crop_raw(page_index, bbox)
326    }
327}
328
329impl EthosPdfBackend for PdfiumBackend {
330    fn manifest(&self) -> BackendManifest {
331        let platform_sha256 = self
332            .configured_library_path()
333            .and_then(|path| std::fs::read(path).ok())
334            .map(|bytes| ethos_core::c14n::sha256_hex_bytes(&bytes))
335            .unwrap_or_else(|| "0".repeat(64));
336        BackendManifest {
337            id: "pdfium".to_string(),
338            phase: 1,
339            version: self.configured_version(),
340            platform_sha256,
341        }
342    }
343
344    fn page_count(&self, pdf_bytes: &[u8]) -> Result<u32, EthosError> {
345        validate_pdf_header(pdf_bytes)?;
346        let _guard = PDFIUM_LOCK.lock().unwrap_or_else(|e| e.into_inner());
347        let runtime = PdfiumRuntime::load(self)?;
348        let doc = runtime.load_document(pdf_bytes)?;
349        doc.page_count()
350    }
351
352    fn extract(&self, pdf_bytes: &[u8], config: &ParseConfig) -> Result<Extraction, EthosError> {
353        validate_pdf_header(pdf_bytes)?;
354        let _guard = PDFIUM_LOCK.lock().unwrap_or_else(|e| e.into_inner());
355        let runtime = PdfiumRuntime::load(self)?;
356        let doc = runtime.load_document(pdf_bytes)?;
357        let page_count = doc.page_count()?;
358        if page_count > config.limits.max_pages {
359            return Err(EthosError::new(
360                ErrorCode::PageLimitExceeded,
361                "page count exceeds configured limit",
362            ));
363        }
364        validate_page_selection(&config.pages, page_count)?;
365
366        let mut pages = Vec::new();
367        let mut spans = Vec::new();
368        let mut warnings = Vec::new();
369        let mut next_span = 1u32;
370        let mut next_warning = 1u32;
371
372        for page_index in 0..page_count {
373            let original_page = page_index + 1;
374            if !config.pages.contains(original_page) {
375                continue;
376            }
377            let page = doc.load_page(page_index)?;
378            let page_model = page.model_page(original_page)?;
379            let span_count_before = spans.len();
380            page.extract_text_spans(&page_model, &mut next_span, &mut spans)?;
381            if spans.len() == span_count_before {
382                warnings.push(Warning {
383                    id: warning_id(next_warning)?,
384                    code: WarningCode::ImageOnlyPage,
385                    message: "page has no extractable text; OCR is required for this page"
386                        .to_string(),
387                    page: Some(page_model.id.clone()),
388                    element_ref: None,
389                    span_ref: None,
390                    region_ref: None,
391                });
392                next_warning += 1;
393            }
394            pages.push(page_model);
395        }
396
397        if spans.is_empty() {
398            return Err(EthosError::new(
399                ErrorCode::OcrRequired,
400                "no extractable text; OCR is required",
401            ));
402        }
403
404        Ok(Extraction {
405            pages,
406            spans,
407            regions: Vec::new(),
408            warnings,
409        })
410    }
411}
412
413fn validate_page_selection(selection: &PageSelection, page_count: u32) -> Result<(), EthosError> {
414    selection.validate_against(page_count).map_err(|_| {
415        EthosError::new(
416            ErrorCode::PageLimitExceeded,
417            "page selection out of document range",
418        )
419    })
420}
421
422fn validate_pdf_header(pdf_bytes: &[u8]) -> Result<(), EthosError> {
423    let window = &pdf_bytes[..pdf_bytes.len().min(1024)];
424    if window.windows(5).any(|w| w == b"%PDF-") {
425        Ok(())
426    } else {
427        Err(EthosError::new(
428            ErrorCode::InvalidPdf,
429            "input does not contain a PDF header",
430        ))
431    }
432}
433
434fn quantize_coord(value: f64) -> Result<i64, EthosError> {
435    quantize(value, QUANTUM_PER_POINT)
436        .map_err(|_| EthosError::new(ErrorCode::InternalError, "coordinate quantization failed"))
437}
438
439fn pixel_extent(points: f64) -> Result<u32, EthosError> {
440    if !points.is_finite() || points <= 0.0 {
441        return Err(EthosError::new(
442            ErrorCode::CorruptPdf,
443            "PDF page has invalid dimensions",
444        ));
445    }
446    if points.ceil() > f64::from(c_int::MAX) {
447        return Err(EthosError::internal("render bitmap dimension overflow"));
448    }
449    Ok(points.ceil() as u32)
450}
451
452fn floor_quantized_pixel(value: i64) -> i64 {
453    value.div_euclid(i64::from(QUANTUM_PER_POINT))
454}
455
456fn ceil_quantized_pixel(value: i64) -> i64 {
457    let quantum = i64::from(QUANTUM_PER_POINT);
458    value
459        .checked_add(quantum - 1)
460        .unwrap_or(i64::MAX)
461        .div_euclid(quantum)
462}
463
/// Clamp a signed pixel coordinate into `0..=max`.
fn clamp_pixel(value: i64, max: u32) -> u32 {
    // Negative inputs collapse to 0. Anything that does not fit in `u32` is
    // necessarily above `max`, so it collapses to `max`.
    let non_negative = value.max(0);
    u32::try_from(non_negative).map_or(max, |pixel| pixel.min(max))
}
467
468fn crop_window(
469    bbox: QRect,
470    page_width_px: u32,
471    page_height_px: u32,
472) -> Result<(u32, u32, u32, u32), EthosError> {
473    let x0 = clamp_pixel(floor_quantized_pixel(bbox.x0), page_width_px);
474    let y0 = clamp_pixel(floor_quantized_pixel(bbox.y0), page_height_px);
475    let x1 = clamp_pixel(ceil_quantized_pixel(bbox.x1), page_width_px);
476    let y1 = clamp_pixel(ceil_quantized_pixel(bbox.y1), page_height_px);
477    if x0 >= x1 || y0 >= y1 {
478        return Err(EthosError::internal(
479            "crop bbox has no positive pixel extent",
480        ));
481    }
482    Ok((x0, y0, x1 - x0, y1 - y0))
483}
484
485fn qrect_from_pdfium_char_box(
486    page_height_pts: f64,
487    left: f64,
488    right: f64,
489    bottom: f64,
490    top: f64,
491) -> Result<QRect, EthosError> {
492    let x0 = left.min(right);
493    let x1 = left.max(right);
494    let y0 = page_height_pts - top.max(bottom);
495    let y1 = page_height_pts - top.min(bottom);
496    QRect::new(
497        quantize_coord(x0)?,
498        quantize_coord(y0)?,
499        quantize_coord(x1)?,
500        quantize_coord(y1)?,
501    )
502    .map_err(|_| EthosError::internal("malformed character bbox"))
503}
504
505fn union_rect(a: QRect, b: QRect) -> QRect {
506    QRect {
507        x0: a.x0.min(b.x0),
508        y0: a.y0.min(b.y0),
509        x1: a.x1.max(b.x1),
510        y1: a.y1.max(b.y1),
511    }
512}
513
514fn map_pdfium_error(code: c_ulong) -> EthosError {
515    match code {
516        4 => EthosError::new(
517            ErrorCode::PasswordProtected,
518            "document is encrypted or password-protected",
519        ),
520        5 => EthosError::new(
521            ErrorCode::UnsupportedPdfFeature,
522            "document uses a restricted security handler",
523        ),
524        3 => EthosError::new(ErrorCode::CorruptPdf, "PDF structure is corrupt"),
525        6 => EthosError::new(ErrorCode::CorruptPdf, "PDF page tree is corrupt"),
526        2 => EthosError::new(ErrorCode::CorruptPdf, "PDF could not be loaded"),
527        _ => EthosError::new(ErrorCode::CorruptPdf, "PDFium could not load the document"),
528    }
529}
530
/// Top-level shape of the embedded deterministic profile JSON.
///
/// Only the `backend` key is declared here; `pinned_pdfium_profile` reads it
/// out after parsing.
#[derive(Debug, Deserialize)]
struct DeterministicProfile {
    // The pinned PDFium backend description.
    backend: PinnedPdfiumBackend,
}
535
/// Pinned PDFium backend description from the deterministic profile.
///
/// `validate_pinned_pdfium_profile` checks every field below against
/// hard-coded expectations before the profile is used.
#[derive(Debug, Deserialize)]
struct PinnedPdfiumBackend {
    // Backend id; must be "pdfium".
    id: String,
    // Backend phase; must be 1.
    phase: u8,
    // Pinned version; must be "chromium/7881". Also used as the default
    // manifest version when no override is configured.
    version: String,
    // Upstream version string; must be "PDFium 151.0.7881.0".
    upstream_version: String,
    // V8 status; must be "disabled".
    v8: String,
    // XFA status; must be "disabled".
    xfa: String,
    // Where the binaries come from and how the release is attested.
    distribution: PinnedPdfiumDistribution,
    // Build flags the pinned binaries are required to have.
    build_flags: PinnedPdfiumBuildFlags,
    // Platform key -> lowercase SHA-256 hex of the release artifact.
    platform_hashes: BTreeMap<String, String>,
    // Platform key -> artifact metadata, including the runtime library hash.
    platform_artifacts: BTreeMap<String, PinnedPdfiumArtifact>,
    // Path of the profile documentation; must be "docs/pdfium-profile.md".
    profile_doc: String,
}
550
/// Distribution metadata for the pinned PDFium binaries.
///
/// Checked by `validate_pinned_pdfium_distribution`.
#[derive(Debug, Deserialize)]
struct PinnedPdfiumDistribution {
    // Binary source; must be "bblanchon/pdfium-binaries".
    source: String,
    // Release page URL; must start with that repository's releases/tag/ URL.
    release_url: String,
    // Publication timestamp; only required to end with 'Z'.
    published_at: String,
    // Attestation file name and digest.
    attestation: PinnedPdfiumAttestation,
}
558
/// Attestation file reference for the pinned PDFium release.
#[derive(Debug, Deserialize)]
struct PinnedPdfiumAttestation {
    // Attestation file name; must be "pdfium-attestation.json".
    name: String,
    // Lowercase SHA-256 hex digest; only its format is checked in this part
    // of the file.
    sha256: String,
}
564
/// Build flags recorded for the pinned PDFium binaries.
///
/// `validate_pinned_pdfium_build_flags` requires `pdf_is_standalone` to be
/// true and every other flag to be false.
#[derive(Debug, Deserialize)]
struct PinnedPdfiumBuildFlags {
    is_component_build: bool,
    is_debug: bool,
    pdf_enable_v8: bool,
    pdf_enable_xfa: bool,
    pdf_is_standalone: bool,
    pdf_use_partition_alloc: bool,
}
574
/// Per-platform release artifact metadata from the pinned profile.
///
/// Checked by `validate_pinned_pdfium_platforms`.
#[derive(Debug, Deserialize)]
struct PinnedPdfiumArtifact {
    // Archive file name, e.g. "pdfium-linux-x64.tgz"; must match the platform.
    name: String,
    // Target OS label: "mac", "linux" or "win".
    target_os: String,
    // Target CPU label: "arm64" or "x64".
    target_cpu: String,
    // Path of the runtime library; only required to be non-empty.
    // NOTE(review): presumably relative to the archive root — confirm.
    runtime_library_path: String,
    // Lowercase SHA-256 hex of the runtime library; compared against the
    // configured library file by `validate_pinned_pdfium_payload`.
    runtime_library_sha256: String,
}
583
584fn pinned_pdfium_profile() -> &'static PinnedPdfiumBackend {
585    PINNED_PDFIUM_PROFILE.get_or_init(|| {
586        let profile: DeterministicProfile = serde_json::from_str(DETERMINISTIC_PROFILE_JSON)
587            .expect("profiles/ethos-deterministic-v1.json is valid JSON");
588        validate_pinned_pdfium_profile(&profile.backend)
589            .expect("profiles/ethos-deterministic-v1.json pins a valid PDFium Phase 1 profile");
590        profile.backend
591    })
592}
593
594fn validate_pinned_pdfium_profile(profile: &PinnedPdfiumBackend) -> Result<(), &'static str> {
595    validate_pinned_pdfium_identity(profile)?;
596    validate_pinned_pdfium_distribution(&profile.distribution)?;
597    validate_pinned_pdfium_build_flags(&profile.build_flags)?;
598    validate_pinned_pdfium_platforms(profile)?;
599    Ok(())
600}
601
602fn validate_pinned_pdfium_identity(profile: &PinnedPdfiumBackend) -> Result<(), &'static str> {
603    if profile.id != "pdfium"
604        || profile.phase != 1
605        || profile.version != "chromium/7881"
606        || profile.upstream_version != "PDFium 151.0.7881.0"
607        || profile.v8 != "disabled"
608        || profile.xfa != "disabled"
609        || profile.profile_doc != "docs/pdfium-profile.md"
610    {
611        return Err("unexpected PDFium profile identity");
612    }
613    Ok(())
614}
615
616fn validate_pinned_pdfium_distribution(
617    distribution: &PinnedPdfiumDistribution,
618) -> Result<(), &'static str> {
619    if distribution.source != "bblanchon/pdfium-binaries"
620        || distribution.attestation.name != "pdfium-attestation.json"
621        || !is_sha256_hex(&distribution.attestation.sha256)
622        || !distribution
623            .release_url
624            .starts_with("https://github.com/bblanchon/pdfium-binaries/releases/tag/")
625        || !distribution.published_at.ends_with('Z')
626    {
627        return Err("unexpected PDFium distribution metadata");
628    }
629    Ok(())
630}
631
632fn validate_pinned_pdfium_build_flags(
633    build_flags: &PinnedPdfiumBuildFlags,
634) -> Result<(), &'static str> {
635    if build_flags.is_component_build
636        || build_flags.is_debug
637        || build_flags.pdf_enable_v8
638        || build_flags.pdf_enable_xfa
639        || !build_flags.pdf_is_standalone
640        || build_flags.pdf_use_partition_alloc
641    {
642        return Err("PDFium Phase 1 must be standalone release with V8/XFA disabled");
643    }
644    Ok(())
645}
646
647fn validate_pinned_pdfium_platforms(profile: &PinnedPdfiumBackend) -> Result<(), &'static str> {
648    for platform in ["macos-arm64", "linux-x64", "windows-x64"] {
649        let artifact_hash = profile
650            .platform_hashes
651            .get(platform)
652            .ok_or("missing PDFium artifact hash")?;
653        if !is_sha256_hex(artifact_hash) {
654            return Err("malformed PDFium artifact hash");
655        }
656        let artifact = profile
657            .platform_artifacts
658            .get(platform)
659            .ok_or("missing PDFium platform artifact metadata")?;
660        if artifact.name.contains("-v8-")
661            || artifact.name.contains("xfa")
662            || !artifact.name.ends_with(".tgz")
663            || artifact.runtime_library_path.is_empty()
664            || !is_sha256_hex(&artifact.runtime_library_sha256)
665        {
666            return Err("malformed PDFium platform artifact metadata");
667        }
668        match platform {
669            "macos-arm64"
670                if artifact.name == "pdfium-mac-arm64.tgz"
671                    && artifact.target_os == "mac"
672                    && artifact.target_cpu == "arm64" => {}
673            "linux-x64"
674                if artifact.name == "pdfium-linux-x64.tgz"
675                    && artifact.target_os == "linux"
676                    && artifact.target_cpu == "x64" => {}
677            "windows-x64"
678                if artifact.name == "pdfium-win-x64.tgz"
679                    && artifact.target_os == "win"
680                    && artifact.target_cpu == "x64" => {}
681            _ => return Err("unexpected PDFium platform artifact"),
682        }
683    }
684    Ok(())
685}
686
/// Whether `value` is a SHA-256 digest written as exactly 64 lowercase
/// hexadecimal characters.
fn is_sha256_hex(value: &str) -> bool {
    const DIGEST_HEX_LEN: usize = 64;

    if value.len() != DIGEST_HEX_LEN {
        return false;
    }
    // Uppercase hex digits are deliberately rejected.
    value
        .bytes()
        .all(|byte| matches!(byte, b'0'..=b'9' | b'a'..=b'f'))
}
693
/// Profile platform key for the target this crate was compiled for, or `None`
/// when the target is not one of the three pinned platforms.
fn current_platform_key() -> Option<&'static str> {
    // `env::consts::{OS, ARCH}` are compile-time constants describing the
    // build target.
    match (env::consts::OS, env::consts::ARCH) {
        ("macos", "aarch64") => Some("macos-arm64"),
        ("linux", "x86_64") => Some("linux-x64"),
        ("windows", "x86_64") => Some("windows-x64"),
        _ => None,
    }
}
705
706fn current_pdfium_pins(
707    profile: &PinnedPdfiumBackend,
708) -> Result<(&'static str, &str, &PinnedPdfiumArtifact), EthosError> {
709    let platform = current_platform_key().ok_or_else(|| {
710        EthosError::internal("pdfium phase 1 profile has no hash for this platform")
711    })?;
712    let artifact_hash = profile.platform_hashes.get(platform).ok_or_else(|| {
713        EthosError::internal("pdfium phase 1 profile has no hash for this platform")
714    })?;
715    let artifact = profile.platform_artifacts.get(platform).ok_or_else(|| {
716        EthosError::internal("pdfium phase 1 profile has no artifact for this platform")
717    })?;
718    Ok((platform, artifact_hash.as_str(), artifact))
719}
720
721fn validate_pinned_pdfium_payload(
722    backend: &PdfiumBackend,
723    library_path: &Path,
724) -> Result<(), EthosError> {
725    let profile = pinned_pdfium_profile();
726    if let Some(version) = backend.configured_version_override() {
727        let upstream_number = profile
728            .upstream_version
729            .strip_prefix("PDFium ")
730            .unwrap_or(&profile.upstream_version);
731        if version != profile.version
732            && version != profile.upstream_version
733            && version != upstream_number
734        {
735            return Err(EthosError::internal(
736                "pdfium version does not match pinned phase 1 profile",
737            ));
738        }
739    }
740
741    let (_, artifact_hash, artifact) = current_pdfium_pins(profile)?;
742    if let Some(artifact_path) = backend.configured_artifact_path() {
743        if !artifact_path.is_file() {
744            return Err(EthosError::internal(
745                "pdfium artifact path does not point to a file",
746            ));
747        }
748        let actual_artifact_hash = sha256_file(&artifact_path)?;
749        if actual_artifact_hash != artifact_hash {
750            return Err(EthosError::internal(
751                "pdfium artifact does not match pinned phase 1 profile",
752            ));
753        }
754    }
755
756    let library_hash = sha256_file(library_path)?;
757    if library_hash != artifact.runtime_library_sha256 {
758        return Err(EthosError::internal(
759            "pdfium library does not match pinned phase 1 profile",
760        ));
761    }
762
763    Ok(())
764}
765
766fn sha256_file(path: &Path) -> Result<String, EthosError> {
767    let bytes =
768        std::fs::read(path).map_err(|_| EthosError::internal("failed to read pdfium payload"))?;
769    Ok(ethos_core::c14n::sha256_hex_bytes(&bytes))
770}
771
// Opaque PDFium handles. They are untyped raw pointers on the Rust side, so
// the compiler cannot tell a document handle from a page handle; the aliases
// exist for readability only.
type FpdfDocument = *mut c_void;
type FpdfPage = *mut c_void;
type FpdfTextPage = *mut c_void;
type FpdfBitmap = *mut c_void;

// Function-pointer signatures for the PDFium C API entry points this crate
// calls. Each alias is declared twice with identical parameters and return
// type: `extern "C"` on non-Windows targets and `extern "system"` on Windows.
// Keep both halves of a pair in sync when changing a signature.

// Library lifecycle.
#[cfg(not(windows))]
type FpdfInitLibrary = unsafe extern "C" fn();
#[cfg(windows)]
type FpdfInitLibrary = unsafe extern "system" fn();
#[cfg(not(windows))]
type FpdfDestroyLibrary = unsafe extern "C" fn();
#[cfg(windows)]
type FpdfDestroyLibrary = unsafe extern "system" fn();

// Document loading and error reporting. The load signature takes a data
// pointer, a byte length and a C string.
// NOTE(review): the C string is presumably the password (nullable) — confirm
// against the PDFium header.
#[cfg(not(windows))]
type FpdfLoadMemDocument64 =
    unsafe extern "C" fn(*const c_void, usize, *const c_char) -> FpdfDocument;
#[cfg(windows)]
type FpdfLoadMemDocument64 =
    unsafe extern "system" fn(*const c_void, usize, *const c_char) -> FpdfDocument;
#[cfg(not(windows))]
type FpdfCloseDocument = unsafe extern "C" fn(FpdfDocument);
#[cfg(windows)]
type FpdfCloseDocument = unsafe extern "system" fn(FpdfDocument);
#[cfg(not(windows))]
type FpdfGetLastError = unsafe extern "C" fn() -> c_ulong;
#[cfg(windows)]
type FpdfGetLastError = unsafe extern "system" fn() -> c_ulong;

// Page access and page metrics.
#[cfg(not(windows))]
type FpdfGetPageCount = unsafe extern "C" fn(FpdfDocument) -> c_int;
#[cfg(windows)]
type FpdfGetPageCount = unsafe extern "system" fn(FpdfDocument) -> c_int;
#[cfg(not(windows))]
type FpdfLoadPage = unsafe extern "C" fn(FpdfDocument, c_int) -> FpdfPage;
#[cfg(windows)]
type FpdfLoadPage = unsafe extern "system" fn(FpdfDocument, c_int) -> FpdfPage;
#[cfg(not(windows))]
type FpdfClosePage = unsafe extern "C" fn(FpdfPage);
#[cfg(windows)]
type FpdfClosePage = unsafe extern "system" fn(FpdfPage);
#[cfg(not(windows))]
type FpdfGetPageWidthF = unsafe extern "C" fn(FpdfPage) -> f32;
#[cfg(windows)]
type FpdfGetPageWidthF = unsafe extern "system" fn(FpdfPage) -> f32;
#[cfg(not(windows))]
type FpdfGetPageHeightF = unsafe extern "C" fn(FpdfPage) -> f32;
#[cfg(windows)]
type FpdfGetPageHeightF = unsafe extern "system" fn(FpdfPage) -> f32;
#[cfg(not(windows))]
type FpdfPageGetRotation = unsafe extern "C" fn(FpdfPage) -> c_int;
#[cfg(windows)]
type FpdfPageGetRotation = unsafe extern "system" fn(FpdfPage) -> c_int;

// Text page lifecycle and per-character queries. The `c_int` argument after
// the text page handle is a character index, except in the rect functions
// below.
#[cfg(not(windows))]
type FpdfTextLoadPage = unsafe extern "C" fn(FpdfPage) -> FpdfTextPage;
#[cfg(windows)]
type FpdfTextLoadPage = unsafe extern "system" fn(FpdfPage) -> FpdfTextPage;
#[cfg(not(windows))]
type FpdfTextClosePage = unsafe extern "C" fn(FpdfTextPage);
#[cfg(windows)]
type FpdfTextClosePage = unsafe extern "system" fn(FpdfTextPage);
#[cfg(not(windows))]
type FpdfTextCountChars = unsafe extern "C" fn(FpdfTextPage) -> c_int;
#[cfg(windows)]
type FpdfTextCountChars = unsafe extern "system" fn(FpdfTextPage) -> c_int;
#[cfg(not(windows))]
type FpdfTextGetUnicode = unsafe extern "C" fn(FpdfTextPage, c_int) -> u32;
#[cfg(windows)]
type FpdfTextGetUnicode = unsafe extern "system" fn(FpdfTextPage, c_int) -> u32;
// Four `f64` out-parameters.
// NOTE(review): callers elsewhere in this file pass char-box values as
// left, right, bottom, top — confirm the out-parameter order against the
// PDFium header.
#[cfg(not(windows))]
type FpdfTextGetCharBox =
    unsafe extern "C" fn(FpdfTextPage, c_int, *mut f64, *mut f64, *mut f64, *mut f64) -> c_int;
#[cfg(windows)]
type FpdfTextGetCharBox =
    unsafe extern "system" fn(FpdfTextPage, c_int, *mut f64, *mut f64, *mut f64, *mut f64) -> c_int;
// Optional geometry symbols; their availability is reported in
// `GeometryProbeSymbols`.
#[cfg(not(windows))]
type FpdfTextGetLooseCharBox = unsafe extern "C" fn(FpdfTextPage, c_int, *mut FsRectF) -> c_int;
#[cfg(windows)]
type FpdfTextGetLooseCharBox =
    unsafe extern "system" fn(FpdfTextPage, c_int, *mut FsRectF) -> c_int;
#[cfg(not(windows))]
type FpdfTextGetCharOrigin = unsafe extern "C" fn(FpdfTextPage, c_int, *mut f64, *mut f64) -> c_int;
#[cfg(windows)]
type FpdfTextGetCharOrigin =
    unsafe extern "system" fn(FpdfTextPage, c_int, *mut f64, *mut f64) -> c_int;
// NOTE(review): the two `c_int` arguments are presumably a start index and a
// character count, and `FpdfTextGetRect` presumably takes a rect index —
// confirm against the PDFium header.
#[cfg(not(windows))]
type FpdfTextCountRects = unsafe extern "C" fn(FpdfTextPage, c_int, c_int) -> c_int;
#[cfg(windows)]
type FpdfTextCountRects = unsafe extern "system" fn(FpdfTextPage, c_int, c_int) -> c_int;
#[cfg(not(windows))]
type FpdfTextGetRect =
    unsafe extern "C" fn(FpdfTextPage, c_int, *mut f64, *mut f64, *mut f64, *mut f64) -> c_int;
#[cfg(windows)]
type FpdfTextGetRect =
    unsafe extern "system" fn(FpdfTextPage, c_int, *mut f64, *mut f64, *mut f64, *mut f64) -> c_int;
#[cfg(not(windows))]
type FpdfTextGetFontSize = unsafe extern "C" fn(FpdfTextPage, c_int) -> f64;
#[cfg(windows)]
type FpdfTextGetFontSize = unsafe extern "system" fn(FpdfTextPage, c_int) -> f64;
869#[cfg(not(windows))]
870type FpdfTextGetFontInfo =
871    unsafe extern "C" fn(FpdfTextPage, c_int, *mut c_void, c_ulong, *mut c_int) -> c_ulong;
872#[cfg(windows)]
873type FpdfTextGetFontInfo =
874    unsafe extern "system" fn(FpdfTextPage, c_int, *mut c_void, c_ulong, *mut c_int) -> c_ulong;
875#[cfg(not(windows))]
876type FpdfTextIsGenerated = unsafe extern "C" fn(FpdfTextPage, c_int) -> c_int;
877#[cfg(windows)]
878type FpdfTextIsGenerated = unsafe extern "system" fn(FpdfTextPage, c_int) -> c_int;
879#[cfg(not(windows))]
880type FpdfTextIsHyphen = unsafe extern "C" fn(FpdfTextPage, c_int) -> c_int;
881#[cfg(windows)]
882type FpdfTextIsHyphen = unsafe extern "system" fn(FpdfTextPage, c_int) -> c_int;
883#[cfg(not(windows))]
884type FpdfBitmapCreate = unsafe extern "C" fn(c_int, c_int, c_int) -> FpdfBitmap;
885#[cfg(windows)]
886type FpdfBitmapCreate = unsafe extern "system" fn(c_int, c_int, c_int) -> FpdfBitmap;
887#[cfg(not(windows))]
888type FpdfBitmapDestroy = unsafe extern "C" fn(FpdfBitmap);
889#[cfg(windows)]
890type FpdfBitmapDestroy = unsafe extern "system" fn(FpdfBitmap);
891#[cfg(not(windows))]
892type FpdfBitmapFillRect = unsafe extern "C" fn(FpdfBitmap, c_int, c_int, c_int, c_int, c_ulong);
893#[cfg(windows)]
894type FpdfBitmapFillRect =
895    unsafe extern "system" fn(FpdfBitmap, c_int, c_int, c_int, c_int, c_ulong);
896#[cfg(not(windows))]
897type FpdfBitmapGetBuffer = unsafe extern "C" fn(FpdfBitmap) -> *mut c_void;
898#[cfg(windows)]
899type FpdfBitmapGetBuffer = unsafe extern "system" fn(FpdfBitmap) -> *mut c_void;
900#[cfg(not(windows))]
901type FpdfBitmapGetStride = unsafe extern "C" fn(FpdfBitmap) -> c_int;
902#[cfg(windows)]
903type FpdfBitmapGetStride = unsafe extern "system" fn(FpdfBitmap) -> c_int;
904#[cfg(not(windows))]
905type FpdfRenderPageBitmap =
906    unsafe extern "C" fn(FpdfBitmap, FpdfPage, c_int, c_int, c_int, c_int, c_int, c_int);
907#[cfg(windows)]
908type FpdfRenderPageBitmap =
909    unsafe extern "system" fn(FpdfBitmap, FpdfPage, c_int, c_int, c_int, c_int, c_int, c_int);
910
/// Rectangle of four `f32` edges with C field layout.
///
/// Passed by pointer as the out-parameter of `FPDFText_GetLooseCharBox`, so the field
/// order must not change.
// NOTE(review): presumably mirrors PDFium's `FS_RECTF` — confirm against the PDFium headers.
#[repr(C)]
#[derive(Clone, Copy, Debug, Default)]
struct FsRectF {
    left: f32,
    top: f32,
    right: f32,
    bottom: f32,
}
919
/// Table of PDFium entry points resolved from the loaded dynamic library.
///
/// Plain function-pointer fields are looked up with `symbol(..)?` in
/// [`PdfiumFunctions::load`], so loading fails when one of them cannot be resolved.
/// `Option` fields are looked up with `optional_symbol(..)`; code using them has to handle
/// `None` (text geometry extras, font info, generated/hyphen flags, bitmap rendering).
#[derive(Clone, Copy)]
struct PdfiumFunctions {
    // Library and document lifecycle.
    init_library: FpdfInitLibrary,
    destroy_library: FpdfDestroyLibrary,
    load_mem_document64: FpdfLoadMemDocument64,
    close_document: FpdfCloseDocument,
    get_last_error: FpdfGetLastError,
    // Page access.
    get_page_count: FpdfGetPageCount,
    load_page: FpdfLoadPage,
    close_page: FpdfClosePage,
    get_page_width_f: FpdfGetPageWidthF,
    get_page_height_f: FpdfGetPageHeightF,
    page_get_rotation: Option<FpdfPageGetRotation>,
    // Text-page access.
    text_load_page: FpdfTextLoadPage,
    text_close_page: FpdfTextClosePage,
    text_count_chars: FpdfTextCountChars,
    text_get_unicode: FpdfTextGetUnicode,
    text_get_char_box: FpdfTextGetCharBox,
    text_get_loose_char_box: Option<FpdfTextGetLooseCharBox>,
    text_get_char_origin: Option<FpdfTextGetCharOrigin>,
    text_count_rects: Option<FpdfTextCountRects>,
    text_get_rect: Option<FpdfTextGetRect>,
    text_get_font_size: FpdfTextGetFontSize,
    text_get_font_info: Option<FpdfTextGetFontInfo>,
    text_is_generated: Option<FpdfTextIsGenerated>,
    text_is_hyphen: Option<FpdfTextIsHyphen>,
    // Bitmap rendering; every entry is optional.
    bitmap_create: Option<FpdfBitmapCreate>,
    bitmap_destroy: Option<FpdfBitmapDestroy>,
    bitmap_fill_rect: Option<FpdfBitmapFillRect>,
    bitmap_get_buffer: Option<FpdfBitmapGetBuffer>,
    bitmap_get_stride: Option<FpdfBitmapGetStride>,
    render_page_bitmap: Option<FpdfRenderPageBitmap>,
}
953
954impl PdfiumFunctions {
955    fn load(library: &dylib::Library) -> Result<Self, EthosError> {
956        // SAFETY: symbols are loaded from the configured PDFium dynamic library and
957        // immediately copied into typed function pointers matching the C API.
958        unsafe {
959            Ok(PdfiumFunctions {
960                init_library: library.symbol(b"FPDF_InitLibrary\0")?,
961                destroy_library: library.symbol(b"FPDF_DestroyLibrary\0")?,
962                load_mem_document64: library.symbol(b"FPDF_LoadMemDocument64\0")?,
963                close_document: library.symbol(b"FPDF_CloseDocument\0")?,
964                get_last_error: library.symbol(b"FPDF_GetLastError\0")?,
965                get_page_count: library.symbol(b"FPDF_GetPageCount\0")?,
966                load_page: library.symbol(b"FPDF_LoadPage\0")?,
967                close_page: library.symbol(b"FPDF_ClosePage\0")?,
968                get_page_width_f: library.symbol(b"FPDF_GetPageWidthF\0")?,
969                get_page_height_f: library.symbol(b"FPDF_GetPageHeightF\0")?,
970                page_get_rotation: library.optional_symbol(b"FPDFPage_GetRotation\0"),
971                text_load_page: library.symbol(b"FPDFText_LoadPage\0")?,
972                text_close_page: library.symbol(b"FPDFText_ClosePage\0")?,
973                text_count_chars: library.symbol(b"FPDFText_CountChars\0")?,
974                text_get_unicode: library.symbol(b"FPDFText_GetUnicode\0")?,
975                text_get_char_box: library.symbol(b"FPDFText_GetCharBox\0")?,
976                text_get_loose_char_box: library.optional_symbol(b"FPDFText_GetLooseCharBox\0"),
977                text_get_char_origin: library.optional_symbol(b"FPDFText_GetCharOrigin\0"),
978                text_count_rects: library.optional_symbol(b"FPDFText_CountRects\0"),
979                text_get_rect: library.optional_symbol(b"FPDFText_GetRect\0"),
980                text_get_font_size: library.symbol(b"FPDFText_GetFontSize\0")?,
981                text_get_font_info: library.optional_symbol(b"FPDFText_GetFontInfo\0"),
982                text_is_generated: library.optional_symbol(b"FPDFText_IsGenerated\0"),
983                text_is_hyphen: library.optional_symbol(b"FPDFText_IsHyphen\0"),
984                bitmap_create: library.optional_symbol(b"FPDFBitmap_Create\0"),
985                bitmap_destroy: library.optional_symbol(b"FPDFBitmap_Destroy\0"),
986                bitmap_fill_rect: library.optional_symbol(b"FPDFBitmap_FillRect\0"),
987                bitmap_get_buffer: library.optional_symbol(b"FPDFBitmap_GetBuffer\0"),
988                bitmap_get_stride: library.optional_symbol(b"FPDFBitmap_GetStride\0"),
989                render_page_bitmap: library.optional_symbol(b"FPDF_RenderPageBitmap\0"),
990            })
991        }
992    }
993
994    fn geometry_probe_symbols(self) -> GeometryProbeSymbols {
995        GeometryProbeSymbols {
996            char_origin: self.text_get_char_origin.is_some(),
997            loose_char_box: self.text_get_loose_char_box.is_some(),
998            text_rects: self.text_count_rects.is_some() && self.text_get_rect.is_some(),
999        }
1000    }
1001}
1002
/// An opened and initialized PDFium library together with its resolved entry points.
struct PdfiumRuntime {
    // Held only to keep the dynamic library open for as long as `funcs` may be called;
    // never read directly.
    _library: dylib::Library,
    // Function pointers resolved from `_library`.
    funcs: PdfiumFunctions,
    // Set once `FPDF_InitLibrary` has been called; `Drop` calls `FPDF_DestroyLibrary`
    // only when this is true.
    initialized: bool,
}
1008
1009impl PdfiumRuntime {
1010    fn load(backend: &PdfiumBackend) -> Result<Self, EthosError> {
1011        let path = backend.configured_library_path().ok_or_else(|| {
1012            EthosError::internal(format!(
1013                "pdfium library path is not configured; set {PDFIUM_LIBRARY_PATH_ENV}"
1014            ))
1015        })?;
1016        if !path.is_file() {
1017            return Err(EthosError::internal(
1018                "pdfium library path does not point to a file",
1019            ));
1020        }
1021        validate_pinned_pdfium_payload(backend, &path)?;
1022
1023        let library = dylib::Library::open(&path)?;
1024        let funcs = PdfiumFunctions::load(&library)?;
1025        // SAFETY: FPDF_InitLibrary initializes process-global PDFium state. Calls are
1026        // serialized by PDFIUM_LOCK.
1027        unsafe { (funcs.init_library)() };
1028        Ok(PdfiumRuntime {
1029            _library: library,
1030            funcs,
1031            initialized: true,
1032        })
1033    }
1034
1035    fn load_document<'a>(&'a self, pdf_bytes: &[u8]) -> Result<PdfDocument<'a>, EthosError> {
1036        // SAFETY: PDFium reads the immutable byte slice only for the duration of
1037        // FPDF_LoadMemDocument64. The returned document is closed by PdfDocument::drop.
1038        let handle = unsafe {
1039            (self.funcs.load_mem_document64)(
1040                pdf_bytes.as_ptr().cast(),
1041                pdf_bytes.len(),
1042                ptr::null(),
1043            )
1044        };
1045        if handle.is_null() {
1046            // SAFETY: FPDF_GetLastError has no preconditions after a failed load.
1047            let code = unsafe { (self.funcs.get_last_error)() };
1048            Err(map_pdfium_error(code))
1049        } else {
1050            Ok(PdfDocument {
1051                funcs: &self.funcs,
1052                handle,
1053            })
1054        }
1055    }
1056}
1057
1058impl Drop for PdfiumRuntime {
1059    fn drop(&mut self) {
1060        if self.initialized {
1061            // SAFETY: paired with FPDF_InitLibrary above and serialized by PDFIUM_LOCK.
1062            unsafe { (self.funcs.destroy_library)() };
1063        }
1064    }
1065}
1066
/// Owning wrapper around an open PDFium document handle.
///
/// The handle is closed exactly once, in the `Drop` implementation.
struct PdfDocument<'a> {
    // Entry points borrowed from the runtime that opened the document.
    funcs: &'a PdfiumFunctions,
    // Non-null document handle returned by `FPDF_LoadMemDocument64`.
    handle: FpdfDocument,
}
1071
1072impl PdfDocument<'_> {
1073    fn page_count(&self) -> Result<u32, EthosError> {
1074        // SAFETY: handle is a live FPDF_DOCUMENT owned by self.
1075        let count = unsafe { (self.funcs.get_page_count)(self.handle) };
1076        if count <= 0 {
1077            return Err(EthosError::new(
1078                ErrorCode::CorruptPdf,
1079                "PDF has no readable pages",
1080            ));
1081        }
1082        u32::try_from(count).map_err(|_| EthosError::internal("page count overflow"))
1083    }
1084
1085    fn load_page(&self, page_index: u32) -> Result<PdfPage<'_>, EthosError> {
1086        let index =
1087            c_int::try_from(page_index).map_err(|_| EthosError::internal("page overflow"))?;
1088        // SAFETY: handle is live and index has been bounded by the caller.
1089        let handle = unsafe { (self.funcs.load_page)(self.handle, index) };
1090        if handle.is_null() {
1091            // SAFETY: FPDF_GetLastError has no preconditions after a failed page load.
1092            let code = unsafe { (self.funcs.get_last_error)() };
1093            Err(map_pdfium_error(code))
1094        } else {
1095            Ok(PdfPage {
1096                funcs: self.funcs,
1097                handle,
1098            })
1099        }
1100    }
1101}
1102
1103impl Drop for PdfDocument<'_> {
1104    fn drop(&mut self) {
1105        // SAFETY: handle is a live FPDF_DOCUMENT and is closed exactly once here.
1106        unsafe { (self.funcs.close_document)(self.handle) };
1107    }
1108}
1109
/// Owning wrapper around a loaded PDFium page handle.
///
/// The handle is closed exactly once, in the `Drop` implementation.
struct PdfPage<'a> {
    // Entry points shared with the document that loaded this page.
    funcs: &'a PdfiumFunctions,
    // Non-null page handle returned by `FPDF_LoadPage`.
    handle: FpdfPage,
}
1114
1115impl PdfPage<'_> {
1116    fn width_pts(&self) -> f64 {
1117        // SAFETY: handle is a live FPDF_PAGE.
1118        unsafe { (self.funcs.get_page_width_f)(self.handle) as f64 }
1119    }
1120
1121    fn height_pts(&self) -> f64 {
1122        // SAFETY: handle is a live FPDF_PAGE.
1123        unsafe { (self.funcs.get_page_height_f)(self.handle) as f64 }
1124    }
1125
1126    fn rotation(&self) -> u16 {
1127        let Some(get_rotation) = self.funcs.page_get_rotation else {
1128            return 0;
1129        };
1130        // SAFETY: handle is a live FPDF_PAGE.
1131        match unsafe { get_rotation(self.handle) }.rem_euclid(4) {
1132            1 => 90,
1133            2 => 180,
1134            3 => 270,
1135            _ => 0,
1136        }
1137    }
1138
1139    fn model_page(&self, original_page: u32) -> Result<Page, EthosError> {
1140        Ok(Page {
1141            id: page_id(original_page)?,
1142            index: original_page,
1143            width: quantize_coord(self.width_pts())?,
1144            height: quantize_coord(self.height_pts())?,
1145            rotation: self.rotation(),
1146        })
1147    }
1148
1149    fn geometry_probe_page(&self, original_page: u32) -> Result<GeometryProbePage, EthosError> {
1150        let page = self.model_page(original_page)?;
1151        // SAFETY: handle is a live FPDF_PAGE. Text page is closed by PdfTextPage::drop.
1152        let text_handle = unsafe { (self.funcs.text_load_page)(self.handle) };
1153        if text_handle.is_null() {
1154            return Ok(GeometryProbePage {
1155                id: page.id,
1156                index: page.index,
1157                width: page.width,
1158                height: page.height,
1159                rotation: page.rotation,
1160                char_count: 0,
1161                symbols: self.funcs.geometry_probe_symbols(),
1162                chars: Vec::new(),
1163                runs: Vec::new(),
1164            });
1165        }
1166        let text_page = PdfTextPage {
1167            funcs: self.funcs,
1168            handle: text_handle,
1169        };
1170        text_page.geometry_probe(&page, self.height_pts())
1171    }
1172
1173    fn extract_text_spans(
1174        &self,
1175        page: &Page,
1176        next_span: &mut u32,
1177        spans: &mut Vec<Span>,
1178    ) -> Result<(), EthosError> {
1179        // SAFETY: handle is a live FPDF_PAGE. Text page is closed by PdfTextPage::drop.
1180        let text_handle = unsafe { (self.funcs.text_load_page)(self.handle) };
1181        if text_handle.is_null() {
1182            return Ok(());
1183        }
1184        let text_page = PdfTextPage {
1185            funcs: self.funcs,
1186            handle: text_handle,
1187        };
1188        text_page.extract_runs(page, self.height_pts(), next_span, spans)
1189    }
1190
1191    fn render_crop_raw(&self, page_index: u32, bbox: QRect) -> Result<RawCrop, EthosError> {
1192        let bitmap = RenderBitmap::render_page(
1193            self.funcs,
1194            self.handle,
1195            pixel_extent(self.width_pts())?,
1196            pixel_extent(self.height_pts())?,
1197        )?;
1198        let (x0, y0, width_px, height_px) = crop_window(bbox, bitmap.width_px, bitmap.height_px)?;
1199        let bytes = bitmap.crop_bytes(x0, y0, width_px, height_px)?;
1200        Ok(RawCrop {
1201            page_index,
1202            bbox,
1203            width_px,
1204            height_px,
1205            stride: width_px
1206                .checked_mul(4)
1207                .ok_or_else(|| EthosError::internal("crop stride overflow"))?,
1208            pixel_format: "bgra_8u",
1209            sha256: ethos_core::c14n::sha256_hex_bytes(&bytes),
1210            bytes,
1211        })
1212    }
1213}
1214
1215impl Drop for PdfPage<'_> {
1216    fn drop(&mut self) {
1217        // SAFETY: handle is a live FPDF_PAGE and is closed exactly once here.
1218        unsafe { (self.funcs.close_page)(self.handle) };
1219    }
1220}
1221
/// Wrapper around a PDFium text-page handle obtained from `FPDFText_LoadPage`.
// NOTE(review): call sites state the handle is closed by `PdfTextPage::drop`; that impl is
// not next to this definition — confirm it exists.
struct PdfTextPage<'a> {
    // Entry points shared with the page that produced this text page.
    funcs: &'a PdfiumFunctions,
    // Non-null text-page handle.
    handle: FpdfTextPage,
}
1226
/// A PDFium bitmap that a page has been rendered into.
struct RenderBitmap<'a> {
    // Entry points used to read and destroy the bitmap.
    funcs: &'a PdfiumFunctions,
    // Non-null bitmap handle returned by `FPDFBitmap_Create`.
    handle: FpdfBitmap,
    // Dimensions the bitmap was created with, in pixels.
    width_px: u32,
    height_px: u32,
    // Stride reported by `FPDFBitmap_GetStride`; 0 until `render_page` has read it.
    // `crop_bytes` uses it as the byte distance between consecutive rows.
    stride: usize,
}
1234
1235impl RenderBitmap<'_> {
1236    fn render_page(
1237        funcs: &PdfiumFunctions,
1238        page: FpdfPage,
1239        width_px: u32,
1240        height_px: u32,
1241    ) -> Result<RenderBitmap<'_>, EthosError> {
1242        let Some(bitmap_create) = funcs.bitmap_create else {
1243            return Err(EthosError::internal(
1244                "pdfium library is missing bitmap render symbols",
1245            ));
1246        };
1247        let Some(bitmap_fill_rect) = funcs.bitmap_fill_rect else {
1248            return Err(EthosError::internal(
1249                "pdfium library is missing bitmap render symbols",
1250            ));
1251        };
1252        let Some(render_page_bitmap) = funcs.render_page_bitmap else {
1253            return Err(EthosError::internal(
1254                "pdfium library is missing bitmap render symbols",
1255            ));
1256        };
1257        let width = c_int::try_from(width_px)
1258            .map_err(|_| EthosError::internal("render bitmap width overflow"))?;
1259        let height = c_int::try_from(height_px)
1260            .map_err(|_| EthosError::internal("render bitmap height overflow"))?;
1261
1262        // SAFETY: width/height are positive bounded c_int values. Bitmap is destroyed by Drop.
1263        let handle = unsafe { bitmap_create(width, height, 1) };
1264        if handle.is_null() {
1265            return Err(EthosError::internal(
1266                "pdfium failed to allocate render bitmap",
1267            ));
1268        }
1269        let mut bitmap = RenderBitmap {
1270            funcs,
1271            handle,
1272            width_px,
1273            height_px,
1274            stride: 0,
1275        };
1276        // SAFETY: handle is a live bitmap. Fill with opaque white for deterministic background.
1277        unsafe { bitmap_fill_rect(bitmap.handle, 0, 0, width, height, 0xFFFF_FFFF) };
1278        // SAFETY: handle and page are live. Render uses no callbacks and writes into the bitmap.
1279        unsafe { render_page_bitmap(bitmap.handle, page, 0, 0, width, height, 0, 0) };
1280        bitmap.stride = bitmap.read_stride()?;
1281        Ok(bitmap)
1282    }
1283
1284    fn read_stride(&self) -> Result<usize, EthosError> {
1285        let Some(bitmap_get_stride) = self.funcs.bitmap_get_stride else {
1286            return Err(EthosError::internal(
1287                "pdfium library is missing bitmap render symbols",
1288            ));
1289        };
1290        // SAFETY: handle is a live bitmap.
1291        let stride = unsafe { bitmap_get_stride(self.handle) };
1292        if stride <= 0 {
1293            return Err(EthosError::internal(
1294                "pdfium render bitmap has invalid stride",
1295            ));
1296        }
1297        usize::try_from(stride).map_err(|_| EthosError::internal("render bitmap stride overflow"))
1298    }
1299
1300    fn crop_bytes(
1301        &self,
1302        x0: u32,
1303        y0: u32,
1304        width_px: u32,
1305        height_px: u32,
1306    ) -> Result<Vec<u8>, EthosError> {
1307        let Some(bitmap_get_buffer) = self.funcs.bitmap_get_buffer else {
1308            return Err(EthosError::internal(
1309                "pdfium library is missing bitmap render symbols",
1310            ));
1311        };
1312        // SAFETY: handle is a live bitmap.
1313        let ptr = unsafe { bitmap_get_buffer(self.handle) };
1314        if ptr.is_null() {
1315            return Err(EthosError::internal("pdfium render bitmap has null buffer"));
1316        }
1317        let full_len = self
1318            .stride
1319            .checked_mul(
1320                usize::try_from(self.height_px)
1321                    .map_err(|_| EthosError::internal("render bitmap height overflow"))?,
1322            )
1323            .ok_or_else(|| EthosError::internal("render bitmap buffer length overflow"))?;
1324        // SAFETY: PDFium owns a live bitmap buffer of stride * height bytes for this bitmap.
1325        let full = unsafe { slice::from_raw_parts(ptr.cast::<u8>(), full_len) };
1326
1327        let x0 = usize::try_from(x0).map_err(|_| EthosError::internal("crop x overflow"))?;
1328        let y0 = usize::try_from(y0).map_err(|_| EthosError::internal("crop y overflow"))?;
1329        let width =
1330            usize::try_from(width_px).map_err(|_| EthosError::internal("crop width overflow"))?;
1331        let height =
1332            usize::try_from(height_px).map_err(|_| EthosError::internal("crop height overflow"))?;
1333        let row_bytes = width
1334            .checked_mul(4)
1335            .ok_or_else(|| EthosError::internal("crop row width overflow"))?;
1336        let mut out = Vec::with_capacity(
1337            row_bytes
1338                .checked_mul(height)
1339                .ok_or_else(|| EthosError::internal("crop buffer length overflow"))?,
1340        );
1341        for row in 0..height {
1342            let src_start = y0
1343                .checked_add(row)
1344                .and_then(|y| y.checked_mul(self.stride))
1345                .and_then(|base| base.checked_add(x0.checked_mul(4)?))
1346                .ok_or_else(|| EthosError::internal("crop source offset overflow"))?;
1347            let src_end = src_start
1348                .checked_add(row_bytes)
1349                .ok_or_else(|| EthosError::internal("crop source row overflow"))?;
1350            if src_end > full.len() {
1351                return Err(EthosError::internal(
1352                    "crop source row exceeds render bitmap",
1353                ));
1354            }
1355            out.extend_from_slice(&full[src_start..src_end]);
1356        }
1357        Ok(out)
1358    }
1359}
1360
1361impl Drop for RenderBitmap<'_> {
1362    fn drop(&mut self) {
1363        if let Some(bitmap_destroy) = self.funcs.bitmap_destroy {
1364            // SAFETY: handle is a live FPDF_BITMAP and is destroyed exactly once here.
1365            unsafe { bitmap_destroy(self.handle) };
1366        }
1367    }
1368}
1369
1370impl PdfTextPage<'_> {
1371    fn geometry_probe(
1372        &self,
1373        page: &Page,
1374        page_height_pts: f64,
1375    ) -> Result<GeometryProbePage, EthosError> {
1376        // SAFETY: handle is a live FPDF_TEXTPAGE.
1377        let count = unsafe { (self.funcs.text_count_chars)(self.handle) };
1378        if count < 0 {
1379            return Err(EthosError::new(
1380                ErrorCode::CorruptPdf,
1381                "PDF text page could not be read",
1382            ));
1383        }
1384
1385        let mut chars = Vec::new();
1386        let mut run = GeometryRunBuilder::default();
1387        let mut runs = Vec::new();
1388        let mut next_run = 1u32;
1389        for index in 0..count {
1390            let record = self.geometry_probe_char(index, page_height_pts)?;
1391            match record.parser_action.as_str() {
1392                "include" => {
1393                    if run.has_style_change(&record.font_id, record.font_size_q, record.font_flags)
1394                    {
1395                        run.flush(self, page_height_pts, &mut next_run, &mut runs)?;
1396                    }
1397                    run.push(&record);
1398                }
1399                "skip_generated_hyphen" => {}
1400                _ => run.flush(self, page_height_pts, &mut next_run, &mut runs)?,
1401            }
1402            chars.push(record);
1403        }
1404        run.flush(self, page_height_pts, &mut next_run, &mut runs)?;
1405
1406        Ok(GeometryProbePage {
1407            id: page.id.clone(),
1408            index: page.index,
1409            width: page.width,
1410            height: page.height,
1411            rotation: page.rotation,
1412            char_count: count,
1413            symbols: self.funcs.geometry_probe_symbols(),
1414            chars,
1415            runs,
1416        })
1417    }
1418
1419    fn geometry_probe_char(
1420        &self,
1421        index: c_int,
1422        page_height_pts: f64,
1423    ) -> Result<GeometryProbeChar, EthosError> {
1424        // SAFETY: index is in range for this text page.
1425        let unicode = unsafe { (self.funcs.text_get_unicode)(self.handle, index) };
1426        let ch = char::from_u32(unicode);
1427        let parser_action = match ch {
1428            None => "break_invalid_unicode",
1429            Some(_) if self.is_generated_hyphen(index) => "skip_generated_hyphen",
1430            Some(ch) if should_break_text_run(ch) => "break_whitespace_or_control",
1431            Some(_) => "include",
1432        };
1433
1434        let font_info = self.font_info(index);
1435        Ok(GeometryProbeChar {
1436            index,
1437            unicode,
1438            text: ch.map(|ch| ch.to_string()),
1439            parser_action: parser_action.to_string(),
1440            char_box: self.char_bbox(index, page_height_pts)?,
1441            loose_char_box: self.loose_char_bbox(index, page_height_pts)?,
1442            char_origin: self.char_origin(index, page_height_pts)?,
1443            font_id: font_info.font_id,
1444            font_flags: font_info.font_flags,
1445            font_size_q: self.font_size_q(index),
1446        })
1447    }
1448
1449    fn extract_runs(
1450        &self,
1451        page: &Page,
1452        page_height_pts: f64,
1453        next_span: &mut u32,
1454        spans: &mut Vec<Span>,
1455    ) -> Result<(), EthosError> {
1456        // SAFETY: handle is a live FPDF_TEXTPAGE.
1457        let count = unsafe { (self.funcs.text_count_chars)(self.handle) };
1458        if count < 0 {
1459            // A PDFium text-page failure invalidates extraction for the whole document.
1460            // Treating it as image-only would hide a backend read failure behind OCR fallback.
1461            return Err(EthosError::new(
1462                ErrorCode::CorruptPdf,
1463                "PDF text page could not be read",
1464            ));
1465        }
1466        if count == 0 {
1467            return Ok(());
1468        }
1469
1470        let mut run = SpanRun::default();
1471        for index in 0..count {
1472            // SAFETY: index is in 0..count for this text page.
1473            let codepoint = unsafe { (self.funcs.text_get_unicode)(self.handle, index) };
1474            let Some(ch) = char::from_u32(codepoint) else {
1475                run.flush(page, next_span, spans)?;
1476                continue;
1477            };
1478            if self.is_generated_hyphen(index) {
1479                continue;
1480            }
1481            if should_break_text_run(ch) {
1482                run.flush(page, next_span, spans)?;
1483                continue;
1484            }
1485
1486            let Some(bbox) = self.char_bbox(index, page_height_pts)? else {
1487                run.flush(page, next_span, spans)?;
1488                continue;
1489            };
1490            let font_size_q = self.font_size_q(index);
1491            let font_info = self.font_info(index);
1492            if run.has_style_change(&font_info.font_id, font_size_q) {
1493                run.flush(page, next_span, spans)?;
1494            }
1495            let origin = self.char_origin(index, page_height_pts)?;
1496            run.push(ch, bbox, origin, font_info.font_id, font_size_q);
1497        }
1498        run.flush(page, next_span, spans)
1499    }
1500
1501    fn char_bbox(&self, index: c_int, page_height_pts: f64) -> Result<Option<QRect>, EthosError> {
1502        let mut left = 0.0f64;
1503        let mut right = 0.0f64;
1504        let mut bottom = 0.0f64;
1505        let mut top = 0.0f64;
1506        // SAFETY: all pointers refer to initialized local f64 values and index is in range.
1507        let ok = unsafe {
1508            (self.funcs.text_get_char_box)(
1509                self.handle,
1510                index,
1511                &mut left,
1512                &mut right,
1513                &mut bottom,
1514                &mut top,
1515            )
1516        };
1517        if ok == 0 {
1518            return Ok(None);
1519        }
1520        Ok(Some(qrect_from_pdfium_char_box(
1521            page_height_pts,
1522            left,
1523            right,
1524            bottom,
1525            top,
1526        )?))
1527    }
1528
1529    fn loose_char_bbox(
1530        &self,
1531        index: c_int,
1532        page_height_pts: f64,
1533    ) -> Result<Option<QRect>, EthosError> {
1534        let Some(get_loose_char_box) = self.funcs.text_get_loose_char_box else {
1535            return Ok(None);
1536        };
1537        let mut rect = FsRectF::default();
1538        // SAFETY: rect points to initialized writable storage and index is in range.
1539        let ok = unsafe { get_loose_char_box(self.handle, index, &mut rect) };
1540        if ok == 0 {
1541            return Ok(None);
1542        }
1543        Ok(Some(qrect_from_pdfium_char_box(
1544            page_height_pts,
1545            f64::from(rect.left),
1546            f64::from(rect.right),
1547            f64::from(rect.bottom),
1548            f64::from(rect.top),
1549        )?))
1550    }
1551
1552    fn char_origin(
1553        &self,
1554        index: c_int,
1555        page_height_pts: f64,
1556    ) -> Result<Option<[i64; 2]>, EthosError> {
1557        let Some(get_char_origin) = self.funcs.text_get_char_origin else {
1558            return Ok(None);
1559        };
1560        let mut x = 0.0f64;
1561        let mut y = 0.0f64;
1562        // SAFETY: pointers refer to initialized writable f64 values and index is in range.
1563        let ok = unsafe { get_char_origin(self.handle, index, &mut x, &mut y) };
1564        if ok == 0 {
1565            return Ok(None);
1566        }
1567        Ok(Some([
1568            quantize_coord(x)?,
1569            quantize_coord(page_height_pts - y)?,
1570        ]))
1571    }
1572
1573    fn text_rects(
1574        &self,
1575        char_start: c_int,
1576        char_count: c_int,
1577        page_height_pts: f64,
1578    ) -> Result<Vec<QRect>, EthosError> {
1579        let (Some(count_rects), Some(get_rect)) =
1580            (self.funcs.text_count_rects, self.funcs.text_get_rect)
1581        else {
1582            return Ok(Vec::new());
1583        };
1584        if char_count <= 0 {
1585            return Ok(Vec::new());
1586        }
1587        // SAFETY: char_start/char_count identify a range observed from this text page.
1588        let rect_count = unsafe { count_rects(self.handle, char_start, char_count) };
1589        if rect_count <= 0 {
1590            return Ok(Vec::new());
1591        }
1592        let mut rects = Vec::new();
1593        for rect_index in 0..rect_count {
1594            let mut left = 0.0f64;
1595            let mut top = 0.0f64;
1596            let mut right = 0.0f64;
1597            let mut bottom = 0.0f64;
1598            // SAFETY: pointers refer to initialized writable f64 values.
1599            let ok = unsafe {
1600                get_rect(
1601                    self.handle,
1602                    rect_index,
1603                    &mut left,
1604                    &mut top,
1605                    &mut right,
1606                    &mut bottom,
1607                )
1608            };
1609            if ok != 0 {
1610                rects.push(qrect_from_pdfium_char_box(
1611                    page_height_pts,
1612                    left,
1613                    right,
1614                    bottom,
1615                    top,
1616                )?);
1617            }
1618        }
1619        Ok(rects)
1620    }
1621
1622    fn font_size_q(&self, index: c_int) -> Option<i64> {
1623        // SAFETY: index is in range.
1624        let size = unsafe { (self.funcs.text_get_font_size)(self.handle, index) };
1625        if size <= 0.0 {
1626            return None;
1627        }
1628        quantize(size, QUANTUM_PER_POINT).ok()
1629    }
1630
1631    fn font_info(&self, index: c_int) -> PdfFontInfo {
1632        let Some(get_font_info) = self.funcs.text_get_font_info else {
1633            return PdfFontInfo::default();
1634        };
1635        // SAFETY: index is in range; null buffer asks PDFium for the UTF-8 byte length.
1636        let len =
1637            unsafe { (get_font_info)(self.handle, index, ptr::null_mut(), 0, ptr::null_mut()) };
1638        if len == 0 || len > 4096 {
1639            return PdfFontInfo::default();
1640        }
1641
1642        let Ok(len_usize) = usize::try_from(len) else {
1643            return PdfFontInfo::default();
1644        };
1645        let mut buffer = vec![0u8; len_usize];
1646        let mut flags = 0;
1647        // SAFETY: buffer is writable for len bytes; flags points to initialized storage.
1648        let written = unsafe {
1649            (get_font_info)(
1650                self.handle,
1651                index,
1652                buffer.as_mut_ptr().cast(),
1653                len,
1654                &mut flags,
1655            )
1656        };
1657        if written == 0 || written > len {
1658            return PdfFontInfo::default();
1659        }
1660        let nul = buffer.iter().position(|b| *b == 0).unwrap_or(buffer.len());
1661        let raw = std::str::from_utf8(&buffer[..nul]).ok();
1662        PdfFontInfo {
1663            font_id: raw.and_then(deterministic_font_id),
1664            font_flags: u32::try_from(flags).ok(),
1665        }
1666    }
1667
1668    fn is_generated_hyphen(&self, index: c_int) -> bool {
1669        let (Some(text_is_generated), Some(text_is_hyphen)) =
1670            (self.funcs.text_is_generated, self.funcs.text_is_hyphen)
1671        else {
1672            return false;
1673        };
1674        // SAFETY: index is in range for this text page.
1675        unsafe {
1676            text_is_generated(self.handle, index) == 1 && text_is_hyphen(self.handle, index) == 1
1677        }
1678    }
1679}
1680
1681impl Drop for PdfTextPage<'_> {
1682    fn drop(&mut self) {
1683        // SAFETY: handle is a live FPDF_TEXTPAGE and is closed exactly once here.
1684        unsafe { (self.funcs.text_close_page)(self.handle) };
1685    }
1686}
1687
/// Decides whether `ch` terminates the current text run.
///
/// NUL, any Unicode whitespace, and any control character break a run;
/// everything else (including `-`) continues it.
fn should_break_text_run(ch: char) -> bool {
    if ch == '\0' {
        return true;
    }
    ch.is_whitespace() || ch.is_control()
}
1691
/// Accumulator for the characters of one span before it is emitted.
///
/// Filled by `SpanRun::push` and drained into a `Span` by `SpanRun::flush`.
#[derive(Default)]
struct SpanRun {
    /// Characters pushed so far; an empty string means there is nothing to flush.
    text: String,
    /// Union of the bounding boxes of every pushed character.
    bbox: Option<QRect>,
    /// Origin supplied with the first pushed character, if it had one.
    first_origin: Option<[i64; 2]>,
    /// Origin supplied with the most recently pushed character.
    last_origin: Option<[i64; 2]>,
    /// First font id seen for this run (later pushes do not overwrite it).
    font_id: Option<String>,
    /// First quantized font size seen for this run.
    font_size_q: Option<i64>,
}
1701
/// Accumulator for one geometry-probe run before it is emitted.
///
/// Filled from `GeometryProbeChar` values by `GeometryRunBuilder::push` and
/// turned into a `GeometryProbeRun` by `GeometryRunBuilder::flush`.
#[derive(Default)]
struct GeometryRunBuilder {
    /// Concatenated text of the pushed characters that carried text.
    text: String,
    /// Index of every pushed character, in push order.
    char_indices: Vec<i32>,
    /// Union of the pushed characters' `char_box` values, when any was present.
    char_box_union: Option<QRect>,
    /// Union of the pushed characters' `loose_char_box` values, when any was present.
    loose_char_box_union: Option<QRect>,
    /// Origin of the first pushed character, if it had one.
    first_origin: Option<[i64; 2]>,
    /// Origin of the most recently pushed character.
    last_origin: Option<[i64; 2]>,
    /// First font id seen for this run.
    font_id: Option<String>,
    /// First quantized font size seen for this run.
    font_size_q: Option<i64>,
    /// First font flags value seen for this run.
    font_flags: Option<u32>,
}
1714
/// Font details gathered for a single character by `font_info`.
///
/// The default value (both fields `None`) is what `font_info` returns whenever
/// the lookup is unavailable or fails.
#[derive(Default)]
struct PdfFontInfo {
    /// Identifier derived from the raw font name via `deterministic_font_id`.
    font_id: Option<String>,
    /// Font flags as reported by the backend, when they fit in a `u32`.
    font_flags: Option<u32>,
}
1720
/// Deserialized form of the bundled font substitution table JSON.
///
/// Parsed once by `font_substitution_table` and checked by
/// `validate_font_substitution_table`.
#[derive(Debug, Deserialize)]
struct FontSubstitutionTable {
    /// Schema version string; validation requires `"1.0.0"`.
    schema_version: String,
    /// Table identifier; validation requires `"ethos-font-substitution-v1"`.
    table_id: String,
    /// Table content version; validation requires `"1.0.0"`.
    version: String,
    /// Font id returned for non-subset fonts that have no explicit mapping.
    default_unresolved_font_id: String,
    /// Explicit source-name to font-id mappings, searched by `font_substitution`.
    mappings: Vec<FontSubstitutionMapping>,
}
1729
/// One entry of the font substitution table.
#[derive(Debug, Deserialize)]
struct FontSubstitutionMapping {
    /// Font name the entry applies to; compared for exact equality against the
    /// normalized name and required by validation to be non-empty and unique.
    source: String,
    /// Replacement font id; validation requires the `subst:` prefix.
    font_id: String,
}
1735
1736impl SpanRun {
1737    fn has_style_change(&self, font_id: &Option<String>, font_size_q: Option<i64>) -> bool {
1738        !self.text.is_empty() && (self.font_id != *font_id || self.font_size_q != font_size_q)
1739    }
1740
1741    fn push(
1742        &mut self,
1743        ch: char,
1744        bbox: QRect,
1745        origin: Option<[i64; 2]>,
1746        font_id: Option<String>,
1747        font_size_q: Option<i64>,
1748    ) {
1749        self.text.push(ch);
1750        self.bbox = Some(match self.bbox {
1751            Some(existing) => union_rect(existing, bbox),
1752            None => bbox,
1753        });
1754        if self.first_origin.is_none() {
1755            self.first_origin = origin;
1756        }
1757        self.last_origin = origin;
1758        if self.font_id.is_none() {
1759            self.font_id = font_id;
1760        }
1761        if self.font_size_q.is_none() {
1762            self.font_size_q = font_size_q;
1763        }
1764    }
1765
1766    fn flush(
1767        &mut self,
1768        page: &Page,
1769        next_span: &mut u32,
1770        spans: &mut Vec<Span>,
1771    ) -> Result<(), EthosError> {
1772        if self.text.is_empty() {
1773            return Ok(());
1774        }
1775        let bbox = self
1776            .bbox
1777            .ok_or_else(|| EthosError::internal("span run has text without bbox"))?;
1778        let origin_locator = match (self.first_origin.take(), self.last_origin.take()) {
1779            (Some(first_origin), Some(last_origin)) => Some(SpanOriginLocator {
1780                policy: ORIGIN_LOCATOR_POLICY.to_string(),
1781                first_origin,
1782                last_origin,
1783            }),
1784            _ => None,
1785        };
1786        spans.push(Span {
1787            id: span_id(*next_span)?,
1788            page: page.id.clone(),
1789            bbox,
1790            origin_locator,
1791            text: std::mem::take(&mut self.text),
1792            font_id: self.font_id.take(),
1793            font_size_q: self.font_size_q,
1794            char_start: None,
1795            char_end: None,
1796            warning_refs: Vec::new(),
1797        });
1798        *next_span += 1;
1799        self.bbox = None;
1800        self.first_origin = None;
1801        self.last_origin = None;
1802        self.font_id = None;
1803        self.font_size_q = None;
1804        Ok(())
1805    }
1806}
1807
1808impl GeometryRunBuilder {
1809    fn has_style_change(
1810        &self,
1811        font_id: &Option<String>,
1812        font_size_q: Option<i64>,
1813        font_flags: Option<u32>,
1814    ) -> bool {
1815        !self.text.is_empty()
1816            && (self.font_id != *font_id
1817                || self.font_size_q != font_size_q
1818                || self.font_flags != font_flags)
1819    }
1820
1821    fn push(&mut self, ch: &GeometryProbeChar) {
1822        if let Some(text) = &ch.text {
1823            self.text.push_str(text);
1824        }
1825        self.char_indices.push(ch.index);
1826        self.char_box_union = union_option_rect(self.char_box_union, ch.char_box);
1827        self.loose_char_box_union = union_option_rect(self.loose_char_box_union, ch.loose_char_box);
1828        if self.first_origin.is_none() {
1829            self.first_origin = ch.char_origin;
1830        }
1831        self.last_origin = ch.char_origin;
1832        if self.font_id.is_none() {
1833            self.font_id = ch.font_id.clone();
1834        }
1835        if self.font_size_q.is_none() {
1836            self.font_size_q = ch.font_size_q;
1837        }
1838        if self.font_flags.is_none() {
1839            self.font_flags = ch.font_flags;
1840        }
1841    }
1842
1843    fn flush(
1844        &mut self,
1845        text_page: &PdfTextPage<'_>,
1846        page_height_pts: f64,
1847        next_run: &mut u32,
1848        runs: &mut Vec<GeometryProbeRun>,
1849    ) -> Result<(), EthosError> {
1850        if self.text.is_empty() {
1851            return Ok(());
1852        }
1853        let char_start = self.char_indices.first().copied().unwrap_or_default();
1854        let char_end = self
1855            .char_indices
1856            .last()
1857            .copied()
1858            .map(|index| index + 1)
1859            .unwrap_or(char_start);
1860        let text_rects =
1861            text_page.text_rects(char_start, char_end - char_start, page_height_pts)?;
1862        runs.push(GeometryProbeRun {
1863            index: *next_run,
1864            text: std::mem::take(&mut self.text),
1865            char_start,
1866            char_end,
1867            char_indices: std::mem::take(&mut self.char_indices),
1868            char_box_union: self.char_box_union.take(),
1869            loose_char_box_union: self.loose_char_box_union.take(),
1870            text_rect_union: union_rects(text_rects.iter().copied()),
1871            text_rects,
1872            first_origin: self.first_origin.take(),
1873            last_origin: self.last_origin.take(),
1874            font_id: self.font_id.take(),
1875            font_flags: self.font_flags.take(),
1876            font_size_q: self.font_size_q.take(),
1877        });
1878        *next_run += 1;
1879        self.font_size_q = None;
1880        self.font_flags = None;
1881        Ok(())
1882    }
1883}
1884
1885fn union_option_rect(existing: Option<QRect>, next: Option<QRect>) -> Option<QRect> {
1886    match (existing, next) {
1887        (Some(a), Some(b)) => Some(union_rect(a, b)),
1888        (Some(a), None) => Some(a),
1889        (None, Some(b)) => Some(b),
1890        (None, None) => None,
1891    }
1892}
1893
1894fn union_rects(mut rects: impl Iterator<Item = QRect>) -> Option<QRect> {
1895    let first = rects.next()?;
1896    Some(rects.fold(first, union_rect))
1897}
1898
1899fn deterministic_font_id(raw_name: &str) -> Option<String> {
1900    let raw_name = raw_name.trim();
1901    if raw_name.is_empty() {
1902        return None;
1903    }
1904    let (name, subset) = strip_subset_prefix(raw_name);
1905    if subset {
1906        if let Some(normalized) = normalize_font_name(name) {
1907            if is_safe_font_id_suffix(&normalized) {
1908                return Some(format!("embedded:{normalized}"));
1909            }
1910        }
1911        return Some(hashed_embedded_font_id(name));
1912    }
1913    let normalized = normalize_font_name(name)?;
1914    font_substitution(&normalized)
1915        .or_else(|| Some(font_substitution_table().default_unresolved_font_id.clone()))
1916}
1917
/// Splits off a font subset tag of the form `ABCDEF+`.
///
/// The tag is recognized only when the name starts with six ASCII uppercase
/// letters followed by `+` and at least one more byte. On a match the name
/// after the `+` is returned with `true`; otherwise the input is returned
/// unchanged with `false`.
fn strip_subset_prefix(name: &str) -> (&str, bool) {
    let has_subset_tag = match name.as_bytes() {
        // Six tag bytes, the separator, and at least one byte of actual name.
        [t0, t1, t2, t3, t4, t5, b'+', _, ..] => [t0, t1, t2, t3, t4, t5]
            .iter()
            .all(|byte| byte.is_ascii_uppercase()),
        _ => false,
    };
    if has_subset_tag {
        // The first seven bytes are ASCII, so index 7 is a char boundary.
        (&name[7..], true)
    } else {
        (name, false)
    }
}
1926
/// Normalizes a font name into a dash-separated form.
///
/// After trimming, whitespace, control characters and the separators
/// `/ \ : , ( ) [ ]` are replaced by `-`; every other character is kept as
/// is. Runs of dashes collapse to one, and leading/trailing dashes are
/// removed. Returns `None` when nothing remains.
fn normalize_font_name(name: &str) -> Option<String> {
    let mut collapsed = String::new();
    let mut last_was_dash = false;
    for ch in name.trim().chars() {
        let is_separator = ch.is_whitespace()
            || ch.is_control()
            || matches!(ch, '/' | '\\' | ':' | ',' | '(' | ')' | '[' | ']');
        let mapped = if is_separator { '-' } else { ch };
        let is_dash = mapped == '-';
        // Emit everything except a dash that directly follows another dash.
        if !(is_dash && last_was_dash) {
            collapsed.push(mapped);
        }
        last_was_dash = is_dash;
    }
    let trimmed = collapsed.trim_matches('-');
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}
1954
/// Checks that `name` is non-empty and made only of ASCII letters, ASCII
/// digits, `-`, `_` and `.`.
fn is_safe_font_id_suffix(name: &str) -> bool {
    if name.is_empty() {
        return false;
    }
    name.bytes().all(|byte| {
        matches!(
            byte,
            b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'-' | b'_' | b'.'
        )
    })
}
1961
1962fn hashed_embedded_font_id(name: &str) -> String {
1963    format!(
1964        "embedded:sha256-{}",
1965        ethos_core::c14n::sha256_hex_bytes(name.as_bytes())
1966    )
1967}
1968
1969fn font_substitution(name: &str) -> Option<String> {
1970    font_substitution_table()
1971        .mappings
1972        .iter()
1973        .find(|mapping| mapping.source == name)
1974        .map(|mapping| mapping.font_id.clone())
1975}
1976
1977fn font_substitution_table() -> &'static FontSubstitutionTable {
1978    FONT_SUBSTITUTION_TABLE.get_or_init(|| {
1979        let table: FontSubstitutionTable = serde_json::from_str(FONT_SUBSTITUTION_TABLE_JSON)
1980            .expect("bundled font-substitution-table.json is valid JSON");
1981        validate_font_substitution_table(&table)
1982            .expect("bundled font-substitution-table.json is internally valid");
1983        table
1984    })
1985}
1986
1987fn validate_font_substitution_table(table: &FontSubstitutionTable) -> Result<(), &'static str> {
1988    if table.schema_version != "1.0.0"
1989        || table.table_id != "ethos-font-substitution-v1"
1990        || table.version != "1.0.0"
1991        || table.default_unresolved_font_id != "subst:liberation-sans-regular"
1992    {
1993        return Err("unexpected font substitution table metadata");
1994    }
1995
1996    let mut seen = HashSet::new();
1997    for mapping in &table.mappings {
1998        if mapping.source.is_empty() || !mapping.font_id.starts_with("subst:") {
1999            return Err("malformed font substitution mapping");
2000        }
2001        if !seen.insert(mapping.source.as_str()) {
2002            return Err("duplicate font substitution mapping source");
2003        }
2004    }
2005
2006    Ok(())
2007}
2008
2009#[cfg(unix)]
2010mod dylib {
2011    use super::*;
2012    use std::os::unix::ffi::OsStrExt;
2013
2014    const RTLD_NOW: c_int = 2;
2015
2016    unsafe extern "C" {
2017        fn dlopen(filename: *const c_char, flag: c_int) -> *mut c_void;
2018        fn dlsym(handle: *mut c_void, symbol: *const c_char) -> *mut c_void;
2019        fn dlclose(handle: *mut c_void) -> c_int;
2020    }
2021
2022    pub(super) struct Library {
2023        handle: *mut c_void,
2024    }
2025
2026    impl Library {
2027        pub(super) fn open(path: &Path) -> Result<Self, EthosError> {
2028            let c_path = CString::new(path.as_os_str().as_bytes()).map_err(|_| {
2029                EthosError::internal("pdfium library path contains an interior NUL byte")
2030            })?;
2031            // SAFETY: c_path is NUL-terminated and lives for the call.
2032            let handle = unsafe { dlopen(c_path.as_ptr(), RTLD_NOW) };
2033            if handle.is_null() {
2034                Err(EthosError::internal(
2035                    "failed to load configured pdfium library",
2036                ))
2037            } else {
2038                Ok(Library { handle })
2039            }
2040        }
2041
2042        pub(super) unsafe fn symbol<T: Copy>(&self, name: &'static [u8]) -> Result<T, EthosError> {
2043            let ptr = self.symbol_ptr(name);
2044            if ptr.is_null() {
2045                return Err(EthosError::internal(format!(
2046                    "pdfium library is missing symbol {}",
2047                    symbol_name(name)
2048                )));
2049            }
2050            assert_symbol_pointer_size::<T>();
2051            // SAFETY: caller chooses T to match the named PDFium C symbol.
2052            Ok(unsafe { std::mem::transmute_copy::<*mut c_void, T>(&ptr) })
2053        }
2054
2055        pub(super) unsafe fn optional_symbol<T: Copy>(&self, name: &'static [u8]) -> Option<T> {
2056            let ptr = self.symbol_ptr(name);
2057            if ptr.is_null() {
2058                None
2059            } else {
2060                assert_symbol_pointer_size::<T>();
2061                // SAFETY: caller chooses T to match the named PDFium C symbol.
2062                Some(unsafe { std::mem::transmute_copy::<*mut c_void, T>(&ptr) })
2063            }
2064        }
2065
2066        fn symbol_ptr(&self, name: &'static [u8]) -> *mut c_void {
2067            // SAFETY: handle is live; name is a static NUL-terminated symbol name.
2068            unsafe { dlsym(self.handle, name.as_ptr().cast()) }
2069        }
2070    }
2071
2072    impl Drop for Library {
2073        fn drop(&mut self) {
2074            if !self.handle.is_null() {
2075                // SAFETY: handle was returned by dlopen and is closed exactly once.
2076                unsafe {
2077                    let _ = dlclose(self.handle);
2078                }
2079            }
2080        }
2081    }
2082}
2083
/// Minimal `LoadLibraryW`-based dynamic library loader used on Windows targets.
#[cfg(windows)]
mod dylib {
    use super::*;
    use std::os::windows::ffi::OsStrExt;

    unsafe extern "system" {
        fn LoadLibraryW(lp_lib_file_name: *const u16) -> *mut c_void;
        fn GetProcAddress(h_module: *mut c_void, lp_proc_name: *const c_char) -> *mut c_void;
        fn FreeLibrary(h_lib_module: *mut c_void) -> c_int;
    }

    /// Owning handle to a module opened with `LoadLibraryW`; freed on drop.
    pub(super) struct Library {
        handle: *mut c_void,
    }

    impl Library {
        /// Opens the library at `path`.
        ///
        /// # Errors
        ///
        /// Returns an internal error when the path contains an interior NUL
        /// code unit or when `LoadLibraryW` fails. The messages never include
        /// the path.
        pub(super) fn open(path: &Path) -> Result<Self, EthosError> {
            let mut wide_path: Vec<u16> = path.as_os_str().encode_wide().collect();
            if wide_path.contains(&0) {
                return Err(EthosError::internal(
                    "pdfium library path contains an interior NUL code unit",
                ));
            }
            wide_path.push(0);
            // SAFETY: wide_path is NUL-terminated and lives for the call.
            let handle = unsafe { LoadLibraryW(wide_path.as_ptr()) };
            if handle.is_null() {
                Err(EthosError::internal(
                    "failed to load configured pdfium library",
                ))
            } else {
                Ok(Library { handle })
            }
        }

        /// Resolves a required symbol and reinterprets its address as `T`.
        ///
        /// # Errors
        ///
        /// Returns an internal error naming the symbol when it cannot be
        /// resolved (including when `name` is not a valid NUL-terminated
        /// symbol name).
        ///
        /// # Safety
        ///
        /// `T` must be a pointer-sized type whose ABI matches the named
        /// symbol exactly; the size is checked at compile time, the signature
        /// is not.
        pub(super) unsafe fn symbol<T: Copy>(&self, name: &'static [u8]) -> Result<T, EthosError> {
            // SAFETY: the caller's contract on T is forwarded unchanged.
            let resolved = unsafe { self.optional_symbol::<T>(name) };
            resolved.ok_or_else(|| {
                EthosError::internal(format!(
                    "pdfium library is missing symbol {}",
                    symbol_name(name)
                ))
            })
        }

        /// Resolves an optional symbol, returning `None` when it is absent.
        ///
        /// # Safety
        ///
        /// Same contract as [`Library::symbol`]: `T` must match the ABI of the
        /// named symbol.
        pub(super) unsafe fn optional_symbol<T: Copy>(&self, name: &'static [u8]) -> Option<T> {
            let ptr = self.symbol_ptr(name);
            if ptr.is_null() {
                None
            } else {
                assert_symbol_pointer_size::<T>();
                // SAFETY: caller chooses T to match the named PDFium C symbol.
                Some(unsafe { std::mem::transmute_copy::<*mut c_void, T>(&ptr) })
            }
        }

        /// Looks up the raw address of `name`, or null when it is unavailable.
        ///
        /// `name` must end in exactly one NUL byte and contain no other NUL.
        /// A malformed name is reported as "not found" rather than handed to
        /// `GetProcAddress`, which would otherwise read past the end of the
        /// slice.
        fn symbol_ptr(&self, name: &'static [u8]) -> *mut c_void {
            let Ok(c_name) = std::ffi::CStr::from_bytes_with_nul(name) else {
                return std::ptr::null_mut();
            };
            // SAFETY: handle is live; c_name is a validated NUL-terminated string.
            unsafe { GetProcAddress(self.handle, c_name.as_ptr()) }
        }
    }

    impl Drop for Library {
        fn drop(&mut self) {
            if !self.handle.is_null() {
                // SAFETY: handle was returned by LoadLibraryW and is closed exactly once.
                unsafe {
                    let _ = FreeLibrary(self.handle);
                }
            }
        }
    }
}
2160
/// Compile-time guard that `T` occupies exactly as many bytes as a
/// `*mut c_void`, so a resolved symbol address can be reinterpreted as `T`.
fn assert_symbol_pointer_size<T>() {
    const {
        let symbol_size = std::mem::size_of::<T>();
        let pointer_size = std::mem::size_of::<*mut c_void>();
        if symbol_size != pointer_size {
            panic!("pdfium symbol pointer size mismatch");
        }
    }
}
2169
/// Renders a symbol name for error messages: one trailing NUL byte (if
/// present) is dropped and the rest is decoded as lossy UTF-8.
fn symbol_name(name: &'static [u8]) -> String {
    let printable = match name {
        [head @ .., 0] => head,
        _ => name,
    };
    String::from_utf8_lossy(printable).into_owned()
}
2174
2175#[cfg(test)]
2176mod tests {
2177    use super::*;
2178
2179    #[test]
2180    fn invalid_pdf_fails_before_library_load() {
2181        let err = PdfiumBackend::default()
2182            .page_count(b"not a pdf")
2183            .unwrap_err();
2184        assert_eq!(err.code, ErrorCode::InvalidPdf);
2185    }
2186
2187    #[test]
2188    fn text_run_breaks_on_pdfium_control_characters() {
2189        assert!(should_break_text_run('\0'));
2190        assert!(should_break_text_run('\n'));
2191        assert!(should_break_text_run('\u{0002}'));
2192        assert!(!should_break_text_run('-'));
2193        assert!(!should_break_text_run('A'));
2194    }
2195
2196    #[test]
2197    fn missing_library_path_is_stable_error_for_pdf_input() {
2198        let backend = PdfiumBackend::default();
2199        if env::var_os(PDFIUM_LIBRARY_PATH_ENV).is_some() {
2200            return;
2201        }
2202        let err = backend.page_count(b"%PDF-1.7\n").unwrap_err();
2203        assert_eq!(err.code, ErrorCode::InternalError);
2204        assert!(err.message.contains(PDFIUM_LIBRARY_PATH_ENV));
2205    }
2206
2207    #[test]
2208    fn render_crop_raw_rejects_zero_page_before_library_load() {
2209        let err = PdfiumBackend::default()
2210            .render_crop_raw(b"%PDF-1.7\n", 0, QRect::new(0, 0, 100, 100).unwrap())
2211            .unwrap_err();
2212        assert_eq!(err.code, ErrorCode::PageLimitExceeded);
2213        assert_eq!(err.message, "page selection out of document range");
2214    }
2215
2216    #[test]
2217    fn crop_window_uses_outward_quantized_pixel_bounds() {
2218        assert_eq!(
2219            crop_window(QRect::new(7392, 5482, 19378, 7226).unwrap(), 300, 144).unwrap(),
2220            (73, 54, 121, 19)
2221        );
2222        assert_eq!(
2223            crop_window(QRect::new(-50, -50, 30100, 14500).unwrap(), 300, 144).unwrap(),
2224            (0, 0, 300, 144)
2225        );
2226
2227        let err = crop_window(QRect::new(100, 100, 101, 101).unwrap(), 1, 1).unwrap_err();
2228        assert_eq!(err.code, ErrorCode::InternalError);
2229        assert_eq!(err.message, "crop bbox has no positive pixel extent");
2230    }
2231
2232    #[test]
2233    fn render_crop_raw_is_deterministic_when_pdfium_is_configured() {
2234        let Some(path) = env::var_os(PDFIUM_LIBRARY_PATH_ENV).map(PathBuf::from) else {
2235            return;
2236        };
2237        if !path.is_file() {
2238            return;
2239        }
2240
2241        let fixture = Path::new(env!("CARGO_MANIFEST_DIR"))
2242            .join("../../fixtures/synthetic/simple-text/document.pdf");
2243        let pdf_bytes = std::fs::read(fixture).unwrap();
2244        let bbox = QRect::new(7392, 5482, 19378, 7226).unwrap();
2245        let backend = PdfiumBackend::default();
2246
2247        let first = backend.render_crop_raw(&pdf_bytes, 1, bbox).unwrap();
2248        let second = backend.render_crop_raw(&pdf_bytes, 1, bbox).unwrap();
2249
2250        assert_eq!(first, second);
2251        assert_eq!(first.page_index, 1);
2252        assert_eq!(first.bbox, bbox);
2253        assert_eq!(first.width_px, 121);
2254        assert_eq!(first.height_px, 19);
2255        assert_eq!(first.stride, first.width_px * 4);
2256        assert_eq!(first.pixel_format, "bgra_8u");
2257        assert_eq!(
2258            first.bytes.len(),
2259            usize::try_from(first.stride * first.height_px).unwrap()
2260        );
2261        assert_eq!(
2262            first.sha256,
2263            ethos_core::c14n::sha256_hex_bytes(&first.bytes)
2264        );
2265        assert!(first
2266            .bytes
2267            .chunks_exact(4)
2268            .any(|pixel| pixel != [255, 255, 255, 255]));
2269    }
2270
2271    #[test]
2272    fn invalid_configured_library_path_does_not_leak_host_path() {
2273        let path = env::temp_dir().join("ethos-missing-libpdfium\nwith-control.dylib");
2274        let backend = PdfiumBackend::from_library_path(&path);
2275        let err = backend.page_count(b"%PDF-1.7\n").unwrap_err();
2276        assert_eq!(err.code, ErrorCode::InternalError);
2277        assert_eq!(err.message, "pdfium library path does not point to a file");
2278        assert!(!err.message.contains(path.to_string_lossy().as_ref()));
2279    }
2280
2281    #[test]
2282    fn explicit_manifest_hashes_library_bytes() {
2283        let path = env::temp_dir().join("ethos-test-libpdfium-hash.bin");
2284        std::fs::write(&path, b"pdfium bytes").unwrap();
2285        let backend = PdfiumBackend::from_library_path(&path).with_version("test-version");
2286        let manifest = backend.manifest();
2287        assert_eq!(manifest.id, "pdfium");
2288        assert_eq!(manifest.phase, 1);
2289        assert_eq!(manifest.version, "test-version");
2290        assert_eq!(
2291            manifest.platform_sha256,
2292            ethos_core::c14n::sha256_hex_bytes(b"pdfium bytes")
2293        );
2294        let _ = std::fs::remove_file(path);
2295    }
2296
2297    #[test]
2298    fn phase1_pdfium_profile_is_pinned_and_v8_xfa_disabled() {
2299        let profile = pinned_pdfium_profile();
2300        assert_eq!(profile.id, "pdfium");
2301        assert_eq!(profile.phase, 1);
2302        assert_eq!(profile.version, "chromium/7881");
2303        assert_eq!(profile.upstream_version, "PDFium 151.0.7881.0");
2304        assert_eq!(profile.v8, "disabled");
2305        assert_eq!(profile.xfa, "disabled");
2306        assert_eq!(profile.distribution.source, "bblanchon/pdfium-binaries");
2307        assert_eq!(
2308            profile.distribution.attestation.sha256,
2309            "24dec7cd76acb81106a0c29b908cceceef8215b050f6ff6ffbf875465811ef60"
2310        );
2311        assert!(!profile.build_flags.pdf_enable_v8);
2312        assert!(!profile.build_flags.pdf_enable_xfa);
2313        assert!(profile.build_flags.pdf_is_standalone);
2314
2315        let expected = [
2316            (
2317                "macos-arm64",
2318                "pdfium-mac-arm64.tgz",
2319                "52e94ca5aa8847934330daf3f8150c190682c5ca93831468794f8b90d4392e40",
2320                "lib/libpdfium.dylib",
2321                "1bc45b15466b34cef96641ce25c77a876e70010c6b114f909dda2f5325fc5bd7",
2322            ),
2323            (
2324                "linux-x64",
2325                "pdfium-linux-x64.tgz",
2326                "1470e21b8b4a3b4ad7f85684e2da11d94f3b69a86d81dee11b9b6709d927ac1d",
2327                "lib/libpdfium.so",
2328                "f728930966f503652b92acc89b9374a2eeca00ce42e26dccd3e4b5c5161b2d64",
2329            ),
2330            (
2331                "windows-x64",
2332                "pdfium-win-x64.tgz",
2333                "73cc0de638ac2095e7445bf56a38200a5b7c7ca0e9f4ba144598f2457377ac08",
2334                "bin/pdfium.dll",
2335                "79d4676b656cfb1abcea88f9ade3b4b0826c5200382db5f4ec72a636c598c118",
2336            ),
2337        ];
2338        for (platform, name, archive_sha256, runtime_path, runtime_sha256) in expected {
2339            assert_eq!(profile.platform_hashes[platform], archive_sha256);
2340            let artifact = &profile.platform_artifacts[platform];
2341            assert_eq!(artifact.name, name);
2342            assert!(!artifact.name.contains("-v8-"));
2343            assert!(!artifact.name.contains("xfa"));
2344            assert_eq!(artifact.runtime_library_path, runtime_path);
2345            assert_eq!(artifact.runtime_library_sha256, runtime_sha256);
2346        }
2347    }
2348
2349    #[test]
2350    fn mismatched_pdfium_version_is_rejected_before_library_load() {
2351        if current_platform_key().is_none() {
2352            return;
2353        }
2354        let path = env::temp_dir().join("ethos-test-libpdfium-version-mismatch.bin");
2355        std::fs::write(&path, b"not the pinned pdfium library").unwrap();
2356        let backend = PdfiumBackend::from_library_path(&path).with_version("chromium/7869");
2357        let err = backend.page_count(b"%PDF-1.7\n").unwrap_err();
2358        assert_eq!(err.code, ErrorCode::InternalError);
2359        assert_eq!(
2360            err.message,
2361            "pdfium version does not match pinned phase 1 profile"
2362        );
2363        let _ = std::fs::remove_file(path);
2364    }
2365
2366    #[test]
2367    fn pinned_upstream_pdfium_version_alias_is_accepted() {
2368        if current_platform_key().is_none() {
2369            return;
2370        }
2371        let path = env::temp_dir().join("ethos-test-libpdfium-upstream-version.bin");
2372        std::fs::write(&path, b"not the pinned pdfium library").unwrap();
2373        let backend = PdfiumBackend::from_library_path(&path).with_version("PDFium 151.0.7881.0");
2374        let err = backend.page_count(b"%PDF-1.7\n").unwrap_err();
2375        assert_eq!(err.code, ErrorCode::InternalError);
2376        assert_eq!(
2377            err.message,
2378            "pdfium library does not match pinned phase 1 profile"
2379        );
2380        let _ = std::fs::remove_file(path);
2381    }
2382
2383    #[test]
2384    fn mismatched_pdfium_artifact_is_rejected_with_stable_error() {
2385        if current_platform_key().is_none() {
2386            return;
2387        }
2388        let library_path = env::temp_dir().join("ethos-test-libpdfium-artifact-mismatch.bin");
2389        let artifact_path = env::temp_dir().join("ethos-test-pdfium-artifact-mismatch.tgz");
2390        std::fs::write(&library_path, b"not the pinned pdfium library").unwrap();
2391        std::fs::write(&artifact_path, b"not the pinned pdfium artifact").unwrap();
2392        let backend = PdfiumBackend::from_library_path(&library_path)
2393            .with_version("chromium/7881")
2394            .with_artifact_path(&artifact_path);
2395        let err = backend.page_count(b"%PDF-1.7\n").unwrap_err();
2396        assert_eq!(err.code, ErrorCode::InternalError);
2397        assert_eq!(
2398            err.message,
2399            "pdfium artifact does not match pinned phase 1 profile"
2400        );
2401        let _ = std::fs::remove_file(library_path);
2402        let _ = std::fs::remove_file(artifact_path);
2403    }
2404
2405    #[test]
2406    fn mismatched_pdfium_library_is_rejected_before_dynamic_load() {
2407        if current_platform_key().is_none() {
2408            return;
2409        }
2410        let path = env::temp_dir().join("ethos-test-libpdfium-library-mismatch.bin");
2411        std::fs::write(&path, b"not the pinned pdfium library").unwrap();
2412        let backend = PdfiumBackend::from_library_path(&path).with_version("chromium/7881");
2413        let err = backend.page_count(b"%PDF-1.7\n").unwrap_err();
2414        assert_eq!(err.code, ErrorCode::InternalError);
2415        assert_eq!(
2416            err.message,
2417            "pdfium library does not match pinned phase 1 profile"
2418        );
2419        let _ = std::fs::remove_file(path);
2420    }
2421
2422    #[test]
2423    fn deterministic_font_ids_strip_subset_prefixes() {
2424        assert_eq!(
2425            deterministic_font_id("ABCDEF+MinionPro-Regular").as_deref(),
2426            Some("embedded:MinionPro-Regular")
2427        );
2428        assert_eq!(
2429            deterministic_font_id("Helvetica-Bold").as_deref(),
2430            Some("subst:liberation-sans-bold")
2431        );
2432        assert_eq!(
2433            deterministic_font_id("Helvetica").as_deref(),
2434            Some("subst:liberation-sans-regular")
2435        );
2436        assert_eq!(
2437            deterministic_font_id("Helvetica-Oblique").as_deref(),
2438            Some("subst:liberation-sans-italic")
2439        );
2440        assert_eq!(
2441            deterministic_font_id("Helvetica-BoldOblique").as_deref(),
2442            Some("subst:liberation-sans-bold-italic")
2443        );
2444        assert_eq!(
2445            deterministic_font_id("Courier").as_deref(),
2446            Some("subst:liberation-mono-regular")
2447        );
2448        assert_eq!(
2449            deterministic_font_id("Times-Roman").as_deref(),
2450            Some("subst:liberation-serif-regular")
2451        );
2452        assert_eq!(
2453            deterministic_font_id("Custom Font/Regular").as_deref(),
2454            Some("subst:liberation-sans-regular")
2455        );
2456        assert_eq!(deterministic_font_id("   "), None);
2457    }
2458
2459    #[test]
2460    fn deterministic_font_ids_keep_embedded_ids_ascii_only() {
2461        let unsafe_unicode = deterministic_font_id("ABCDEF+明朝").unwrap();
2462        assert_eq!(unsafe_unicode, hashed_embedded_font_id("明朝"));
2463        assert!(unsafe_unicode.is_ascii());
2464
2465        let unsafe_punctuation = deterministic_font_id("ABCDEF+Fixture+Font").unwrap();
2466        assert_eq!(unsafe_punctuation, hashed_embedded_font_id("Fixture+Font"));
2467        assert!(unsafe_punctuation.is_ascii());
2468
2469        let separator_only = deterministic_font_id("ABCDEF+///").unwrap();
2470        assert_eq!(separator_only, hashed_embedded_font_id("///"));
2471        assert!(separator_only.is_ascii());
2472
2473        assert_eq!(
2474            deterministic_font_id("明朝").as_deref(),
2475            Some("subst:liberation-sans-regular")
2476        );
2477    }
2478
2479    #[test]
2480    fn font_substitution_table_is_well_formed() {
2481        use std::collections::HashSet;
2482
2483        let table = font_substitution_table();
2484        assert_eq!(table.schema_version, "1.0.0");
2485        assert_eq!(table.table_id, "ethos-font-substitution-v1");
2486        assert_eq!(table.version, "1.0.0");
2487        assert_eq!(
2488            table.default_unresolved_font_id,
2489            "subst:liberation-sans-regular"
2490        );
2491
2492        let mut seen = HashSet::new();
2493        for mapping in &table.mappings {
2494            assert!(!mapping.source.is_empty());
2495            assert!(mapping.font_id.starts_with("subst:"));
2496            assert!(
2497                seen.insert(mapping.source.as_str()),
2498                "duplicate font substitution source {}",
2499                mapping.source
2500            );
2501        }
2502        assert_eq!(table.mappings.len(), 14);
2503    }
2504
2505    #[test]
2506    fn profile_pins_font_substitution_table_bytes() {
2507        const FONT_SUBSTITUTION_TABLE_PATH: &str =
2508            "crates/ethos-pdf/assets/font-substitution-table.json";
2509        let profile: serde_json::Value = serde_json::from_str(include_str!(concat!(
2510            env!("CARGO_MANIFEST_DIR"),
2511            "/../../profiles/ethos-deterministic-v1.json"
2512        )))
2513        .unwrap();
2514        let pin = &profile["font_policy"]["substitution_table"];
2515        assert_eq!(pin["path"], FONT_SUBSTITUTION_TABLE_PATH);
2516        assert_eq!(
2517            pin["sha256"],
2518            ethos_core::c14n::sha256_hex_bytes(FONT_SUBSTITUTION_TABLE_JSON.as_bytes())
2519        );
2520    }
2521}