Skip to main content

ethos_pdf/
lib.rs

1/*
2 * Copyright 2026 The Ethos maintainers
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17//! # ethos-pdf — WS-ENGINE lane (Milestone A)
18//!
19//! The only crate that loads PDFium (invariant 3). Everything crossing
20//! [`EthosPdfBackend`] is already normalized + quantized (invariant 1: quantize-at-
21//! extraction lives here); public schemas/APIs never see PDFium types.
22//!
23//! This first WS-ENGINE slice uses a small dynamic FFI boundary over the PDFium C API.
24//! Runtime loading is explicit through `ETHOS_PDFIUM_LIBRARY_PATH`, so parser output
25//! cannot accidentally depend on an unknown library from a host search path.
26
27#![deny(unsafe_op_in_unsafe_fn)]
28#![warn(missing_docs)]
29
30use std::collections::{BTreeMap, HashSet};
31use std::env;
32use std::ffi::{c_char, c_int, c_ulong, c_void, CString};
33use std::path::{Path, PathBuf};
34use std::ptr;
35use std::slice;
36use std::sync::{Mutex, OnceLock};
37
38use ethos_core::codes::WarningCode;
39use ethos_core::config::{PageSelection, ParseConfig};
40use ethos_core::error::{ErrorCode, EthosError};
41use ethos_core::geom::{quantize, QRect};
42use ethos_core::ids::{page_id, span_id, warning_id};
43use ethos_core::model::{Page, Span, SpanOriginLocator, Warning};
44use ethos_core::traits::{BackendManifest, EthosPdfBackend, Extraction};
45use serde::{Deserialize, Serialize};
46
/// Environment variable containing the exact PDFium dynamic library path.
///
/// Consulted only when the backend was not constructed with an explicit library path.
pub const PDFIUM_LIBRARY_PATH_ENV: &str = "ETHOS_PDFIUM_LIBRARY_PATH";

/// Optional environment variable carrying the pinned PDFium release/version string.
///
/// Consulted only when no version was set on the backend itself.
pub const PDFIUM_VERSION_ENV: &str = "ETHOS_PDFIUM_VERSION";

/// Optional environment variable containing the downloaded Phase 1 release artifact path.
///
/// Consulted only when no artifact path was set on the backend itself.
pub const PDFIUM_ARTIFACT_PATH_ENV: &str = "ETHOS_PDFIUM_ARTIFACT_PATH";

// Operator-facing setup hint. NOTE(review): presumably appended to PDFium load-failure
// messages; its use site is outside this part of the file.
const PDFIUM_SETUP_GUIDANCE: &str =
    "Run ethos doctor for setup diagnostics, run ethos doctor --require-pdfium after setting it, and see docs/pdfium-manual-setup.md.";

/// Profile quantization: 100 quanta per PDF point.
pub const QUANTUM_PER_POINT: u32 = 100;
// Policy identifier string. NOTE(review): presumably recorded with span origin locators;
// its consumer is outside this part of the file.
const ORIGIN_LOCATOR_POLICY: &str = "origin-run-locator-v1";

// Deterministic profile JSON embedded at compile time; parsed once by `pinned_pdfium_profile`.
const DETERMINISTIC_PROFILE_JSON: &str = include_str!("../assets/ethos-deterministic-v1.json");
// Font substitution table JSON embedded at compile time (its parser is outside this view).
const FONT_SUBSTITUTION_TABLE_JSON: &str = include_str!("../assets/font-substitution-table.json");

/// PDFium has process-global library state; serialize init/load/destroy for now.
static PDFIUM_LOCK: Mutex<()> = Mutex::new(());
// Lazily initialized cache of the validated `backend` section of the embedded profile.
static PINNED_PDFIUM_PROFILE: OnceLock<PinnedPdfiumBackend> = OnceLock::new();
// Lazily initialized cache for the font substitution table (initializer is outside this view).
static FONT_SUBSTITUTION_TABLE: OnceLock<FontSubstitutionTable> = OnceLock::new();
70
/// PDFium backend implementation.
///
/// Every field is optional. A field left as `None` falls back to its environment
/// variable: `ETHOS_PDFIUM_LIBRARY_PATH`, `ETHOS_PDFIUM_ARTIFACT_PATH`, and
/// `ETHOS_PDFIUM_VERSION` respectively. The derived `Default` leaves all three unset.
#[derive(Debug, Clone, Default)]
pub struct PdfiumBackend {
    /// Explicit path to the PDFium dynamic library, if one was supplied.
    library_path: Option<PathBuf>,
    /// Explicit path to the downloaded release artifact, used for archive-hash verification.
    artifact_path: Option<PathBuf>,
    /// Explicit version string; when unset and the environment variable is absent, the
    /// version from the pinned profile is reported instead.
    version: Option<String>,
}
78
/// Debug-only report of PDFium text geometry signals.
///
/// This is not part of the canonical document contract. It exists so Gate Zero
/// investigations can compare native PDFium geometry sources across platforms
/// before changing parser output or fingerprint policy.
///
/// Produced by [`PdfiumBackend::geometry_probe`]. Only `Serialize` is derived, so the
/// report can be written out but not read back through serde.
#[derive(Debug, Serialize)]
pub struct GeometryProbeReport {
    /// Report schema identifier (`"ethos-pdfium-geometry-probe-v1"` as emitted by
    /// [`PdfiumBackend::geometry_probe`]).
    pub schema_version: String,
    /// Quantization used for every reported coordinate ([`QUANTUM_PER_POINT`] as emitted
    /// by [`PdfiumBackend::geometry_probe`]).
    pub quantum_per_point: u32,
    /// Backend manifest for the loaded PDFium runtime.
    pub backend: BackendManifest,
    /// Probed pages in ascending page order; pages excluded by the configured page
    /// selection are omitted.
    pub pages: Vec<GeometryProbePage>,
}
95
/// Per-page debug geometry signals.
///
/// One value is produced for each page that [`PdfiumBackend::geometry_probe`] visits.
#[derive(Debug, Serialize)]
pub struct GeometryProbePage {
    /// Canonical page id.
    pub id: String,
    /// 1-based original page index (the zero-based PDFium page index plus one).
    pub index: u32,
    /// Quantized page width.
    pub width: i64,
    /// Quantized page height.
    pub height: i64,
    /// Page rotation in degrees.
    pub rotation: u16,
    /// PDFium text character count.
    pub char_count: i32,
    /// Optional PDFium text symbols available in this runtime.
    pub symbols: GeometryProbeSymbols,
    /// Per-character geometry records.
    pub chars: Vec<GeometryProbeChar>,
    /// Parser-like text runs with alternative geometry unions.
    pub runs: Vec<GeometryProbeRun>,
}
118
/// Optional PDFium geometry symbols discovered at runtime.
///
/// Each flag reports availability only; the geometry itself is carried by the
/// per-character and per-run records.
#[derive(Debug, Serialize)]
pub struct GeometryProbeSymbols {
    /// Whether FPDFText_GetCharOrigin is available.
    pub char_origin: bool,
    /// Whether FPDFText_GetLooseCharBox is available.
    pub loose_char_box: bool,
    /// Whether FPDFText_CountRects and FPDFText_GetRect are available (one flag for the pair).
    pub text_rects: bool,
}
129
/// Per-character geometry probe record.
///
/// Geometry and font fields are `Option`s: a field is `None` when the corresponding
/// value is not reported for this character.
#[derive(Debug, Serialize)]
pub struct GeometryProbeChar {
    /// Zero-based PDFium character index.
    pub index: i32,
    /// Unicode scalar value reported by PDFium.
    pub unicode: u32,
    /// Character as a string when it is a valid scalar value.
    pub text: Option<String>,
    /// Why this character would break or be skipped by the parser run builder.
    pub parser_action: String,
    /// Current parser-critical FPDFText_GetCharBox geometry.
    pub char_box: Option<QRect>,
    /// FPDFText_GetLooseCharBox geometry when the symbol is present.
    pub loose_char_box: Option<QRect>,
    /// FPDFText_GetCharOrigin point when the symbol is present.
    // NOTE(review): presumably `[x, y]` in quantized units — confirm against the producer.
    pub char_origin: Option<[i64; 2]>,
    /// Deterministic font id used by the parser.
    pub font_id: Option<String>,
    /// PDFium font descriptor flags used by the parser.
    pub font_flags: Option<u32>,
    /// Quantized font size used by the parser.
    pub font_size_q: Option<i64>,
}
154
/// Parser-like text run with alternative PDFium geometry sources.
///
/// `char_start..char_end` is a half-open range of PDFium character indices, while
/// `char_indices` lists the characters actually included in the run.
#[derive(Debug, Serialize)]
pub struct GeometryProbeRun {
    /// One-based run index on this page.
    pub index: u32,
    /// Run text after parser skip/break rules.
    pub text: String,
    /// First included PDFium character index.
    pub char_start: i32,
    /// Exclusive end PDFium character index.
    pub char_end: i32,
    /// Included character indices.
    pub char_indices: Vec<i32>,
    /// Current parser span bbox: union of FPDFText_GetCharBox records.
    pub char_box_union: Option<QRect>,
    /// Union of FPDFText_GetLooseCharBox records when available.
    pub loose_char_box_union: Option<QRect>,
    /// Rectangles from FPDFText_CountRects/GetRect for the run range when available.
    pub text_rects: Vec<QRect>,
    /// Union of text_rects when available.
    pub text_rect_union: Option<QRect>,
    /// Origin of first included character when available.
    // NOTE(review): presumably `[x, y]` in quantized units — confirm against the producer.
    pub first_origin: Option<[i64; 2]>,
    /// Origin of last included character when available.
    pub last_origin: Option<[i64; 2]>,
    /// Deterministic font id used by the parser.
    pub font_id: Option<String>,
    /// PDFium font descriptor flags used by the parser.
    pub font_flags: Option<u32>,
    /// Quantized font size used by the parser.
    pub font_size_q: Option<i64>,
}
187
/// Raw crop rendered from a PDF page.
///
/// This is the pre-encoding renderer boundary used by `ethos-render` work. It
/// deliberately exposes raw BGRA bytes and a byte hash before PNG/JPEG encoding
/// is added, so callers can test the renderer itself before artifact encoding.
///
/// Returned by [`PdfiumBackend::render_crop_raw`]. `PartialEq`/`Eq` are derived, so two
/// crops compare equal only when every field, including the pixel bytes, matches.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RawCrop {
    /// 1-based source page index.
    pub page_index: u32,
    /// Source bbox in Ethos quantized top-left coordinates.
    pub bbox: QRect,
    /// Crop width in pixels.
    pub width_px: u32,
    /// Crop height in pixels.
    pub height_px: u32,
    /// Bytes per crop row.
    pub stride: u32,
    /// Pixel format for `bytes`.
    pub pixel_format: &'static str,
    /// SHA-256 hex digest of `bytes`.
    pub sha256: String,
    /// Tightly packed crop bytes.
    pub bytes: Vec<u8>,
}
212
213impl PdfiumBackend {
214    /// Construct a backend using an explicit PDFium dynamic library path.
215    pub fn from_library_path(path: impl Into<PathBuf>) -> Self {
216        PdfiumBackend {
217            library_path: Some(path.into()),
218            artifact_path: None,
219            version: None,
220        }
221    }
222
223    /// Add an explicit downloaded PDFium release artifact path for archive-hash verification.
224    pub fn with_artifact_path(mut self, path: impl Into<PathBuf>) -> Self {
225        self.artifact_path = Some(path.into());
226        self
227    }
228
229    /// Construct a backend using an explicit PDFium path and pinned version string.
230    pub fn with_version(mut self, version: impl Into<String>) -> Self {
231        self.version = Some(version.into());
232        self
233    }
234
235    fn configured_library_path(&self) -> Option<PathBuf> {
236        self.library_path
237            .clone()
238            .or_else(|| env::var_os(PDFIUM_LIBRARY_PATH_ENV).map(PathBuf::from))
239    }
240
241    fn configured_artifact_path(&self) -> Option<PathBuf> {
242        self.artifact_path
243            .clone()
244            .or_else(|| env::var_os(PDFIUM_ARTIFACT_PATH_ENV).map(PathBuf::from))
245    }
246
247    fn configured_version_override(&self) -> Option<String> {
248        self.version
249            .clone()
250            .or_else(|| env::var(PDFIUM_VERSION_ENV).ok())
251    }
252
253    fn configured_version(&self) -> String {
254        self.configured_version_override()
255            .unwrap_or_else(|| pinned_pdfium_profile().version.clone())
256    }
257
258    /// Probe whether the configured PDFium dynamic library can be loaded and initialized.
259    ///
260    /// This does not parse a document. It uses the same library load, symbol resolution, and
261    /// process-global PDFium init/destroy path as extraction, so callers should run it in a
262    /// disposable subprocess when probing operator-provided libraries.
263    pub fn probe_library(&self) -> Result<BackendManifest, EthosError> {
264        let _guard = PDFIUM_LOCK.lock().unwrap_or_else(|e| e.into_inner());
265        let runtime = PdfiumRuntime::load(self)?;
266        drop(runtime);
267        Ok(self.manifest())
268    }
269
270    /// Produce a debug-only geometry-source probe from PDFium text APIs.
271    ///
272    /// The returned data is diagnostic evidence only. It is intentionally
273    /// separate from [`EthosPdfBackend::extract`] so parser behavior,
274    /// canonical JSON, and document fingerprints cannot change by accident.
275    pub fn geometry_probe(
276        &self,
277        pdf_bytes: &[u8],
278        config: &ParseConfig,
279    ) -> Result<GeometryProbeReport, EthosError> {
280        validate_pdf_header(pdf_bytes)?;
281        let _guard = PDFIUM_LOCK.lock().unwrap_or_else(|e| e.into_inner());
282        let runtime = PdfiumRuntime::load(self)?;
283        let doc = runtime.load_document(pdf_bytes)?;
284        let page_count = doc.page_count()?;
285        if page_count > config.limits.max_pages {
286            return Err(EthosError::new(
287                ErrorCode::PageLimitExceeded,
288                "page count exceeds configured limit",
289            ));
290        }
291        validate_page_selection(&config.pages, page_count)?;
292
293        let mut pages = Vec::new();
294        for page_index in 0..page_count {
295            let original_page = page_index + 1;
296            if !config.pages.contains(original_page) {
297                continue;
298            }
299            let page = doc.load_page(page_index)?;
300            pages.push(page.geometry_probe_page(original_page)?);
301        }
302
303        Ok(GeometryProbeReport {
304            schema_version: "ethos-pdfium-geometry-probe-v1".to_string(),
305            quantum_per_point: QUANTUM_PER_POINT,
306            backend: self.manifest(),
307            pages,
308        })
309    }
310
311    /// Render a raw BGRA crop for a 1-based page and quantized top-left bbox.
312    ///
313    /// The current boundary renders the page at 1 pixel per PDF point, then
314    /// crops the requested bbox. It is intentionally simple; direct crop-window
315    /// rendering can replace it later without changing the output contract.
316    pub fn render_crop_raw(
317        &self,
318        pdf_bytes: &[u8],
319        page_index: u32,
320        bbox: QRect,
321    ) -> Result<RawCrop, EthosError> {
322        validate_pdf_header(pdf_bytes)?;
323        if page_index == 0 {
324            return Err(EthosError::new(
325                ErrorCode::PageLimitExceeded,
326                "page selection out of document range",
327            ));
328        }
329        let _guard = PDFIUM_LOCK.lock().unwrap_or_else(|e| e.into_inner());
330        let runtime = PdfiumRuntime::load(self)?;
331        let doc = runtime.load_document(pdf_bytes)?;
332        let page_count = doc.page_count()?;
333        if page_index > page_count {
334            return Err(EthosError::new(
335                ErrorCode::PageLimitExceeded,
336                "page selection out of document range",
337            ));
338        }
339        let page = doc.load_page(page_index - 1)?;
340        page.render_crop_raw(page_index, bbox)
341    }
342}
343
344impl EthosPdfBackend for PdfiumBackend {
345    fn manifest(&self) -> BackendManifest {
346        let platform_sha256 = self
347            .configured_library_path()
348            .and_then(|path| std::fs::read(path).ok())
349            .map(|bytes| ethos_core::c14n::sha256_hex_bytes(&bytes))
350            .unwrap_or_else(|| "0".repeat(64));
351        BackendManifest {
352            id: "pdfium".to_string(),
353            phase: 1,
354            version: self.configured_version(),
355            platform_sha256,
356        }
357    }
358
359    fn page_count(&self, pdf_bytes: &[u8]) -> Result<u32, EthosError> {
360        validate_pdf_header(pdf_bytes)?;
361        let _guard = PDFIUM_LOCK.lock().unwrap_or_else(|e| e.into_inner());
362        let runtime = PdfiumRuntime::load(self)?;
363        let doc = runtime.load_document(pdf_bytes)?;
364        doc.page_count()
365    }
366
367    fn extract(&self, pdf_bytes: &[u8], config: &ParseConfig) -> Result<Extraction, EthosError> {
368        validate_pdf_header(pdf_bytes)?;
369        let _guard = PDFIUM_LOCK.lock().unwrap_or_else(|e| e.into_inner());
370        let runtime = PdfiumRuntime::load(self)?;
371        let doc = runtime.load_document(pdf_bytes)?;
372        let page_count = doc.page_count()?;
373        if page_count > config.limits.max_pages {
374            return Err(EthosError::new(
375                ErrorCode::PageLimitExceeded,
376                "page count exceeds configured limit",
377            ));
378        }
379        validate_page_selection(&config.pages, page_count)?;
380
381        let mut pages = Vec::new();
382        let mut spans = Vec::new();
383        let mut warnings = Vec::new();
384        let mut next_span = 1u32;
385        let mut next_warning = 1u32;
386
387        for page_index in 0..page_count {
388            let original_page = page_index + 1;
389            if !config.pages.contains(original_page) {
390                continue;
391            }
392            let page = doc.load_page(page_index)?;
393            let page_model = page.model_page(original_page)?;
394            let span_count_before = spans.len();
395            page.extract_text_spans(&page_model, &mut next_span, &mut spans)?;
396            if spans.len() == span_count_before {
397                warnings.push(Warning {
398                    id: warning_id(next_warning)?,
399                    code: WarningCode::ImageOnlyPage,
400                    message: "page has no extractable text; OCR is required for this page"
401                        .to_string(),
402                    page: Some(page_model.id.clone()),
403                    element_ref: None,
404                    span_ref: None,
405                    region_ref: None,
406                });
407                next_warning += 1;
408            }
409            pages.push(page_model);
410        }
411
412        if spans.is_empty() {
413            return Err(EthosError::new(
414                ErrorCode::OcrRequired,
415                "no extractable text; OCR is required",
416            ));
417        }
418
419        Ok(Extraction {
420            pages,
421            spans,
422            regions: Vec::new(),
423            warnings,
424        })
425    }
426}
427
428fn validate_page_selection(selection: &PageSelection, page_count: u32) -> Result<(), EthosError> {
429    selection.validate_against(page_count).map_err(|_| {
430        EthosError::new(
431            ErrorCode::PageLimitExceeded,
432            "page selection out of document range",
433        )
434    })
435}
436
437fn validate_pdf_header(pdf_bytes: &[u8]) -> Result<(), EthosError> {
438    let window = &pdf_bytes[..pdf_bytes.len().min(1024)];
439    if window.windows(5).any(|w| w == b"%PDF-") {
440        Ok(())
441    } else {
442        Err(EthosError::new(
443            ErrorCode::InvalidPdf,
444            "input does not contain a PDF header",
445        ))
446    }
447}
448
449fn quantize_coord(value: f64) -> Result<i64, EthosError> {
450    quantize(value, QUANTUM_PER_POINT)
451        .map_err(|_| EthosError::new(ErrorCode::InternalError, "coordinate quantization failed"))
452}
453
454fn pixel_extent(points: f64) -> Result<u32, EthosError> {
455    if !points.is_finite() || points <= 0.0 {
456        return Err(EthosError::new(
457            ErrorCode::CorruptPdf,
458            "PDF page has invalid dimensions",
459        ));
460    }
461    if points.ceil() > f64::from(c_int::MAX) {
462        return Err(EthosError::internal("render bitmap dimension overflow"));
463    }
464    Ok(points.ceil() as u32)
465}
466
467fn floor_quantized_pixel(value: i64) -> i64 {
468    value.div_euclid(i64::from(QUANTUM_PER_POINT))
469}
470
471fn ceil_quantized_pixel(value: i64) -> i64 {
472    let quantum = i64::from(QUANTUM_PER_POINT);
473    value
474        .checked_add(quantum - 1)
475        .unwrap_or(i64::MAX)
476        .div_euclid(quantum)
477}
478
/// Clamp a signed pixel coordinate into `0..=max`.
fn clamp_pixel(value: i64, max: u32) -> u32 {
    let upper = i64::from(max);
    if value <= 0 {
        0
    } else if value >= upper {
        max
    } else {
        // Strictly between 0 and `max`, so the value fits in `u32`.
        value as u32
    }
}
482
483fn crop_window(
484    bbox: QRect,
485    page_width_px: u32,
486    page_height_px: u32,
487) -> Result<(u32, u32, u32, u32), EthosError> {
488    let x0 = clamp_pixel(floor_quantized_pixel(bbox.x0), page_width_px);
489    let y0 = clamp_pixel(floor_quantized_pixel(bbox.y0), page_height_px);
490    let x1 = clamp_pixel(ceil_quantized_pixel(bbox.x1), page_width_px);
491    let y1 = clamp_pixel(ceil_quantized_pixel(bbox.y1), page_height_px);
492    if x0 >= x1 || y0 >= y1 {
493        return Err(EthosError::internal(
494            "crop bbox has no positive pixel extent",
495        ));
496    }
497    Ok((x0, y0, x1 - x0, y1 - y0))
498}
499
500fn qrect_from_pdfium_char_box(
501    page_height_pts: f64,
502    left: f64,
503    right: f64,
504    bottom: f64,
505    top: f64,
506) -> Result<QRect, EthosError> {
507    let x0 = left.min(right);
508    let x1 = left.max(right);
509    let y0 = page_height_pts - top.max(bottom);
510    let y1 = page_height_pts - top.min(bottom);
511    QRect::new(
512        quantize_coord(x0)?,
513        quantize_coord(y0)?,
514        quantize_coord(x1)?,
515        quantize_coord(y1)?,
516    )
517    .map_err(|_| EthosError::internal("malformed character bbox"))
518}
519
520fn union_rect(a: QRect, b: QRect) -> QRect {
521    QRect {
522        x0: a.x0.min(b.x0),
523        y0: a.y0.min(b.y0),
524        x1: a.x1.max(b.x1),
525        y1: a.y1.max(b.y1),
526    }
527}
528
529fn map_pdfium_error(code: c_ulong) -> EthosError {
530    match code {
531        4 => EthosError::new(
532            ErrorCode::PasswordProtected,
533            "document is encrypted or password-protected",
534        ),
535        5 => EthosError::new(
536            ErrorCode::UnsupportedPdfFeature,
537            "document uses a restricted security handler",
538        ),
539        3 => EthosError::new(ErrorCode::CorruptPdf, "PDF structure is corrupt"),
540        6 => EthosError::new(ErrorCode::CorruptPdf, "PDF page tree is corrupt"),
541        2 => EthosError::new(ErrorCode::CorruptPdf, "PDF could not be loaded"),
542        _ => EthosError::new(ErrorCode::CorruptPdf, "PDFium could not load the document"),
543    }
544}
545
/// Top-level shape of the embedded deterministic profile JSON.
///
/// Only the `backend` section is modeled here; it is the part this crate validates
/// and caches.
#[derive(Debug, Deserialize)]
struct DeterministicProfile {
    /// Pinned PDFium backend description.
    backend: PinnedPdfiumBackend,
}
550
/// The `backend` section of the deterministic profile: which PDFium build is pinned.
///
/// The expected values noted on the fields are the ones enforced by
/// `validate_pinned_pdfium_profile` and its helpers.
#[derive(Debug, Deserialize)]
struct PinnedPdfiumBackend {
    /// Backend identifier; must be `"pdfium"`.
    id: String,
    /// Backend phase; must be `1`.
    phase: u8,
    /// Pinned release version; must be `"chromium/7881"`. Reported in the backend
    /// manifest when the caller supplies no version override.
    version: String,
    /// Upstream version string; must be `"PDFium 151.0.7881.0"`.
    upstream_version: String,
    /// V8 status; must be `"disabled"`.
    v8: String,
    /// XFA status; must be `"disabled"`.
    xfa: String,
    /// Where the pinned binaries come from.
    distribution: PinnedPdfiumDistribution,
    /// Build flags of the pinned binaries.
    build_flags: PinnedPdfiumBuildFlags,
    /// SHA-256 of the release artifact per platform key (`macos-arm64`, `linux-x64`,
    /// `windows-x64`); compared against the configured artifact file.
    platform_hashes: BTreeMap<String, String>,
    /// Artifact metadata per platform key (same keys as `platform_hashes`).
    platform_artifacts: BTreeMap<String, PinnedPdfiumArtifact>,
    /// Path of the profile documentation; must be `"docs/pdfium-profile.md"`.
    profile_doc: String,
}
565
/// Provenance of the pinned PDFium binaries.
#[derive(Debug, Deserialize)]
struct PinnedPdfiumDistribution {
    /// Distribution source; must be `"bblanchon/pdfium-binaries"`.
    source: String,
    /// Release page URL; must start with
    /// `https://github.com/bblanchon/pdfium-binaries/releases/tag/`.
    release_url: String,
    /// Publication timestamp; only checked to end with `Z`.
    published_at: String,
    /// Attestation file published with the release.
    attestation: PinnedPdfiumAttestation,
}
573
/// Attestation file recorded for the pinned release.
#[derive(Debug, Deserialize)]
struct PinnedPdfiumAttestation {
    /// Attestation file name; must be `"pdfium-attestation.json"`.
    name: String,
    /// Attestation digest; must be 64 lowercase hexadecimal characters.
    sha256: String,
}
579
/// Build flags recorded for the pinned PDFium binaries.
///
/// Validation accepts exactly one combination: `pdf_is_standalone` set and every other
/// flag clear.
#[derive(Debug, Deserialize)]
struct PinnedPdfiumBuildFlags {
    /// Must be `false`.
    is_component_build: bool,
    /// Must be `false`.
    is_debug: bool,
    /// Must be `false` (V8 disabled).
    pdf_enable_v8: bool,
    /// Must be `false` (XFA disabled).
    pdf_enable_xfa: bool,
    /// Must be `true`.
    pdf_is_standalone: bool,
    /// Must be `false`.
    pdf_use_partition_alloc: bool,
}
589
/// Per-platform metadata for a pinned PDFium release artifact.
#[derive(Debug, Deserialize)]
struct PinnedPdfiumArtifact {
    /// Archive file name, e.g. `pdfium-linux-x64.tgz`; must end with `.tgz` and must not
    /// contain `-v8-` or `xfa`.
    name: String,
    /// Target OS label as used in the archive name: `mac`, `linux`, or `win`.
    target_os: String,
    /// Target CPU label as used in the archive name: `arm64` or `x64`.
    target_cpu: String,
    /// Location of the runtime library; must be non-empty.
    // NOTE(review): presumably a path inside the archive — confirm against the profile docs.
    runtime_library_path: String,
    /// SHA-256 of the runtime library; compared against the library file actually loaded.
    runtime_library_sha256: String,
}
598
599fn pinned_pdfium_profile() -> &'static PinnedPdfiumBackend {
600    PINNED_PDFIUM_PROFILE.get_or_init(|| {
601        let profile: DeterministicProfile = serde_json::from_str(DETERMINISTIC_PROFILE_JSON)
602            .expect("profiles/ethos-deterministic-v1.json is valid JSON");
603        validate_pinned_pdfium_profile(&profile.backend)
604            .expect("profiles/ethos-deterministic-v1.json pins a valid PDFium Phase 1 profile");
605        profile.backend
606    })
607}
608
609fn validate_pinned_pdfium_profile(profile: &PinnedPdfiumBackend) -> Result<(), &'static str> {
610    validate_pinned_pdfium_identity(profile)?;
611    validate_pinned_pdfium_distribution(&profile.distribution)?;
612    validate_pinned_pdfium_build_flags(&profile.build_flags)?;
613    validate_pinned_pdfium_platforms(profile)?;
614    Ok(())
615}
616
617fn validate_pinned_pdfium_identity(profile: &PinnedPdfiumBackend) -> Result<(), &'static str> {
618    if profile.id != "pdfium"
619        || profile.phase != 1
620        || profile.version != "chromium/7881"
621        || profile.upstream_version != "PDFium 151.0.7881.0"
622        || profile.v8 != "disabled"
623        || profile.xfa != "disabled"
624        || profile.profile_doc != "docs/pdfium-profile.md"
625    {
626        return Err("unexpected PDFium profile identity");
627    }
628    Ok(())
629}
630
631fn validate_pinned_pdfium_distribution(
632    distribution: &PinnedPdfiumDistribution,
633) -> Result<(), &'static str> {
634    if distribution.source != "bblanchon/pdfium-binaries"
635        || distribution.attestation.name != "pdfium-attestation.json"
636        || !is_sha256_hex(&distribution.attestation.sha256)
637        || !distribution
638            .release_url
639            .starts_with("https://github.com/bblanchon/pdfium-binaries/releases/tag/")
640        || !distribution.published_at.ends_with('Z')
641    {
642        return Err("unexpected PDFium distribution metadata");
643    }
644    Ok(())
645}
646
647fn validate_pinned_pdfium_build_flags(
648    build_flags: &PinnedPdfiumBuildFlags,
649) -> Result<(), &'static str> {
650    if build_flags.is_component_build
651        || build_flags.is_debug
652        || build_flags.pdf_enable_v8
653        || build_flags.pdf_enable_xfa
654        || !build_flags.pdf_is_standalone
655        || build_flags.pdf_use_partition_alloc
656    {
657        return Err("PDFium Phase 1 must be standalone release with V8/XFA disabled");
658    }
659    Ok(())
660}
661
662fn validate_pinned_pdfium_platforms(profile: &PinnedPdfiumBackend) -> Result<(), &'static str> {
663    for platform in ["macos-arm64", "linux-x64", "windows-x64"] {
664        let artifact_hash = profile
665            .platform_hashes
666            .get(platform)
667            .ok_or("missing PDFium artifact hash")?;
668        if !is_sha256_hex(artifact_hash) {
669            return Err("malformed PDFium artifact hash");
670        }
671        let artifact = profile
672            .platform_artifacts
673            .get(platform)
674            .ok_or("missing PDFium platform artifact metadata")?;
675        if artifact.name.contains("-v8-")
676            || artifact.name.contains("xfa")
677            || !artifact.name.ends_with(".tgz")
678            || artifact.runtime_library_path.is_empty()
679            || !is_sha256_hex(&artifact.runtime_library_sha256)
680        {
681            return Err("malformed PDFium platform artifact metadata");
682        }
683        match platform {
684            "macos-arm64"
685                if artifact.name == "pdfium-mac-arm64.tgz"
686                    && artifact.target_os == "mac"
687                    && artifact.target_cpu == "arm64" => {}
688            "linux-x64"
689                if artifact.name == "pdfium-linux-x64.tgz"
690                    && artifact.target_os == "linux"
691                    && artifact.target_cpu == "x64" => {}
692            "windows-x64"
693                if artifact.name == "pdfium-win-x64.tgz"
694                    && artifact.target_os == "win"
695                    && artifact.target_cpu == "x64" => {}
696            _ => return Err("unexpected PDFium platform artifact"),
697        }
698    }
699    Ok(())
700}
701
/// Whether `value` looks like a SHA-256 digest: exactly 64 lowercase hexadecimal characters.
///
/// Uppercase hex digits are rejected.
fn is_sha256_hex(value: &str) -> bool {
    const HEX_DIGEST_LEN: usize = 64;

    value.len() == HEX_DIGEST_LEN
        && value
            .bytes()
            .all(|byte| matches!(byte, b'0'..=b'9' | b'a'..=b'f'))
}
708
/// Profile key for the platform this crate was compiled for, if it is a pinned one.
///
/// Returns `macos-arm64`, `linux-x64`, or `windows-x64`, and `None` for every other
/// OS/architecture combination.
fn current_platform_key() -> Option<&'static str> {
    // Each entry pairs a compile-time target test with the profile key it selects.
    const PINNED_TARGETS: [(bool, &str); 3] = [
        (
            cfg!(all(target_os = "macos", target_arch = "aarch64")),
            "macos-arm64",
        ),
        (
            cfg!(all(target_os = "linux", target_arch = "x86_64")),
            "linux-x64",
        ),
        (
            cfg!(all(target_os = "windows", target_arch = "x86_64")),
            "windows-x64",
        ),
    ];

    PINNED_TARGETS
        .into_iter()
        .find(|&(is_current_target, _)| is_current_target)
        .map(|(_, key)| key)
}
720
721fn current_pdfium_pins(
722    profile: &PinnedPdfiumBackend,
723) -> Result<(&'static str, &str, &PinnedPdfiumArtifact), EthosError> {
724    let platform = current_platform_key().ok_or_else(|| {
725        EthosError::internal("pdfium phase 1 profile has no hash for this platform")
726    })?;
727    let artifact_hash = profile.platform_hashes.get(platform).ok_or_else(|| {
728        EthosError::internal("pdfium phase 1 profile has no hash for this platform")
729    })?;
730    let artifact = profile.platform_artifacts.get(platform).ok_or_else(|| {
731        EthosError::internal("pdfium phase 1 profile has no artifact for this platform")
732    })?;
733    Ok((platform, artifact_hash.as_str(), artifact))
734}
735
736fn validate_pinned_pdfium_payload(
737    backend: &PdfiumBackend,
738    library_path: &Path,
739) -> Result<(), EthosError> {
740    let profile = pinned_pdfium_profile();
741    if let Some(version) = backend.configured_version_override() {
742        let upstream_number = profile
743            .upstream_version
744            .strip_prefix("PDFium ")
745            .unwrap_or(&profile.upstream_version);
746        if version != profile.version
747            && version != profile.upstream_version
748            && version != upstream_number
749        {
750            return Err(EthosError::internal(
751                "pdfium version does not match pinned phase 1 profile",
752            ));
753        }
754    }
755
756    let (_, artifact_hash, artifact) = current_pdfium_pins(profile)?;
757    if let Some(artifact_path) = backend.configured_artifact_path() {
758        if !artifact_path.is_file() {
759            return Err(EthosError::internal(
760                "pdfium artifact path does not point to a file",
761            ));
762        }
763        let actual_artifact_hash = sha256_file(&artifact_path)?;
764        if actual_artifact_hash != artifact_hash {
765            return Err(EthosError::internal(
766                "pdfium artifact does not match pinned phase 1 profile",
767            ));
768        }
769    }
770
771    let library_hash = sha256_file(library_path)?;
772    if library_hash != artifact.runtime_library_sha256 {
773        return Err(EthosError::internal(
774            "pdfium library does not match pinned phase 1 profile",
775        ));
776    }
777
778    Ok(())
779}
780
781fn sha256_file(path: &Path) -> Result<String, EthosError> {
782    let bytes =
783        std::fs::read(path).map_err(|_| EthosError::internal("failed to read pdfium payload"))?;
784    Ok(ethos_core::c14n::sha256_hex_bytes(&bytes))
785}
786
// Opaque handles passed across the PDFium FFI boundary. All four are untyped
// `*mut c_void` pointers, so the compiler cannot tell them apart; the distinct alias
// names exist for readability only.
type FpdfDocument = *mut c_void;
type FpdfPage = *mut c_void;
type FpdfTextPage = *mut c_void;
type FpdfBitmap = *mut c_void;

// Function-pointer signatures for the PDFium entry points used by this crate.
//
// Each alias is declared twice: `extern "system"` on Windows and `extern "C"` everywhere
// else. The alias names follow the PDFium C symbols they are presumably bound to (for
// example `FpdfTextGetCharBox` for `FPDFText_GetCharBox`); the symbol lookup itself is
// outside this part of the file — confirm there.

// Library init: no arguments, no return value.
#[cfg(not(windows))]
type FpdfInitLibrary = unsafe extern "C" fn();
#[cfg(windows)]
type FpdfInitLibrary = unsafe extern "system" fn();
// Library teardown: no arguments, no return value.
#[cfg(not(windows))]
type FpdfDestroyLibrary = unsafe extern "C" fn();
#[cfg(windows)]
type FpdfDestroyLibrary = unsafe extern "system" fn();
// (buffer pointer, buffer length, C-string pointer) -> document handle.
// NOTE(review): the third argument is presumably the password — confirm at the call site.
#[cfg(not(windows))]
type FpdfLoadMemDocument64 =
    unsafe extern "C" fn(*const c_void, usize, *const c_char) -> FpdfDocument;
#[cfg(windows)]
type FpdfLoadMemDocument64 =
    unsafe extern "system" fn(*const c_void, usize, *const c_char) -> FpdfDocument;
// (document handle) -> nothing.
#[cfg(not(windows))]
type FpdfCloseDocument = unsafe extern "C" fn(FpdfDocument);
#[cfg(windows)]
type FpdfCloseDocument = unsafe extern "system" fn(FpdfDocument);
// () -> error code; `map_pdfium_error` takes a value of this type.
#[cfg(not(windows))]
type FpdfGetLastError = unsafe extern "C" fn() -> c_ulong;
#[cfg(windows)]
type FpdfGetLastError = unsafe extern "system" fn() -> c_ulong;
// (document handle) -> page count as a C int.
#[cfg(not(windows))]
type FpdfGetPageCount = unsafe extern "C" fn(FpdfDocument) -> c_int;
#[cfg(windows)]
type FpdfGetPageCount = unsafe extern "system" fn(FpdfDocument) -> c_int;
// (document handle, page index) -> page handle.
#[cfg(not(windows))]
type FpdfLoadPage = unsafe extern "C" fn(FpdfDocument, c_int) -> FpdfPage;
#[cfg(windows)]
type FpdfLoadPage = unsafe extern "system" fn(FpdfDocument, c_int) -> FpdfPage;
// (page handle) -> nothing.
#[cfg(not(windows))]
type FpdfClosePage = unsafe extern "C" fn(FpdfPage);
#[cfg(windows)]
type FpdfClosePage = unsafe extern "system" fn(FpdfPage);
// (page handle) -> width as f32.
#[cfg(not(windows))]
type FpdfGetPageWidthF = unsafe extern "C" fn(FpdfPage) -> f32;
#[cfg(windows)]
type FpdfGetPageWidthF = unsafe extern "system" fn(FpdfPage) -> f32;
// (page handle) -> height as f32.
#[cfg(not(windows))]
type FpdfGetPageHeightF = unsafe extern "C" fn(FpdfPage) -> f32;
#[cfg(windows)]
type FpdfGetPageHeightF = unsafe extern "system" fn(FpdfPage) -> f32;
// (page handle) -> rotation as a C int.
#[cfg(not(windows))]
type FpdfPageGetRotation = unsafe extern "C" fn(FpdfPage) -> c_int;
#[cfg(windows)]
type FpdfPageGetRotation = unsafe extern "system" fn(FpdfPage) -> c_int;
// (page handle) -> text page handle.
#[cfg(not(windows))]
type FpdfTextLoadPage = unsafe extern "C" fn(FpdfPage) -> FpdfTextPage;
#[cfg(windows)]
type FpdfTextLoadPage = unsafe extern "system" fn(FpdfPage) -> FpdfTextPage;
// (text page handle) -> nothing.
#[cfg(not(windows))]
type FpdfTextClosePage = unsafe extern "C" fn(FpdfTextPage);
#[cfg(windows)]
type FpdfTextClosePage = unsafe extern "system" fn(FpdfTextPage);
// (text page handle) -> character count as a C int.
#[cfg(not(windows))]
type FpdfTextCountChars = unsafe extern "C" fn(FpdfTextPage) -> c_int;
#[cfg(windows)]
type FpdfTextCountChars = unsafe extern "system" fn(FpdfTextPage) -> c_int;
// (text page handle, character index) -> code point as u32.
#[cfg(not(windows))]
type FpdfTextGetUnicode = unsafe extern "C" fn(FpdfTextPage, c_int) -> u32;
#[cfg(windows)]
type FpdfTextGetUnicode = unsafe extern "system" fn(FpdfTextPage, c_int) -> u32;
// (text page handle, character index, four `*mut f64` out-pointers) -> C int status.
// NOTE(review): out-pointer order is presumably left, right, bottom, top — confirm at
// the call site.
#[cfg(not(windows))]
type FpdfTextGetCharBox =
    unsafe extern "C" fn(FpdfTextPage, c_int, *mut f64, *mut f64, *mut f64, *mut f64) -> c_int;
#[cfg(windows)]
type FpdfTextGetCharBox =
    unsafe extern "system" fn(FpdfTextPage, c_int, *mut f64, *mut f64, *mut f64, *mut f64) -> c_int;
// (text page handle, character index, `*mut FsRectF` out-pointer) -> C int status.
#[cfg(not(windows))]
type FpdfTextGetLooseCharBox = unsafe extern "C" fn(FpdfTextPage, c_int, *mut FsRectF) -> c_int;
#[cfg(windows)]
type FpdfTextGetLooseCharBox =
    unsafe extern "system" fn(FpdfTextPage, c_int, *mut FsRectF) -> c_int;
// (text page handle, character index, two `*mut f64` out-pointers) -> C int status.
#[cfg(not(windows))]
type FpdfTextGetCharOrigin = unsafe extern "C" fn(FpdfTextPage, c_int, *mut f64, *mut f64) -> c_int;
#[cfg(windows)]
type FpdfTextGetCharOrigin =
    unsafe extern "system" fn(FpdfTextPage, c_int, *mut f64, *mut f64) -> c_int;
870#[cfg(not(windows))]
871type FpdfTextCountRects = unsafe extern "C" fn(FpdfTextPage, c_int, c_int) -> c_int;
872#[cfg(windows)]
873type FpdfTextCountRects = unsafe extern "system" fn(FpdfTextPage, c_int, c_int) -> c_int;
874#[cfg(not(windows))]
875type FpdfTextGetRect =
876    unsafe extern "C" fn(FpdfTextPage, c_int, *mut f64, *mut f64, *mut f64, *mut f64) -> c_int;
877#[cfg(windows)]
878type FpdfTextGetRect =
879    unsafe extern "system" fn(FpdfTextPage, c_int, *mut f64, *mut f64, *mut f64, *mut f64) -> c_int;
880#[cfg(not(windows))]
881type FpdfTextGetFontSize = unsafe extern "C" fn(FpdfTextPage, c_int) -> f64;
882#[cfg(windows)]
883type FpdfTextGetFontSize = unsafe extern "system" fn(FpdfTextPage, c_int) -> f64;
884#[cfg(not(windows))]
885type FpdfTextGetFontInfo =
886    unsafe extern "C" fn(FpdfTextPage, c_int, *mut c_void, c_ulong, *mut c_int) -> c_ulong;
887#[cfg(windows)]
888type FpdfTextGetFontInfo =
889    unsafe extern "system" fn(FpdfTextPage, c_int, *mut c_void, c_ulong, *mut c_int) -> c_ulong;
890#[cfg(not(windows))]
891type FpdfTextIsGenerated = unsafe extern "C" fn(FpdfTextPage, c_int) -> c_int;
892#[cfg(windows)]
893type FpdfTextIsGenerated = unsafe extern "system" fn(FpdfTextPage, c_int) -> c_int;
894#[cfg(not(windows))]
895type FpdfTextIsHyphen = unsafe extern "C" fn(FpdfTextPage, c_int) -> c_int;
896#[cfg(windows)]
897type FpdfTextIsHyphen = unsafe extern "system" fn(FpdfTextPage, c_int) -> c_int;
898#[cfg(not(windows))]
899type FpdfBitmapCreate = unsafe extern "C" fn(c_int, c_int, c_int) -> FpdfBitmap;
900#[cfg(windows)]
901type FpdfBitmapCreate = unsafe extern "system" fn(c_int, c_int, c_int) -> FpdfBitmap;
902#[cfg(not(windows))]
903type FpdfBitmapDestroy = unsafe extern "C" fn(FpdfBitmap);
904#[cfg(windows)]
905type FpdfBitmapDestroy = unsafe extern "system" fn(FpdfBitmap);
906#[cfg(not(windows))]
907type FpdfBitmapFillRect = unsafe extern "C" fn(FpdfBitmap, c_int, c_int, c_int, c_int, c_ulong);
908#[cfg(windows)]
909type FpdfBitmapFillRect =
910    unsafe extern "system" fn(FpdfBitmap, c_int, c_int, c_int, c_int, c_ulong);
911#[cfg(not(windows))]
912type FpdfBitmapGetBuffer = unsafe extern "C" fn(FpdfBitmap) -> *mut c_void;
913#[cfg(windows)]
914type FpdfBitmapGetBuffer = unsafe extern "system" fn(FpdfBitmap) -> *mut c_void;
915#[cfg(not(windows))]
916type FpdfBitmapGetStride = unsafe extern "C" fn(FpdfBitmap) -> c_int;
917#[cfg(windows)]
918type FpdfBitmapGetStride = unsafe extern "system" fn(FpdfBitmap) -> c_int;
919#[cfg(not(windows))]
920type FpdfRenderPageBitmap =
921    unsafe extern "C" fn(FpdfBitmap, FpdfPage, c_int, c_int, c_int, c_int, c_int, c_int);
922#[cfg(windows)]
923type FpdfRenderPageBitmap =
924    unsafe extern "system" fn(FpdfBitmap, FpdfPage, c_int, c_int, c_int, c_int, c_int, c_int);
925
926#[repr(C)]
927#[derive(Clone, Copy, Debug, Default)]
928struct FsRectF {
929    left: f32,
930    top: f32,
931    right: f32,
932    bottom: f32,
933}
934
935#[derive(Clone, Copy)]
936struct PdfiumFunctions {
937    init_library: FpdfInitLibrary,
938    destroy_library: FpdfDestroyLibrary,
939    load_mem_document64: FpdfLoadMemDocument64,
940    close_document: FpdfCloseDocument,
941    get_last_error: FpdfGetLastError,
942    get_page_count: FpdfGetPageCount,
943    load_page: FpdfLoadPage,
944    close_page: FpdfClosePage,
945    get_page_width_f: FpdfGetPageWidthF,
946    get_page_height_f: FpdfGetPageHeightF,
947    page_get_rotation: Option<FpdfPageGetRotation>,
948    text_load_page: FpdfTextLoadPage,
949    text_close_page: FpdfTextClosePage,
950    text_count_chars: FpdfTextCountChars,
951    text_get_unicode: FpdfTextGetUnicode,
952    text_get_char_box: FpdfTextGetCharBox,
953    text_get_loose_char_box: Option<FpdfTextGetLooseCharBox>,
954    text_get_char_origin: Option<FpdfTextGetCharOrigin>,
955    text_count_rects: Option<FpdfTextCountRects>,
956    text_get_rect: Option<FpdfTextGetRect>,
957    text_get_font_size: FpdfTextGetFontSize,
958    text_get_font_info: Option<FpdfTextGetFontInfo>,
959    text_is_generated: Option<FpdfTextIsGenerated>,
960    text_is_hyphen: Option<FpdfTextIsHyphen>,
961    bitmap_create: Option<FpdfBitmapCreate>,
962    bitmap_destroy: Option<FpdfBitmapDestroy>,
963    bitmap_fill_rect: Option<FpdfBitmapFillRect>,
964    bitmap_get_buffer: Option<FpdfBitmapGetBuffer>,
965    bitmap_get_stride: Option<FpdfBitmapGetStride>,
966    render_page_bitmap: Option<FpdfRenderPageBitmap>,
967}
968
969impl PdfiumFunctions {
970    fn load(library: &dylib::Library) -> Result<Self, EthosError> {
971        // SAFETY: symbols are loaded from the configured PDFium dynamic library and
972        // immediately copied into typed function pointers matching the C API.
973        unsafe {
974            Ok(PdfiumFunctions {
975                init_library: library.symbol(b"FPDF_InitLibrary\0")?,
976                destroy_library: library.symbol(b"FPDF_DestroyLibrary\0")?,
977                load_mem_document64: library.symbol(b"FPDF_LoadMemDocument64\0")?,
978                close_document: library.symbol(b"FPDF_CloseDocument\0")?,
979                get_last_error: library.symbol(b"FPDF_GetLastError\0")?,
980                get_page_count: library.symbol(b"FPDF_GetPageCount\0")?,
981                load_page: library.symbol(b"FPDF_LoadPage\0")?,
982                close_page: library.symbol(b"FPDF_ClosePage\0")?,
983                get_page_width_f: library.symbol(b"FPDF_GetPageWidthF\0")?,
984                get_page_height_f: library.symbol(b"FPDF_GetPageHeightF\0")?,
985                page_get_rotation: library.optional_symbol(b"FPDFPage_GetRotation\0"),
986                text_load_page: library.symbol(b"FPDFText_LoadPage\0")?,
987                text_close_page: library.symbol(b"FPDFText_ClosePage\0")?,
988                text_count_chars: library.symbol(b"FPDFText_CountChars\0")?,
989                text_get_unicode: library.symbol(b"FPDFText_GetUnicode\0")?,
990                text_get_char_box: library.symbol(b"FPDFText_GetCharBox\0")?,
991                text_get_loose_char_box: library.optional_symbol(b"FPDFText_GetLooseCharBox\0"),
992                text_get_char_origin: library.optional_symbol(b"FPDFText_GetCharOrigin\0"),
993                text_count_rects: library.optional_symbol(b"FPDFText_CountRects\0"),
994                text_get_rect: library.optional_symbol(b"FPDFText_GetRect\0"),
995                text_get_font_size: library.symbol(b"FPDFText_GetFontSize\0")?,
996                text_get_font_info: library.optional_symbol(b"FPDFText_GetFontInfo\0"),
997                text_is_generated: library.optional_symbol(b"FPDFText_IsGenerated\0"),
998                text_is_hyphen: library.optional_symbol(b"FPDFText_IsHyphen\0"),
999                bitmap_create: library.optional_symbol(b"FPDFBitmap_Create\0"),
1000                bitmap_destroy: library.optional_symbol(b"FPDFBitmap_Destroy\0"),
1001                bitmap_fill_rect: library.optional_symbol(b"FPDFBitmap_FillRect\0"),
1002                bitmap_get_buffer: library.optional_symbol(b"FPDFBitmap_GetBuffer\0"),
1003                bitmap_get_stride: library.optional_symbol(b"FPDFBitmap_GetStride\0"),
1004                render_page_bitmap: library.optional_symbol(b"FPDF_RenderPageBitmap\0"),
1005            })
1006        }
1007    }
1008
1009    fn geometry_probe_symbols(self) -> GeometryProbeSymbols {
1010        GeometryProbeSymbols {
1011            char_origin: self.text_get_char_origin.is_some(),
1012            loose_char_box: self.text_get_loose_char_box.is_some(),
1013            text_rects: self.text_count_rects.is_some() && self.text_get_rect.is_some(),
1014        }
1015    }
1016}
1017
1018struct PdfiumRuntime {
1019    _library: dylib::Library,
1020    funcs: PdfiumFunctions,
1021    initialized: bool,
1022}
1023
1024impl PdfiumRuntime {
1025    fn load(backend: &PdfiumBackend) -> Result<Self, EthosError> {
1026        let path = backend.configured_library_path().ok_or_else(|| {
1027            EthosError::internal(format!(
1028                "PDFium not found: set {PDFIUM_LIBRARY_PATH_ENV} to the caller-provided PDFium dynamic library path. {PDFIUM_SETUP_GUIDANCE}"
1029            ))
1030        })?;
1031        if !path.is_file() {
1032            return Err(EthosError::internal(format!(
1033                "pdfium library path does not point to a file. {PDFIUM_SETUP_GUIDANCE}"
1034            )));
1035        }
1036        validate_pinned_pdfium_payload(backend, &path)?;
1037
1038        let library = dylib::Library::open(&path)?;
1039        let funcs = PdfiumFunctions::load(&library)?;
1040        // SAFETY: FPDF_InitLibrary initializes process-global PDFium state. Calls are
1041        // serialized by PDFIUM_LOCK.
1042        unsafe { (funcs.init_library)() };
1043        Ok(PdfiumRuntime {
1044            _library: library,
1045            funcs,
1046            initialized: true,
1047        })
1048    }
1049
1050    fn load_document<'a>(&'a self, pdf_bytes: &[u8]) -> Result<PdfDocument<'a>, EthosError> {
1051        // SAFETY: PDFium reads the immutable byte slice only for the duration of
1052        // FPDF_LoadMemDocument64. The returned document is closed by PdfDocument::drop.
1053        let handle = unsafe {
1054            (self.funcs.load_mem_document64)(
1055                pdf_bytes.as_ptr().cast(),
1056                pdf_bytes.len(),
1057                ptr::null(),
1058            )
1059        };
1060        if handle.is_null() {
1061            // SAFETY: FPDF_GetLastError has no preconditions after a failed load.
1062            let code = unsafe { (self.funcs.get_last_error)() };
1063            Err(map_pdfium_error(code))
1064        } else {
1065            Ok(PdfDocument {
1066                funcs: &self.funcs,
1067                handle,
1068            })
1069        }
1070    }
1071}
1072
1073impl Drop for PdfiumRuntime {
1074    fn drop(&mut self) {
1075        if self.initialized {
1076            // SAFETY: paired with FPDF_InitLibrary above and serialized by PDFIUM_LOCK.
1077            unsafe { (self.funcs.destroy_library)() };
1078        }
1079    }
1080}
1081
1082struct PdfDocument<'a> {
1083    funcs: &'a PdfiumFunctions,
1084    handle: FpdfDocument,
1085}
1086
1087impl PdfDocument<'_> {
1088    fn page_count(&self) -> Result<u32, EthosError> {
1089        // SAFETY: handle is a live FPDF_DOCUMENT owned by self.
1090        let count = unsafe { (self.funcs.get_page_count)(self.handle) };
1091        if count <= 0 {
1092            return Err(EthosError::new(
1093                ErrorCode::CorruptPdf,
1094                "PDF has no readable pages",
1095            ));
1096        }
1097        u32::try_from(count).map_err(|_| EthosError::internal("page count overflow"))
1098    }
1099
1100    fn load_page(&self, page_index: u32) -> Result<PdfPage<'_>, EthosError> {
1101        let index =
1102            c_int::try_from(page_index).map_err(|_| EthosError::internal("page overflow"))?;
1103        // SAFETY: handle is live and index has been bounded by the caller.
1104        let handle = unsafe { (self.funcs.load_page)(self.handle, index) };
1105        if handle.is_null() {
1106            // SAFETY: FPDF_GetLastError has no preconditions after a failed page load.
1107            let code = unsafe { (self.funcs.get_last_error)() };
1108            Err(map_pdfium_error(code))
1109        } else {
1110            Ok(PdfPage {
1111                funcs: self.funcs,
1112                handle,
1113            })
1114        }
1115    }
1116}
1117
1118impl Drop for PdfDocument<'_> {
1119    fn drop(&mut self) {
1120        // SAFETY: handle is a live FPDF_DOCUMENT and is closed exactly once here.
1121        unsafe { (self.funcs.close_document)(self.handle) };
1122    }
1123}
1124
1125struct PdfPage<'a> {
1126    funcs: &'a PdfiumFunctions,
1127    handle: FpdfPage,
1128}
1129
1130impl PdfPage<'_> {
1131    fn width_pts(&self) -> f64 {
1132        // SAFETY: handle is a live FPDF_PAGE.
1133        unsafe { (self.funcs.get_page_width_f)(self.handle) as f64 }
1134    }
1135
1136    fn height_pts(&self) -> f64 {
1137        // SAFETY: handle is a live FPDF_PAGE.
1138        unsafe { (self.funcs.get_page_height_f)(self.handle) as f64 }
1139    }
1140
1141    fn rotation(&self) -> u16 {
1142        let Some(get_rotation) = self.funcs.page_get_rotation else {
1143            return 0;
1144        };
1145        // SAFETY: handle is a live FPDF_PAGE.
1146        match unsafe { get_rotation(self.handle) }.rem_euclid(4) {
1147            1 => 90,
1148            2 => 180,
1149            3 => 270,
1150            _ => 0,
1151        }
1152    }
1153
1154    fn model_page(&self, original_page: u32) -> Result<Page, EthosError> {
1155        Ok(Page {
1156            id: page_id(original_page)?,
1157            index: original_page,
1158            width: quantize_coord(self.width_pts())?,
1159            height: quantize_coord(self.height_pts())?,
1160            rotation: self.rotation(),
1161        })
1162    }
1163
1164    fn geometry_probe_page(&self, original_page: u32) -> Result<GeometryProbePage, EthosError> {
1165        let page = self.model_page(original_page)?;
1166        // SAFETY: handle is a live FPDF_PAGE. Text page is closed by PdfTextPage::drop.
1167        let text_handle = unsafe { (self.funcs.text_load_page)(self.handle) };
1168        if text_handle.is_null() {
1169            return Ok(GeometryProbePage {
1170                id: page.id,
1171                index: page.index,
1172                width: page.width,
1173                height: page.height,
1174                rotation: page.rotation,
1175                char_count: 0,
1176                symbols: self.funcs.geometry_probe_symbols(),
1177                chars: Vec::new(),
1178                runs: Vec::new(),
1179            });
1180        }
1181        let text_page = PdfTextPage {
1182            funcs: self.funcs,
1183            handle: text_handle,
1184        };
1185        text_page.geometry_probe(&page, self.height_pts())
1186    }
1187
1188    fn extract_text_spans(
1189        &self,
1190        page: &Page,
1191        next_span: &mut u32,
1192        spans: &mut Vec<Span>,
1193    ) -> Result<(), EthosError> {
1194        // SAFETY: handle is a live FPDF_PAGE. Text page is closed by PdfTextPage::drop.
1195        let text_handle = unsafe { (self.funcs.text_load_page)(self.handle) };
1196        if text_handle.is_null() {
1197            return Ok(());
1198        }
1199        let text_page = PdfTextPage {
1200            funcs: self.funcs,
1201            handle: text_handle,
1202        };
1203        text_page.extract_runs(page, self.height_pts(), next_span, spans)
1204    }
1205
1206    fn render_crop_raw(&self, page_index: u32, bbox: QRect) -> Result<RawCrop, EthosError> {
1207        let bitmap = RenderBitmap::render_page(
1208            self.funcs,
1209            self.handle,
1210            pixel_extent(self.width_pts())?,
1211            pixel_extent(self.height_pts())?,
1212        )?;
1213        let (x0, y0, width_px, height_px) = crop_window(bbox, bitmap.width_px, bitmap.height_px)?;
1214        let bytes = bitmap.crop_bytes(x0, y0, width_px, height_px)?;
1215        Ok(RawCrop {
1216            page_index,
1217            bbox,
1218            width_px,
1219            height_px,
1220            stride: width_px
1221                .checked_mul(4)
1222                .ok_or_else(|| EthosError::internal("crop stride overflow"))?,
1223            pixel_format: "bgra_8u",
1224            sha256: ethos_core::c14n::sha256_hex_bytes(&bytes),
1225            bytes,
1226        })
1227    }
1228}
1229
1230impl Drop for PdfPage<'_> {
1231    fn drop(&mut self) {
1232        // SAFETY: handle is a live FPDF_PAGE and is closed exactly once here.
1233        unsafe { (self.funcs.close_page)(self.handle) };
1234    }
1235}
1236
/// Wrapper around a PDFium text-page handle.
///
/// `PdfPage` constructs it only after checking that the handle returned by
/// `text_load_page` is non-null.
struct PdfTextPage<'a> {
    /// Entry points used for every call made against `handle`.
    funcs: &'a PdfiumFunctions,
    /// Raw text-page handle.
    handle: FpdfTextPage,
}
1241
1242struct RenderBitmap<'a> {
1243    funcs: &'a PdfiumFunctions,
1244    handle: FpdfBitmap,
1245    width_px: u32,
1246    height_px: u32,
1247    stride: usize,
1248}
1249
1250impl RenderBitmap<'_> {
1251    fn render_page(
1252        funcs: &PdfiumFunctions,
1253        page: FpdfPage,
1254        width_px: u32,
1255        height_px: u32,
1256    ) -> Result<RenderBitmap<'_>, EthosError> {
1257        let Some(bitmap_create) = funcs.bitmap_create else {
1258            return Err(EthosError::internal(
1259                "pdfium library is missing bitmap render symbols",
1260            ));
1261        };
1262        let Some(bitmap_fill_rect) = funcs.bitmap_fill_rect else {
1263            return Err(EthosError::internal(
1264                "pdfium library is missing bitmap render symbols",
1265            ));
1266        };
1267        let Some(render_page_bitmap) = funcs.render_page_bitmap else {
1268            return Err(EthosError::internal(
1269                "pdfium library is missing bitmap render symbols",
1270            ));
1271        };
1272        let width = c_int::try_from(width_px)
1273            .map_err(|_| EthosError::internal("render bitmap width overflow"))?;
1274        let height = c_int::try_from(height_px)
1275            .map_err(|_| EthosError::internal("render bitmap height overflow"))?;
1276
1277        // SAFETY: width/height are positive bounded c_int values. Bitmap is destroyed by Drop.
1278        let handle = unsafe { bitmap_create(width, height, 1) };
1279        if handle.is_null() {
1280            return Err(EthosError::internal(
1281                "pdfium failed to allocate render bitmap",
1282            ));
1283        }
1284        let mut bitmap = RenderBitmap {
1285            funcs,
1286            handle,
1287            width_px,
1288            height_px,
1289            stride: 0,
1290        };
1291        // SAFETY: handle is a live bitmap. Fill with opaque white for deterministic background.
1292        unsafe { bitmap_fill_rect(bitmap.handle, 0, 0, width, height, 0xFFFF_FFFF) };
1293        // SAFETY: handle and page are live. Render uses no callbacks and writes into the bitmap.
1294        unsafe { render_page_bitmap(bitmap.handle, page, 0, 0, width, height, 0, 0) };
1295        bitmap.stride = bitmap.read_stride()?;
1296        Ok(bitmap)
1297    }
1298
1299    fn read_stride(&self) -> Result<usize, EthosError> {
1300        let Some(bitmap_get_stride) = self.funcs.bitmap_get_stride else {
1301            return Err(EthosError::internal(
1302                "pdfium library is missing bitmap render symbols",
1303            ));
1304        };
1305        // SAFETY: handle is a live bitmap.
1306        let stride = unsafe { bitmap_get_stride(self.handle) };
1307        if stride <= 0 {
1308            return Err(EthosError::internal(
1309                "pdfium render bitmap has invalid stride",
1310            ));
1311        }
1312        usize::try_from(stride).map_err(|_| EthosError::internal("render bitmap stride overflow"))
1313    }
1314
1315    fn crop_bytes(
1316        &self,
1317        x0: u32,
1318        y0: u32,
1319        width_px: u32,
1320        height_px: u32,
1321    ) -> Result<Vec<u8>, EthosError> {
1322        let Some(bitmap_get_buffer) = self.funcs.bitmap_get_buffer else {
1323            return Err(EthosError::internal(
1324                "pdfium library is missing bitmap render symbols",
1325            ));
1326        };
1327        // SAFETY: handle is a live bitmap.
1328        let ptr = unsafe { bitmap_get_buffer(self.handle) };
1329        if ptr.is_null() {
1330            return Err(EthosError::internal("pdfium render bitmap has null buffer"));
1331        }
1332        let full_len = self
1333            .stride
1334            .checked_mul(
1335                usize::try_from(self.height_px)
1336                    .map_err(|_| EthosError::internal("render bitmap height overflow"))?,
1337            )
1338            .ok_or_else(|| EthosError::internal("render bitmap buffer length overflow"))?;
1339        // SAFETY: PDFium owns a live bitmap buffer of stride * height bytes for this bitmap.
1340        let full = unsafe { slice::from_raw_parts(ptr.cast::<u8>(), full_len) };
1341
1342        let x0 = usize::try_from(x0).map_err(|_| EthosError::internal("crop x overflow"))?;
1343        let y0 = usize::try_from(y0).map_err(|_| EthosError::internal("crop y overflow"))?;
1344        let width =
1345            usize::try_from(width_px).map_err(|_| EthosError::internal("crop width overflow"))?;
1346        let height =
1347            usize::try_from(height_px).map_err(|_| EthosError::internal("crop height overflow"))?;
1348        let row_bytes = width
1349            .checked_mul(4)
1350            .ok_or_else(|| EthosError::internal("crop row width overflow"))?;
1351        let mut out = Vec::with_capacity(
1352            row_bytes
1353                .checked_mul(height)
1354                .ok_or_else(|| EthosError::internal("crop buffer length overflow"))?,
1355        );
1356        for row in 0..height {
1357            let src_start = y0
1358                .checked_add(row)
1359                .and_then(|y| y.checked_mul(self.stride))
1360                .and_then(|base| base.checked_add(x0.checked_mul(4)?))
1361                .ok_or_else(|| EthosError::internal("crop source offset overflow"))?;
1362            let src_end = src_start
1363                .checked_add(row_bytes)
1364                .ok_or_else(|| EthosError::internal("crop source row overflow"))?;
1365            if src_end > full.len() {
1366                return Err(EthosError::internal(
1367                    "crop source row exceeds render bitmap",
1368                ));
1369            }
1370            out.extend_from_slice(&full[src_start..src_end]);
1371        }
1372        Ok(out)
1373    }
1374}
1375
1376impl Drop for RenderBitmap<'_> {
1377    fn drop(&mut self) {
1378        if let Some(bitmap_destroy) = self.funcs.bitmap_destroy {
1379            // SAFETY: handle is a live FPDF_BITMAP and is destroyed exactly once here.
1380            unsafe { bitmap_destroy(self.handle) };
1381        }
1382    }
1383}
1384
1385impl PdfTextPage<'_> {
1386    fn geometry_probe(
1387        &self,
1388        page: &Page,
1389        page_height_pts: f64,
1390    ) -> Result<GeometryProbePage, EthosError> {
1391        // SAFETY: handle is a live FPDF_TEXTPAGE.
1392        let count = unsafe { (self.funcs.text_count_chars)(self.handle) };
1393        if count < 0 {
1394            return Err(EthosError::new(
1395                ErrorCode::CorruptPdf,
1396                "PDF text page could not be read",
1397            ));
1398        }
1399
1400        let mut chars = Vec::new();
1401        let mut run = GeometryRunBuilder::default();
1402        let mut runs = Vec::new();
1403        let mut next_run = 1u32;
1404        for index in 0..count {
1405            let record = self.geometry_probe_char(index, page_height_pts)?;
1406            match record.parser_action.as_str() {
1407                "include" => {
1408                    if run.has_style_change(&record.font_id, record.font_size_q, record.font_flags)
1409                    {
1410                        run.flush(self, page_height_pts, &mut next_run, &mut runs)?;
1411                    }
1412                    run.push(&record);
1413                }
1414                "skip_generated_hyphen" => {}
1415                _ => run.flush(self, page_height_pts, &mut next_run, &mut runs)?,
1416            }
1417            chars.push(record);
1418        }
1419        run.flush(self, page_height_pts, &mut next_run, &mut runs)?;
1420
1421        Ok(GeometryProbePage {
1422            id: page.id.clone(),
1423            index: page.index,
1424            width: page.width,
1425            height: page.height,
1426            rotation: page.rotation,
1427            char_count: count,
1428            symbols: self.funcs.geometry_probe_symbols(),
1429            chars,
1430            runs,
1431        })
1432    }
1433
1434    fn geometry_probe_char(
1435        &self,
1436        index: c_int,
1437        page_height_pts: f64,
1438    ) -> Result<GeometryProbeChar, EthosError> {
1439        // SAFETY: index is in range for this text page.
1440        let unicode = unsafe { (self.funcs.text_get_unicode)(self.handle, index) };
1441        let ch = char::from_u32(unicode);
1442        let parser_action = match ch {
1443            None => "break_invalid_unicode",
1444            Some(_) if self.is_generated_hyphen(index) => "skip_generated_hyphen",
1445            Some(ch) if should_break_text_run(ch) => "break_whitespace_or_control",
1446            Some(_) => "include",
1447        };
1448
1449        let font_info = self.font_info(index);
1450        Ok(GeometryProbeChar {
1451            index,
1452            unicode,
1453            text: ch.map(|ch| ch.to_string()),
1454            parser_action: parser_action.to_string(),
1455            char_box: self.char_bbox(index, page_height_pts)?,
1456            loose_char_box: self.loose_char_bbox(index, page_height_pts)?,
1457            char_origin: self.char_origin(index, page_height_pts)?,
1458            font_id: font_info.font_id,
1459            font_flags: font_info.font_flags,
1460            font_size_q: self.font_size_q(index),
1461        })
1462    }
1463
1464    fn extract_runs(
1465        &self,
1466        page: &Page,
1467        page_height_pts: f64,
1468        next_span: &mut u32,
1469        spans: &mut Vec<Span>,
1470    ) -> Result<(), EthosError> {
1471        // SAFETY: handle is a live FPDF_TEXTPAGE.
1472        let count = unsafe { (self.funcs.text_count_chars)(self.handle) };
1473        if count < 0 {
1474            // A PDFium text-page failure invalidates extraction for the whole document.
1475            // Treating it as image-only would hide a backend read failure behind OCR fallback.
1476            return Err(EthosError::new(
1477                ErrorCode::CorruptPdf,
1478                "PDF text page could not be read",
1479            ));
1480        }
1481        if count == 0 {
1482            return Ok(());
1483        }
1484
1485        let mut run = SpanRun::default();
1486        for index in 0..count {
1487            // SAFETY: index is in 0..count for this text page.
1488            let codepoint = unsafe { (self.funcs.text_get_unicode)(self.handle, index) };
1489            let Some(ch) = char::from_u32(codepoint) else {
1490                run.flush(page, next_span, spans)?;
1491                continue;
1492            };
1493            if self.is_generated_hyphen(index) {
1494                continue;
1495            }
1496            if should_break_text_run(ch) {
1497                run.flush(page, next_span, spans)?;
1498                continue;
1499            }
1500
1501            let Some(bbox) = self.char_bbox(index, page_height_pts)? else {
1502                run.flush(page, next_span, spans)?;
1503                continue;
1504            };
1505            let font_size_q = self.font_size_q(index);
1506            let font_info = self.font_info(index);
1507            if run.has_style_change(&font_info.font_id, font_size_q) {
1508                run.flush(page, next_span, spans)?;
1509            }
1510            let origin = self.char_origin(index, page_height_pts)?;
1511            run.push(ch, bbox, origin, font_info.font_id, font_size_q);
1512        }
1513        run.flush(page, next_span, spans)
1514    }
1515
1516    fn char_bbox(&self, index: c_int, page_height_pts: f64) -> Result<Option<QRect>, EthosError> {
1517        let mut left = 0.0f64;
1518        let mut right = 0.0f64;
1519        let mut bottom = 0.0f64;
1520        let mut top = 0.0f64;
1521        // SAFETY: all pointers refer to initialized local f64 values and index is in range.
1522        let ok = unsafe {
1523            (self.funcs.text_get_char_box)(
1524                self.handle,
1525                index,
1526                &mut left,
1527                &mut right,
1528                &mut bottom,
1529                &mut top,
1530            )
1531        };
1532        if ok == 0 {
1533            return Ok(None);
1534        }
1535        Ok(Some(qrect_from_pdfium_char_box(
1536            page_height_pts,
1537            left,
1538            right,
1539            bottom,
1540            top,
1541        )?))
1542    }
1543
1544    fn loose_char_bbox(
1545        &self,
1546        index: c_int,
1547        page_height_pts: f64,
1548    ) -> Result<Option<QRect>, EthosError> {
1549        let Some(get_loose_char_box) = self.funcs.text_get_loose_char_box else {
1550            return Ok(None);
1551        };
1552        let mut rect = FsRectF::default();
1553        // SAFETY: rect points to initialized writable storage and index is in range.
1554        let ok = unsafe { get_loose_char_box(self.handle, index, &mut rect) };
1555        if ok == 0 {
1556            return Ok(None);
1557        }
1558        Ok(Some(qrect_from_pdfium_char_box(
1559            page_height_pts,
1560            f64::from(rect.left),
1561            f64::from(rect.right),
1562            f64::from(rect.bottom),
1563            f64::from(rect.top),
1564        )?))
1565    }
1566
1567    fn char_origin(
1568        &self,
1569        index: c_int,
1570        page_height_pts: f64,
1571    ) -> Result<Option<[i64; 2]>, EthosError> {
1572        let Some(get_char_origin) = self.funcs.text_get_char_origin else {
1573            return Ok(None);
1574        };
1575        let mut x = 0.0f64;
1576        let mut y = 0.0f64;
1577        // SAFETY: pointers refer to initialized writable f64 values and index is in range.
1578        let ok = unsafe { get_char_origin(self.handle, index, &mut x, &mut y) };
1579        if ok == 0 {
1580            return Ok(None);
1581        }
1582        Ok(Some([
1583            quantize_coord(x)?,
1584            quantize_coord(page_height_pts - y)?,
1585        ]))
1586    }
1587
1588    fn text_rects(
1589        &self,
1590        char_start: c_int,
1591        char_count: c_int,
1592        page_height_pts: f64,
1593    ) -> Result<Vec<QRect>, EthosError> {
1594        let (Some(count_rects), Some(get_rect)) =
1595            (self.funcs.text_count_rects, self.funcs.text_get_rect)
1596        else {
1597            return Ok(Vec::new());
1598        };
1599        if char_count <= 0 {
1600            return Ok(Vec::new());
1601        }
1602        // SAFETY: char_start/char_count identify a range observed from this text page.
1603        let rect_count = unsafe { count_rects(self.handle, char_start, char_count) };
1604        if rect_count <= 0 {
1605            return Ok(Vec::new());
1606        }
1607        let mut rects = Vec::new();
1608        for rect_index in 0..rect_count {
1609            let mut left = 0.0f64;
1610            let mut top = 0.0f64;
1611            let mut right = 0.0f64;
1612            let mut bottom = 0.0f64;
1613            // SAFETY: pointers refer to initialized writable f64 values.
1614            let ok = unsafe {
1615                get_rect(
1616                    self.handle,
1617                    rect_index,
1618                    &mut left,
1619                    &mut top,
1620                    &mut right,
1621                    &mut bottom,
1622                )
1623            };
1624            if ok != 0 {
1625                rects.push(qrect_from_pdfium_char_box(
1626                    page_height_pts,
1627                    left,
1628                    right,
1629                    bottom,
1630                    top,
1631                )?);
1632            }
1633        }
1634        Ok(rects)
1635    }
1636
1637    fn font_size_q(&self, index: c_int) -> Option<i64> {
1638        // SAFETY: index is in range.
1639        let size = unsafe { (self.funcs.text_get_font_size)(self.handle, index) };
1640        if size <= 0.0 {
1641            return None;
1642        }
1643        quantize(size, QUANTUM_PER_POINT).ok()
1644    }
1645
1646    fn font_info(&self, index: c_int) -> PdfFontInfo {
1647        let Some(get_font_info) = self.funcs.text_get_font_info else {
1648            return PdfFontInfo::default();
1649        };
1650        // SAFETY: index is in range; null buffer asks PDFium for the UTF-8 byte length.
1651        let len =
1652            unsafe { (get_font_info)(self.handle, index, ptr::null_mut(), 0, ptr::null_mut()) };
1653        if len == 0 || len > 4096 {
1654            return PdfFontInfo::default();
1655        }
1656
1657        let Ok(len_usize) = usize::try_from(len) else {
1658            return PdfFontInfo::default();
1659        };
1660        let mut buffer = vec![0u8; len_usize];
1661        let mut flags = 0;
1662        // SAFETY: buffer is writable for len bytes; flags points to initialized storage.
1663        let written = unsafe {
1664            (get_font_info)(
1665                self.handle,
1666                index,
1667                buffer.as_mut_ptr().cast(),
1668                len,
1669                &mut flags,
1670            )
1671        };
1672        if written == 0 || written > len {
1673            return PdfFontInfo::default();
1674        }
1675        let nul = buffer.iter().position(|b| *b == 0).unwrap_or(buffer.len());
1676        let raw = std::str::from_utf8(&buffer[..nul]).ok();
1677        PdfFontInfo {
1678            font_id: raw.and_then(deterministic_font_id),
1679            font_flags: u32::try_from(flags).ok(),
1680        }
1681    }
1682
1683    fn is_generated_hyphen(&self, index: c_int) -> bool {
1684        let (Some(text_is_generated), Some(text_is_hyphen)) =
1685            (self.funcs.text_is_generated, self.funcs.text_is_hyphen)
1686        else {
1687            return false;
1688        };
1689        // SAFETY: index is in range for this text page.
1690        unsafe {
1691            text_is_generated(self.handle, index) == 1 && text_is_hyphen(self.handle, index) == 1
1692        }
1693    }
1694}
1695
1696impl Drop for PdfTextPage<'_> {
1697    fn drop(&mut self) {
1698        // SAFETY: handle is a live FPDF_TEXTPAGE and is closed exactly once here.
1699        unsafe { (self.funcs.text_close_page)(self.handle) };
1700    }
1701}
1702
/// Returns `true` for characters that end the current text run: NUL, any whitespace
/// character, and any control character.
fn should_break_text_run(ch: char) -> bool {
    match ch {
        '\0' => true,
        other => other.is_whitespace() || other.is_control(),
    }
}
1706
/// Accumulator for the characters of one text span before it is emitted as a `Span`.
///
/// `push` appends one character together with its geometry and font data; `flush`
/// turns the accumulated state into a `Span` and resets the accumulator.
#[derive(Default)]
struct SpanRun {
    /// Characters pushed since the last flush.
    text: String,
    /// Union of the bounding boxes of every pushed character; `None` while nothing is pushed.
    bbox: Option<QRect>,
    /// First non-`None` origin pushed since the last flush.
    first_origin: Option<[i64; 2]>,
    /// Origin supplied with the most recently pushed character (may be `None`).
    last_origin: Option<[i64; 2]>,
    /// First non-`None` font id pushed since the last flush.
    font_id: Option<String>,
    /// First non-`None` quantized font size pushed since the last flush.
    font_size_q: Option<i64>,
}
1716
/// Accumulator for one geometry-probe run before it is emitted as a `GeometryProbeRun`.
///
/// `push` folds in one `GeometryProbeChar`; `flush` emits the run, including the text
/// rectangles queried for the covered character range.
#[derive(Default)]
struct GeometryRunBuilder {
    /// Concatenated text of the pushed characters (characters without text add nothing).
    text: String,
    /// Index of every pushed character, in push order.
    char_indices: Vec<i32>,
    /// Union of the pushed characters' char boxes, ignoring characters that have none.
    char_box_union: Option<QRect>,
    /// Union of the pushed characters' loose char boxes, ignoring characters that have none.
    loose_char_box_union: Option<QRect>,
    /// First non-`None` origin pushed since the last flush.
    first_origin: Option<[i64; 2]>,
    /// Origin of the most recently pushed character (may be `None`).
    last_origin: Option<[i64; 2]>,
    /// First non-`None` font id pushed since the last flush.
    font_id: Option<String>,
    /// First non-`None` quantized font size pushed since the last flush.
    font_size_q: Option<i64>,
    /// First non-`None` font flags pushed since the last flush.
    font_flags: Option<u32>,
}
1729
/// Font metadata resolved for a single character; the default value has both fields unset.
#[derive(Default)]
struct PdfFontInfo {
    /// Font id derived from the PDFium font name via `deterministic_font_id`, if any.
    font_id: Option<String>,
    /// Font flags reported by PDFium, when they fit in a `u32`.
    font_flags: Option<u32>,
}
1735
/// Deserialized form of the bundled font substitution table JSON.
///
/// `validate_font_substitution_table` pins the expected metadata values and checks
/// every mapping before the table is used.
#[derive(Debug, Deserialize)]
struct FontSubstitutionTable {
    /// Schema version of the table document; validation requires `"1.0.0"`.
    schema_version: String,
    /// Table identifier; validation requires `"ethos-font-substitution-v1"`.
    table_id: String,
    /// Table content version; validation requires `"1.0.0"`.
    version: String,
    /// Font id returned for a normalized font name that has no mapping.
    default_unresolved_font_id: String,
    /// Source-name to font-id mappings; sources must be unique.
    mappings: Vec<FontSubstitutionMapping>,
}
1744
/// One entry of the font substitution table.
#[derive(Debug, Deserialize)]
struct FontSubstitutionMapping {
    /// Normalized font name this entry matches (compared for exact equality); never empty.
    source: String,
    /// Substitute font id; validation requires the `subst:` prefix.
    font_id: String,
}
1750
1751impl SpanRun {
1752    fn has_style_change(&self, font_id: &Option<String>, font_size_q: Option<i64>) -> bool {
1753        !self.text.is_empty() && (self.font_id != *font_id || self.font_size_q != font_size_q)
1754    }
1755
1756    fn push(
1757        &mut self,
1758        ch: char,
1759        bbox: QRect,
1760        origin: Option<[i64; 2]>,
1761        font_id: Option<String>,
1762        font_size_q: Option<i64>,
1763    ) {
1764        self.text.push(ch);
1765        self.bbox = Some(match self.bbox {
1766            Some(existing) => union_rect(existing, bbox),
1767            None => bbox,
1768        });
1769        if self.first_origin.is_none() {
1770            self.first_origin = origin;
1771        }
1772        self.last_origin = origin;
1773        if self.font_id.is_none() {
1774            self.font_id = font_id;
1775        }
1776        if self.font_size_q.is_none() {
1777            self.font_size_q = font_size_q;
1778        }
1779    }
1780
1781    fn flush(
1782        &mut self,
1783        page: &Page,
1784        next_span: &mut u32,
1785        spans: &mut Vec<Span>,
1786    ) -> Result<(), EthosError> {
1787        if self.text.is_empty() {
1788            return Ok(());
1789        }
1790        let bbox = self
1791            .bbox
1792            .ok_or_else(|| EthosError::internal("span run has text without bbox"))?;
1793        let origin_locator = match (self.first_origin.take(), self.last_origin.take()) {
1794            (Some(first_origin), Some(last_origin)) => Some(SpanOriginLocator {
1795                policy: ORIGIN_LOCATOR_POLICY.to_string(),
1796                first_origin,
1797                last_origin,
1798            }),
1799            _ => None,
1800        };
1801        spans.push(Span {
1802            id: span_id(*next_span)?,
1803            page: page.id.clone(),
1804            bbox,
1805            origin_locator,
1806            text: std::mem::take(&mut self.text),
1807            font_id: self.font_id.take(),
1808            font_size_q: self.font_size_q,
1809            char_start: None,
1810            char_end: None,
1811            warning_refs: Vec::new(),
1812        });
1813        *next_span += 1;
1814        self.bbox = None;
1815        self.first_origin = None;
1816        self.last_origin = None;
1817        self.font_id = None;
1818        self.font_size_q = None;
1819        Ok(())
1820    }
1821}
1822
1823impl GeometryRunBuilder {
1824    fn has_style_change(
1825        &self,
1826        font_id: &Option<String>,
1827        font_size_q: Option<i64>,
1828        font_flags: Option<u32>,
1829    ) -> bool {
1830        !self.text.is_empty()
1831            && (self.font_id != *font_id
1832                || self.font_size_q != font_size_q
1833                || self.font_flags != font_flags)
1834    }
1835
1836    fn push(&mut self, ch: &GeometryProbeChar) {
1837        if let Some(text) = &ch.text {
1838            self.text.push_str(text);
1839        }
1840        self.char_indices.push(ch.index);
1841        self.char_box_union = union_option_rect(self.char_box_union, ch.char_box);
1842        self.loose_char_box_union = union_option_rect(self.loose_char_box_union, ch.loose_char_box);
1843        if self.first_origin.is_none() {
1844            self.first_origin = ch.char_origin;
1845        }
1846        self.last_origin = ch.char_origin;
1847        if self.font_id.is_none() {
1848            self.font_id = ch.font_id.clone();
1849        }
1850        if self.font_size_q.is_none() {
1851            self.font_size_q = ch.font_size_q;
1852        }
1853        if self.font_flags.is_none() {
1854            self.font_flags = ch.font_flags;
1855        }
1856    }
1857
1858    fn flush(
1859        &mut self,
1860        text_page: &PdfTextPage<'_>,
1861        page_height_pts: f64,
1862        next_run: &mut u32,
1863        runs: &mut Vec<GeometryProbeRun>,
1864    ) -> Result<(), EthosError> {
1865        if self.text.is_empty() {
1866            return Ok(());
1867        }
1868        let char_start = self.char_indices.first().copied().unwrap_or_default();
1869        let char_end = self
1870            .char_indices
1871            .last()
1872            .copied()
1873            .map(|index| index + 1)
1874            .unwrap_or(char_start);
1875        let text_rects =
1876            text_page.text_rects(char_start, char_end - char_start, page_height_pts)?;
1877        runs.push(GeometryProbeRun {
1878            index: *next_run,
1879            text: std::mem::take(&mut self.text),
1880            char_start,
1881            char_end,
1882            char_indices: std::mem::take(&mut self.char_indices),
1883            char_box_union: self.char_box_union.take(),
1884            loose_char_box_union: self.loose_char_box_union.take(),
1885            text_rect_union: union_rects(text_rects.iter().copied()),
1886            text_rects,
1887            first_origin: self.first_origin.take(),
1888            last_origin: self.last_origin.take(),
1889            font_id: self.font_id.take(),
1890            font_flags: self.font_flags.take(),
1891            font_size_q: self.font_size_q.take(),
1892        });
1893        *next_run += 1;
1894        self.font_size_q = None;
1895        self.font_flags = None;
1896        Ok(())
1897    }
1898}
1899
1900fn union_option_rect(existing: Option<QRect>, next: Option<QRect>) -> Option<QRect> {
1901    match (existing, next) {
1902        (Some(a), Some(b)) => Some(union_rect(a, b)),
1903        (Some(a), None) => Some(a),
1904        (None, Some(b)) => Some(b),
1905        (None, None) => None,
1906    }
1907}
1908
1909fn union_rects(mut rects: impl Iterator<Item = QRect>) -> Option<QRect> {
1910    let first = rects.next()?;
1911    Some(rects.fold(first, union_rect))
1912}
1913
1914fn deterministic_font_id(raw_name: &str) -> Option<String> {
1915    let raw_name = raw_name.trim();
1916    if raw_name.is_empty() {
1917        return None;
1918    }
1919    let (name, subset) = strip_subset_prefix(raw_name);
1920    if subset {
1921        if let Some(normalized) = normalize_font_name(name) {
1922            if is_safe_font_id_suffix(&normalized) {
1923                return Some(format!("embedded:{normalized}"));
1924            }
1925        }
1926        return Some(hashed_embedded_font_id(name));
1927    }
1928    let normalized = normalize_font_name(name)?;
1929    font_substitution(&normalized)
1930        .or_else(|| Some(font_substitution_table().default_unresolved_font_id.clone()))
1931}
1932
/// Splits off a font subset prefix: six ASCII uppercase letters followed by `+`.
///
/// Returns the remainder and `true` when the prefix is present and something follows it;
/// otherwise returns the input unchanged together with `false`.
fn strip_subset_prefix(name: &str) -> (&str, bool) {
    match name.split_once('+') {
        Some((tag, rest))
            if tag.len() == 6
                && !rest.is_empty()
                && tag.bytes().all(|byte| byte.is_ascii_uppercase()) =>
        {
            (rest, true)
        }
        _ => (name, false),
    }
}
1941
/// Normalizes a font name for id construction.
///
/// Whitespace, control characters and the punctuation `/ \ : , ( ) [ ]` become `-`;
/// every other character is kept as is. Runs of `-` collapse to one and leading or
/// trailing `-` are removed. Returns `None` when nothing remains.
fn normalize_font_name(name: &str) -> Option<String> {
    let mut normalized = String::new();
    for ch in name.trim().chars() {
        let is_separator = ch.is_whitespace()
            || ch.is_control()
            || matches!(ch, '/' | '\\' | ':' | ',' | '(' | ')' | '[' | ']');
        let mapped = if is_separator { '-' } else { ch };
        // Collapse consecutive dashes, whether literal or produced by the mapping above.
        if mapped == '-' && normalized.ends_with('-') {
            continue;
        }
        normalized.push(mapped);
    }

    let trimmed = normalized.trim_matches('-');
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}
1969
/// Returns `true` when `name` is non-empty and consists only of ASCII letters, ASCII
/// digits, `-`, `_` and `.`.
fn is_safe_font_id_suffix(name: &str) -> bool {
    if name.is_empty() {
        return false;
    }
    name.chars()
        .all(|ch| matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_' | '.'))
}
1976
1977fn hashed_embedded_font_id(name: &str) -> String {
1978    format!(
1979        "embedded:sha256-{}",
1980        ethos_core::c14n::sha256_hex_bytes(name.as_bytes())
1981    )
1982}
1983
1984fn font_substitution(name: &str) -> Option<String> {
1985    font_substitution_table()
1986        .mappings
1987        .iter()
1988        .find(|mapping| mapping.source == name)
1989        .map(|mapping| mapping.font_id.clone())
1990}
1991
1992fn font_substitution_table() -> &'static FontSubstitutionTable {
1993    FONT_SUBSTITUTION_TABLE.get_or_init(|| {
1994        let table: FontSubstitutionTable = serde_json::from_str(FONT_SUBSTITUTION_TABLE_JSON)
1995            .expect("bundled font-substitution-table.json is valid JSON");
1996        validate_font_substitution_table(&table)
1997            .expect("bundled font-substitution-table.json is internally valid");
1998        table
1999    })
2000}
2001
2002fn validate_font_substitution_table(table: &FontSubstitutionTable) -> Result<(), &'static str> {
2003    if table.schema_version != "1.0.0"
2004        || table.table_id != "ethos-font-substitution-v1"
2005        || table.version != "1.0.0"
2006        || table.default_unresolved_font_id != "subst:liberation-sans-regular"
2007    {
2008        return Err("unexpected font substitution table metadata");
2009    }
2010
2011    let mut seen = HashSet::new();
2012    for mapping in &table.mappings {
2013        if mapping.source.is_empty() || !mapping.font_id.starts_with("subst:") {
2014            return Err("malformed font substitution mapping");
2015        }
2016        if !seen.insert(mapping.source.as_str()) {
2017            return Err("duplicate font substitution mapping source");
2018        }
2019    }
2020
2021    Ok(())
2022}
2023
2024#[cfg(unix)]
2025mod dylib {
2026    use super::*;
2027    use std::os::unix::ffi::OsStrExt;
2028
2029    const RTLD_NOW: c_int = 2;
2030
2031    unsafe extern "C" {
2032        fn dlopen(filename: *const c_char, flag: c_int) -> *mut c_void;
2033        fn dlsym(handle: *mut c_void, symbol: *const c_char) -> *mut c_void;
2034        fn dlclose(handle: *mut c_void) -> c_int;
2035    }
2036
2037    pub(super) struct Library {
2038        handle: *mut c_void,
2039    }
2040
2041    impl Library {
2042        pub(super) fn open(path: &Path) -> Result<Self, EthosError> {
2043            let c_path = CString::new(path.as_os_str().as_bytes()).map_err(|_| {
2044                EthosError::internal("pdfium library path contains an interior NUL byte")
2045            })?;
2046            // SAFETY: c_path is NUL-terminated and lives for the call.
2047            let handle = unsafe { dlopen(c_path.as_ptr(), RTLD_NOW) };
2048            if handle.is_null() {
2049                Err(EthosError::internal(
2050                    "failed to load configured pdfium library",
2051                ))
2052            } else {
2053                Ok(Library { handle })
2054            }
2055        }
2056
2057        pub(super) unsafe fn symbol<T: Copy>(&self, name: &'static [u8]) -> Result<T, EthosError> {
2058            let ptr = self.symbol_ptr(name);
2059            if ptr.is_null() {
2060                return Err(EthosError::internal(format!(
2061                    "pdfium library is missing symbol {}",
2062                    symbol_name(name)
2063                )));
2064            }
2065            assert_symbol_pointer_size::<T>();
2066            // SAFETY: caller chooses T to match the named PDFium C symbol.
2067            Ok(unsafe { std::mem::transmute_copy::<*mut c_void, T>(&ptr) })
2068        }
2069
2070        pub(super) unsafe fn optional_symbol<T: Copy>(&self, name: &'static [u8]) -> Option<T> {
2071            let ptr = self.symbol_ptr(name);
2072            if ptr.is_null() {
2073                None
2074            } else {
2075                assert_symbol_pointer_size::<T>();
2076                // SAFETY: caller chooses T to match the named PDFium C symbol.
2077                Some(unsafe { std::mem::transmute_copy::<*mut c_void, T>(&ptr) })
2078            }
2079        }
2080
2081        fn symbol_ptr(&self, name: &'static [u8]) -> *mut c_void {
2082            // SAFETY: handle is live; name is a static NUL-terminated symbol name.
2083            unsafe { dlsym(self.handle, name.as_ptr().cast()) }
2084        }
2085    }
2086
2087    impl Drop for Library {
2088        fn drop(&mut self) {
2089            if !self.handle.is_null() {
2090                // SAFETY: handle was returned by dlopen and is closed exactly once.
2091                unsafe {
2092                    let _ = dlclose(self.handle);
2093                }
2094            }
2095        }
2096    }
2097}
2098
#[cfg(windows)]
mod dylib {
    //! Minimal `LoadLibraryW`/`GetProcAddress`/`FreeLibrary` wrapper used to load the
    //! PDFium library on Windows.

    use super::*;
    use std::os::windows::ffi::OsStrExt;

    unsafe extern "system" {
        fn LoadLibraryW(lp_lib_file_name: *const u16) -> *mut c_void;
        fn GetProcAddress(h_module: *mut c_void, lp_proc_name: *const c_char) -> *mut c_void;
        fn FreeLibrary(h_lib_module: *mut c_void) -> c_int;
    }

    /// Owning handle to a dynamically loaded library; the library is freed on drop.
    pub(super) struct Library {
        handle: *mut c_void,
    }

    impl Library {
        /// Loads the library at `path`.
        ///
        /// # Errors
        ///
        /// Returns an internal error when the path contains an interior NUL code unit or
        /// when `LoadLibraryW` fails. The message deliberately does not include the path.
        pub(super) fn open(path: &Path) -> Result<Self, EthosError> {
            let mut wide_path: Vec<u16> = path.as_os_str().encode_wide().collect();
            if wide_path.contains(&0) {
                return Err(EthosError::internal(
                    "pdfium library path contains an interior NUL code unit",
                ));
            }
            wide_path.push(0);
            // SAFETY: wide_path is NUL-terminated and lives for the call.
            let handle = unsafe { LoadLibraryW(wide_path.as_ptr()) };
            if handle.is_null() {
                Err(EthosError::internal(
                    "failed to load configured pdfium library",
                ))
            } else {
                Ok(Library { handle })
            }
        }

        /// Resolves a required symbol and reinterprets its address as `T`.
        ///
        /// # Errors
        ///
        /// Returns an internal error naming the symbol when it cannot be resolved; this
        /// includes a `name` that is not a valid NUL-terminated C string.
        ///
        /// # Safety
        ///
        /// `T` must be a pointer-sized type (checked at compile time) whose signature
        /// matches the named PDFium C symbol, and the returned value must not be used
        /// after this `Library` is dropped.
        pub(super) unsafe fn symbol<T: Copy>(&self, name: &'static [u8]) -> Result<T, EthosError> {
            // SAFETY: the caller upholds the contract documented on `optional_symbol`.
            let resolved = unsafe { self.optional_symbol::<T>(name) };
            resolved.ok_or_else(|| {
                EthosError::internal(format!(
                    "pdfium library is missing symbol {}",
                    symbol_name(name)
                ))
            })
        }

        /// Resolves an optional symbol, returning `None` when it cannot be resolved.
        ///
        /// # Safety
        ///
        /// `T` must be a pointer-sized type (checked at compile time) whose signature
        /// matches the named PDFium C symbol, and the returned value must not be used
        /// after this `Library` is dropped.
        pub(super) unsafe fn optional_symbol<T: Copy>(&self, name: &'static [u8]) -> Option<T> {
            assert_symbol_pointer_size::<T>();
            let ptr = self.symbol_ptr(name);
            if ptr.is_null() {
                return None;
            }
            // SAFETY: caller chooses T to match the named PDFium C symbol.
            Some(unsafe { std::mem::transmute_copy::<*mut c_void, T>(&ptr) })
        }

        /// Looks up the raw address of `name`, or null when it cannot be resolved.
        ///
        /// A `name` that is not exactly one NUL-terminated C string (missing terminator
        /// or interior NUL) is treated as unresolved instead of being handed to
        /// `GetProcAddress`, which would otherwise read past the end of the slice.
        fn symbol_ptr(&self, name: &'static [u8]) -> *mut c_void {
            let Ok(c_name) = std::ffi::CStr::from_bytes_with_nul(name) else {
                return std::ptr::null_mut();
            };
            // SAFETY: handle is live; c_name is a validated NUL-terminated symbol name.
            unsafe { GetProcAddress(self.handle, c_name.as_ptr()) }
        }
    }

    impl Drop for Library {
        fn drop(&mut self) {
            if !self.handle.is_null() {
                // SAFETY: handle was returned by LoadLibraryW and is closed exactly once.
                unsafe {
                    let _ = FreeLibrary(self.handle);
                }
            }
        }
    }
}
2175
/// Compile-time check that `T` has the same size as a raw pointer, so a resolved symbol
/// address can be reinterpreted as `T`.
fn assert_symbol_pointer_size<T>() {
    use std::mem::size_of;

    const {
        let expected = size_of::<*mut c_void>();
        let actual = size_of::<T>();
        assert!(actual == expected, "pdfium symbol pointer size mismatch");
    }
}
2184
/// Renders a symbol name for error messages: one trailing NUL byte, if present, is
/// dropped and the rest is decoded lossily as UTF-8.
fn symbol_name(name: &'static [u8]) -> String {
    let printable = match name.split_last() {
        Some((&0, head)) => head,
        _ => name,
    };
    String::from_utf8_lossy(printable).into_owned()
}
2189
2190#[cfg(test)]
2191mod tests {
2192    use super::*;
2193
2194    #[test]
2195    fn invalid_pdf_fails_before_library_load() {
2196        let err = PdfiumBackend::default()
2197            .page_count(b"not a pdf")
2198            .unwrap_err();
2199        assert_eq!(err.code, ErrorCode::InvalidPdf);
2200    }
2201
2202    #[test]
2203    fn text_run_breaks_on_pdfium_control_characters() {
2204        assert!(should_break_text_run('\0'));
2205        assert!(should_break_text_run('\n'));
2206        assert!(should_break_text_run('\u{0002}'));
2207        assert!(!should_break_text_run('-'));
2208        assert!(!should_break_text_run('A'));
2209    }
2210
2211    #[test]
2212    fn missing_library_path_is_stable_error_for_pdf_input() {
2213        let backend = PdfiumBackend::default();
2214        if env::var_os(PDFIUM_LIBRARY_PATH_ENV).is_some() {
2215            return;
2216        }
2217        let err = backend.page_count(b"%PDF-1.7\n").unwrap_err();
2218        assert_eq!(err.code, ErrorCode::InternalError);
2219        assert!(err.message.contains(PDFIUM_LIBRARY_PATH_ENV));
2220        assert!(err.message.contains("ethos doctor"));
2221        assert!(err.message.contains("ethos doctor --require-pdfium"));
2222        assert!(err.message.contains("docs/pdfium-manual-setup.md"));
2223    }
2224
2225    #[test]
2226    fn render_crop_raw_rejects_zero_page_before_library_load() {
2227        let err = PdfiumBackend::default()
2228            .render_crop_raw(b"%PDF-1.7\n", 0, QRect::new(0, 0, 100, 100).unwrap())
2229            .unwrap_err();
2230        assert_eq!(err.code, ErrorCode::PageLimitExceeded);
2231        assert_eq!(err.message, "page selection out of document range");
2232    }
2233
2234    #[test]
2235    fn crop_window_uses_outward_quantized_pixel_bounds() {
2236        assert_eq!(
2237            crop_window(QRect::new(7392, 5482, 19378, 7226).unwrap(), 300, 144).unwrap(),
2238            (73, 54, 121, 19)
2239        );
2240        assert_eq!(
2241            crop_window(QRect::new(-50, -50, 30100, 14500).unwrap(), 300, 144).unwrap(),
2242            (0, 0, 300, 144)
2243        );
2244
2245        let err = crop_window(QRect::new(100, 100, 101, 101).unwrap(), 1, 1).unwrap_err();
2246        assert_eq!(err.code, ErrorCode::InternalError);
2247        assert_eq!(err.message, "crop bbox has no positive pixel extent");
2248    }
2249
2250    #[test]
2251    fn render_crop_raw_is_deterministic_when_pdfium_is_configured() {
2252        let Some(path) = env::var_os(PDFIUM_LIBRARY_PATH_ENV).map(PathBuf::from) else {
2253            return;
2254        };
2255        if !path.is_file() {
2256            return;
2257        }
2258
2259        let fixture = Path::new(env!("CARGO_MANIFEST_DIR"))
2260            .join("../../fixtures/synthetic/simple-text/document.pdf");
2261        let pdf_bytes = std::fs::read(fixture).unwrap();
2262        let bbox = QRect::new(7392, 5482, 19378, 7226).unwrap();
2263        let backend = PdfiumBackend::default();
2264
2265        let first = backend.render_crop_raw(&pdf_bytes, 1, bbox).unwrap();
2266        let second = backend.render_crop_raw(&pdf_bytes, 1, bbox).unwrap();
2267
2268        assert_eq!(first, second);
2269        assert_eq!(first.page_index, 1);
2270        assert_eq!(first.bbox, bbox);
2271        assert_eq!(first.width_px, 121);
2272        assert_eq!(first.height_px, 19);
2273        assert_eq!(first.stride, first.width_px * 4);
2274        assert_eq!(first.pixel_format, "bgra_8u");
2275        assert_eq!(
2276            first.bytes.len(),
2277            usize::try_from(first.stride * first.height_px).unwrap()
2278        );
2279        assert_eq!(
2280            first.sha256,
2281            ethos_core::c14n::sha256_hex_bytes(&first.bytes)
2282        );
2283        assert!(first
2284            .bytes
2285            .chunks_exact(4)
2286            .any(|pixel| pixel != [255, 255, 255, 255]));
2287    }
2288
2289    #[test]
2290    fn invalid_configured_library_path_does_not_leak_host_path() {
2291        let path = env::temp_dir().join("ethos-missing-libpdfium\nwith-control.dylib");
2292        let backend = PdfiumBackend::from_library_path(&path);
2293        let err = backend.page_count(b"%PDF-1.7\n").unwrap_err();
2294        assert_eq!(err.code, ErrorCode::InternalError);
2295        assert!(err
2296            .message
2297            .contains("pdfium library path does not point to a file"));
2298        assert!(err.message.contains("ethos doctor"));
2299        assert!(err.message.contains("ethos doctor --require-pdfium"));
2300        assert!(err.message.contains("docs/pdfium-manual-setup.md"));
2301        assert!(!err.message.contains(path.to_string_lossy().as_ref()));
2302    }
2303
2304    #[test]
2305    fn explicit_manifest_hashes_library_bytes() {
2306        let path = env::temp_dir().join("ethos-test-libpdfium-hash.bin");
2307        std::fs::write(&path, b"pdfium bytes").unwrap();
2308        let backend = PdfiumBackend::from_library_path(&path).with_version("test-version");
2309        let manifest = backend.manifest();
2310        assert_eq!(manifest.id, "pdfium");
2311        assert_eq!(manifest.phase, 1);
2312        assert_eq!(manifest.version, "test-version");
2313        assert_eq!(
2314            manifest.platform_sha256,
2315            ethos_core::c14n::sha256_hex_bytes(b"pdfium bytes")
2316        );
2317        let _ = std::fs::remove_file(path);
2318    }
2319
2320    #[test]
2321    fn phase1_pdfium_profile_is_pinned_and_v8_xfa_disabled() {
2322        let profile = pinned_pdfium_profile();
2323        assert_eq!(profile.id, "pdfium");
2324        assert_eq!(profile.phase, 1);
2325        assert_eq!(profile.version, "chromium/7881");
2326        assert_eq!(profile.upstream_version, "PDFium 151.0.7881.0");
2327        assert_eq!(profile.v8, "disabled");
2328        assert_eq!(profile.xfa, "disabled");
2329        assert_eq!(profile.distribution.source, "bblanchon/pdfium-binaries");
2330        assert_eq!(
2331            profile.distribution.attestation.sha256,
2332            "24dec7cd76acb81106a0c29b908cceceef8215b050f6ff6ffbf875465811ef60"
2333        );
2334        assert!(!profile.build_flags.pdf_enable_v8);
2335        assert!(!profile.build_flags.pdf_enable_xfa);
2336        assert!(profile.build_flags.pdf_is_standalone);
2337
2338        let expected = [
2339            (
2340                "macos-arm64",
2341                "pdfium-mac-arm64.tgz",
2342                "52e94ca5aa8847934330daf3f8150c190682c5ca93831468794f8b90d4392e40",
2343                "lib/libpdfium.dylib",
2344                "1bc45b15466b34cef96641ce25c77a876e70010c6b114f909dda2f5325fc5bd7",
2345            ),
2346            (
2347                "linux-x64",
2348                "pdfium-linux-x64.tgz",
2349                "1470e21b8b4a3b4ad7f85684e2da11d94f3b69a86d81dee11b9b6709d927ac1d",
2350                "lib/libpdfium.so",
2351                "f728930966f503652b92acc89b9374a2eeca00ce42e26dccd3e4b5c5161b2d64",
2352            ),
2353            (
2354                "windows-x64",
2355                "pdfium-win-x64.tgz",
2356                "73cc0de638ac2095e7445bf56a38200a5b7c7ca0e9f4ba144598f2457377ac08",
2357                "bin/pdfium.dll",
2358                "79d4676b656cfb1abcea88f9ade3b4b0826c5200382db5f4ec72a636c598c118",
2359            ),
2360        ];
2361        for (platform, name, archive_sha256, runtime_path, runtime_sha256) in expected {
2362            assert_eq!(profile.platform_hashes[platform], archive_sha256);
2363            let artifact = &profile.platform_artifacts[platform];
2364            assert_eq!(artifact.name, name);
2365            assert!(!artifact.name.contains("-v8-"));
2366            assert!(!artifact.name.contains("xfa"));
2367            assert_eq!(artifact.runtime_library_path, runtime_path);
2368            assert_eq!(artifact.runtime_library_sha256, runtime_sha256);
2369        }
2370    }
2371
2372    #[test]
2373    fn mismatched_pdfium_version_is_rejected_before_library_load() {
2374        if current_platform_key().is_none() {
2375            return;
2376        }
2377        let path = env::temp_dir().join("ethos-test-libpdfium-version-mismatch.bin");
2378        std::fs::write(&path, b"not the pinned pdfium library").unwrap();
2379        let backend = PdfiumBackend::from_library_path(&path).with_version("chromium/7869");
2380        let err = backend.page_count(b"%PDF-1.7\n").unwrap_err();
2381        assert_eq!(err.code, ErrorCode::InternalError);
2382        assert_eq!(
2383            err.message,
2384            "pdfium version does not match pinned phase 1 profile"
2385        );
2386        let _ = std::fs::remove_file(path);
2387    }
2388
2389    #[test]
2390    fn pinned_upstream_pdfium_version_alias_is_accepted() {
2391        if current_platform_key().is_none() {
2392            return;
2393        }
2394        let path = env::temp_dir().join("ethos-test-libpdfium-upstream-version.bin");
2395        std::fs::write(&path, b"not the pinned pdfium library").unwrap();
2396        let backend = PdfiumBackend::from_library_path(&path).with_version("PDFium 151.0.7881.0");
2397        let err = backend.page_count(b"%PDF-1.7\n").unwrap_err();
2398        assert_eq!(err.code, ErrorCode::InternalError);
2399        assert_eq!(
2400            err.message,
2401            "pdfium library does not match pinned phase 1 profile"
2402        );
2403        let _ = std::fs::remove_file(path);
2404    }
2405
2406    #[test]
2407    fn mismatched_pdfium_artifact_is_rejected_with_stable_error() {
2408        if current_platform_key().is_none() {
2409            return;
2410        }
2411        let library_path = env::temp_dir().join("ethos-test-libpdfium-artifact-mismatch.bin");
2412        let artifact_path = env::temp_dir().join("ethos-test-pdfium-artifact-mismatch.tgz");
2413        std::fs::write(&library_path, b"not the pinned pdfium library").unwrap();
2414        std::fs::write(&artifact_path, b"not the pinned pdfium artifact").unwrap();
2415        let backend = PdfiumBackend::from_library_path(&library_path)
2416            .with_version("chromium/7881")
2417            .with_artifact_path(&artifact_path);
2418        let err = backend.page_count(b"%PDF-1.7\n").unwrap_err();
2419        assert_eq!(err.code, ErrorCode::InternalError);
2420        assert_eq!(
2421            err.message,
2422            "pdfium artifact does not match pinned phase 1 profile"
2423        );
2424        let _ = std::fs::remove_file(library_path);
2425        let _ = std::fs::remove_file(artifact_path);
2426    }
2427
2428    #[test]
2429    fn mismatched_pdfium_library_is_rejected_before_dynamic_load() {
2430        if current_platform_key().is_none() {
2431            return;
2432        }
2433        let path = env::temp_dir().join("ethos-test-libpdfium-library-mismatch.bin");
2434        std::fs::write(&path, b"not the pinned pdfium library").unwrap();
2435        let backend = PdfiumBackend::from_library_path(&path).with_version("chromium/7881");
2436        let err = backend.page_count(b"%PDF-1.7\n").unwrap_err();
2437        assert_eq!(err.code, ErrorCode::InternalError);
2438        assert_eq!(
2439            err.message,
2440            "pdfium library does not match pinned phase 1 profile"
2441        );
2442        let _ = std::fs::remove_file(path);
2443    }
2444
2445    #[test]
2446    fn deterministic_font_ids_strip_subset_prefixes() {
2447        assert_eq!(
2448            deterministic_font_id("ABCDEF+MinionPro-Regular").as_deref(),
2449            Some("embedded:MinionPro-Regular")
2450        );
2451        assert_eq!(
2452            deterministic_font_id("Helvetica-Bold").as_deref(),
2453            Some("subst:liberation-sans-bold")
2454        );
2455        assert_eq!(
2456            deterministic_font_id("Helvetica").as_deref(),
2457            Some("subst:liberation-sans-regular")
2458        );
2459        assert_eq!(
2460            deterministic_font_id("Helvetica-Oblique").as_deref(),
2461            Some("subst:liberation-sans-italic")
2462        );
2463        assert_eq!(
2464            deterministic_font_id("Helvetica-BoldOblique").as_deref(),
2465            Some("subst:liberation-sans-bold-italic")
2466        );
2467        assert_eq!(
2468            deterministic_font_id("Courier").as_deref(),
2469            Some("subst:liberation-mono-regular")
2470        );
2471        assert_eq!(
2472            deterministic_font_id("Times-Roman").as_deref(),
2473            Some("subst:liberation-serif-regular")
2474        );
2475        assert_eq!(
2476            deterministic_font_id("Custom Font/Regular").as_deref(),
2477            Some("subst:liberation-sans-regular")
2478        );
2479        assert_eq!(deterministic_font_id("   "), None);
2480    }
2481
2482    #[test]
2483    fn deterministic_font_ids_keep_embedded_ids_ascii_only() {
2484        let unsafe_unicode = deterministic_font_id("ABCDEF+明朝").unwrap();
2485        assert_eq!(unsafe_unicode, hashed_embedded_font_id("明朝"));
2486        assert!(unsafe_unicode.is_ascii());
2487
2488        let unsafe_punctuation = deterministic_font_id("ABCDEF+Fixture+Font").unwrap();
2489        assert_eq!(unsafe_punctuation, hashed_embedded_font_id("Fixture+Font"));
2490        assert!(unsafe_punctuation.is_ascii());
2491
2492        let separator_only = deterministic_font_id("ABCDEF+///").unwrap();
2493        assert_eq!(separator_only, hashed_embedded_font_id("///"));
2494        assert!(separator_only.is_ascii());
2495
2496        assert_eq!(
2497            deterministic_font_id("明朝").as_deref(),
2498            Some("subst:liberation-sans-regular")
2499        );
2500    }
2501
2502    #[test]
2503    fn font_substitution_table_is_well_formed() {
2504        use std::collections::HashSet;
2505
2506        let table = font_substitution_table();
2507        assert_eq!(table.schema_version, "1.0.0");
2508        assert_eq!(table.table_id, "ethos-font-substitution-v1");
2509        assert_eq!(table.version, "1.0.0");
2510        assert_eq!(
2511            table.default_unresolved_font_id,
2512            "subst:liberation-sans-regular"
2513        );
2514
2515        let mut seen = HashSet::new();
2516        for mapping in &table.mappings {
2517            assert!(!mapping.source.is_empty());
2518            assert!(mapping.font_id.starts_with("subst:"));
2519            assert!(
2520                seen.insert(mapping.source.as_str()),
2521                "duplicate font substitution source {}",
2522                mapping.source
2523            );
2524        }
2525        assert_eq!(table.mappings.len(), 14);
2526    }
2527
2528    #[test]
2529    fn profile_pins_font_substitution_table_bytes() {
2530        const FONT_SUBSTITUTION_TABLE_PATH: &str =
2531            "crates/ethos-pdf/assets/font-substitution-table.json";
2532        let profile: serde_json::Value = serde_json::from_str(include_str!(concat!(
2533            env!("CARGO_MANIFEST_DIR"),
2534            "/../../profiles/ethos-deterministic-v1.json"
2535        )))
2536        .unwrap();
2537        let pin = &profile["font_policy"]["substitution_table"];
2538        assert_eq!(pin["path"], FONT_SUBSTITUTION_TABLE_PATH);
2539        assert_eq!(
2540            pin["sha256"],
2541            ethos_core::c14n::sha256_hex_bytes(FONT_SUBSTITUTION_TABLE_JSON.as_bytes())
2542        );
2543    }
2544}