1#![deny(unsafe_op_in_unsafe_fn)]
28#![warn(missing_docs)]
29
30use std::collections::{BTreeMap, HashSet};
31use std::env;
32use std::ffi::{c_char, c_int, c_ulong, c_void, CString};
33use std::path::{Path, PathBuf};
34use std::ptr;
35use std::slice;
36use std::sync::{Mutex, OnceLock};
37
38use ethos_core::codes::WarningCode;
39use ethos_core::config::{PageSelection, ParseConfig};
40use ethos_core::error::{ErrorCode, EthosError};
41use ethos_core::geom::{quantize, QRect};
42use ethos_core::ids::{page_id, span_id, warning_id};
43use ethos_core::model::{Page, Span, SpanOriginLocator, Warning};
44use ethos_core::traits::{BackendManifest, EthosPdfBackend, Extraction};
45use serde::{Deserialize, Serialize};
46
/// Name of the environment variable consulted for the PDFium shared-library path
/// when no explicit library path was set on the backend.
pub const PDFIUM_LIBRARY_PATH_ENV: &str = "ETHOS_PDFIUM_LIBRARY_PATH";

/// Name of the environment variable consulted for a PDFium version override
/// when no explicit version was set on the backend.
pub const PDFIUM_VERSION_ENV: &str = "ETHOS_PDFIUM_VERSION";

/// Name of the environment variable consulted for the PDFium release-artifact path
/// when no explicit artifact path was set on the backend.
pub const PDFIUM_ARTIFACT_PATH_ENV: &str = "ETHOS_PDFIUM_ARTIFACT_PATH";

/// Quantization factor passed to `ethos_core::geom::quantize`; also the divisor used
/// when quantized coordinates are converted to whole pixels.
pub const QUANTUM_PER_POINT: u32 = 100;
// Identifier of the origin-locator policy.
// NOTE(review): not referenced in the visible part of this file — confirm where it is emitted.
const ORIGIN_LOCATOR_POLICY: &str = "origin-run-locator-v1";

// Deterministic profile embedded at compile time; parsed lazily by `pinned_pdfium_profile`.
const DETERMINISTIC_PROFILE_JSON: &str = include_str!("../assets/ethos-deterministic-v1.json");
// Font substitution table embedded at compile time.
// NOTE(review): its consumer is outside the visible part of this file.
const FONT_SUBSTITUTION_TABLE_JSON: &str = include_str!("../assets/font-substitution-table.json");

// Process-wide lock taken by every public entry point before the PDFium runtime is loaded,
// so that only one thread at a time initialises, uses and destroys the library.
static PDFIUM_LOCK: Mutex<()> = Mutex::new(());
// Cache for the parsed and validated pinned profile (filled by `pinned_pdfium_profile`).
static PINNED_PDFIUM_PROFILE: OnceLock<PinnedPdfiumBackend> = OnceLock::new();
// Cache for the parsed font substitution table.
// NOTE(review): `FontSubstitutionTable` and its initialiser are outside this view.
static FONT_SUBSTITUTION_TABLE: OnceLock<FontSubstitutionTable> = OnceLock::new();
67
/// PDFium-backed implementation of [`EthosPdfBackend`].
///
/// Every field is optional. An unset library or artifact path falls back to the
/// matching `ETHOS_PDFIUM_*` environment variable; an unset version falls back to the
/// environment variable and then to the version of the pinned profile.
#[derive(Debug, Clone, Default)]
pub struct PdfiumBackend {
    // Explicit path to the PDFium shared library, if configured.
    library_path: Option<PathBuf>,
    // Explicit path to the PDFium release artifact, if configured.
    artifact_path: Option<PathBuf>,
    // Explicit PDFium version override, if configured.
    version: Option<String>,
}
75
/// Serializable report returned by [`PdfiumBackend::geometry_probe`].
#[derive(Debug, Serialize)]
pub struct GeometryProbeReport {
    /// Schema identifier of the report (`"ethos-pdfium-geometry-probe-v1"`).
    pub schema_version: String,
    /// Value of [`QUANTUM_PER_POINT`] in effect when the report was produced.
    pub quantum_per_point: u32,
    /// Manifest of the backend that produced the report.
    pub backend: BackendManifest,
    /// One entry per page included by the configured page selection, in document order.
    pub pages: Vec<GeometryProbePage>,
}
92
/// Geometry probe data for a single page.
///
/// NOTE(review): this struct is populated by `PdfPage::geometry_probe_page`, which is
/// outside the visible part of the file; the field notes below are inferred from names
/// and types and should be confirmed against that method.
#[derive(Debug, Serialize)]
pub struct GeometryProbePage {
    /// Page identifier.
    pub id: String,
    /// Page number (presumably the 1-based original page number — confirm).
    pub index: u32,
    /// Page width (presumably in quantized units — confirm).
    pub width: i64,
    /// Page height (presumably in quantized units — confirm).
    pub height: i64,
    /// Page rotation (presumably degrees: 0, 90, 180 or 270 — confirm).
    pub rotation: u16,
    /// Number of characters reported for the page.
    pub char_count: i32,
    /// Which optional PDFium symbols were available for this probe.
    pub symbols: GeometryProbeSymbols,
    /// Per-character probe records.
    pub chars: Vec<GeometryProbeChar>,
    /// Per-run probe records.
    pub runs: Vec<GeometryProbeRun>,
}
115
/// Availability of the optional PDFium text-geometry symbols in the loaded library.
#[derive(Debug, Serialize)]
pub struct GeometryProbeSymbols {
    /// `true` when `FPDFText_GetCharOrigin` was resolved.
    pub char_origin: bool,
    /// `true` when `FPDFText_GetLooseCharBox` was resolved.
    pub loose_char_box: bool,
    /// `true` when both `FPDFText_CountRects` and `FPDFText_GetRect` were resolved.
    pub text_rects: bool,
}
126
/// Geometry probe record for a single character.
///
/// NOTE(review): populated outside the visible part of the file; field notes are
/// inferred from names and types — confirm against the producer.
#[derive(Debug, Serialize)]
pub struct GeometryProbeChar {
    /// Character index (presumably PDFium's index within the text page — confirm).
    pub index: i32,
    /// Unicode value reported for the character.
    pub unicode: u32,
    /// Text form of the character, when one is available.
    pub text: Option<String>,
    /// Label describing how the parser treated this character.
    pub parser_action: String,
    /// Character bounding box, when available.
    pub char_box: Option<QRect>,
    /// Loose character bounding box, when available.
    pub loose_char_box: Option<QRect>,
    /// Character origin as a coordinate pair, when available.
    pub char_origin: Option<[i64; 2]>,
    /// Font identifier, when available.
    pub font_id: Option<String>,
    /// Font flags, when available.
    pub font_flags: Option<u32>,
    /// Font size (presumably quantized — confirm), when available.
    pub font_size_q: Option<i64>,
}
151
/// Geometry probe record for a run of characters.
///
/// NOTE(review): populated outside the visible part of the file; field notes are
/// inferred from names and types — confirm against the producer.
#[derive(Debug, Serialize)]
pub struct GeometryProbeRun {
    /// Run index within the page.
    pub index: u32,
    /// Text of the run.
    pub text: String,
    /// First character index of the run (inclusive/exclusive convention to be confirmed).
    pub char_start: i32,
    /// Last character index of the run (inclusive/exclusive convention to be confirmed).
    pub char_end: i32,
    /// Indices of the characters that make up the run.
    pub char_indices: Vec<i32>,
    /// Union of the run's character boxes, when available.
    pub char_box_union: Option<QRect>,
    /// Union of the run's loose character boxes, when available.
    pub loose_char_box_union: Option<QRect>,
    /// Text rectangles reported for the run.
    pub text_rects: Vec<QRect>,
    /// Union of `text_rects`, when available.
    pub text_rect_union: Option<QRect>,
    /// Origin of the first character, when available.
    pub first_origin: Option<[i64; 2]>,
    /// Origin of the last character, when available.
    pub last_origin: Option<[i64; 2]>,
    /// Font identifier, when available.
    pub font_id: Option<String>,
    /// Font flags, when available.
    pub font_flags: Option<u32>,
    /// Font size (presumably quantized — confirm), when available.
    pub font_size_q: Option<i64>,
}
184
/// Raw pixel crop returned by [`PdfiumBackend::render_crop_raw`].
///
/// NOTE(review): the value is assembled by `PdfPage::render_crop_raw`, which is outside
/// the visible part of the file; field notes are inferred from names and types.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RawCrop {
    /// Page number the crop was taken from (the 1-based index passed by the caller).
    pub page_index: u32,
    /// Bounding box requested for the crop.
    pub bbox: QRect,
    /// Crop width in pixels.
    pub width_px: u32,
    /// Crop height in pixels.
    pub height_px: u32,
    /// Row stride of `bytes` (presumably in bytes — confirm).
    pub stride: u32,
    /// Name of the pixel format of `bytes`.
    pub pixel_format: &'static str,
    /// SHA-256 digest as a hex string (presumably of `bytes` — confirm).
    pub sha256: String,
    /// Raw pixel data.
    pub bytes: Vec<u8>,
}
209
210impl PdfiumBackend {
211 pub fn from_library_path(path: impl Into<PathBuf>) -> Self {
213 PdfiumBackend {
214 library_path: Some(path.into()),
215 artifact_path: None,
216 version: None,
217 }
218 }
219
220 pub fn with_artifact_path(mut self, path: impl Into<PathBuf>) -> Self {
222 self.artifact_path = Some(path.into());
223 self
224 }
225
226 pub fn with_version(mut self, version: impl Into<String>) -> Self {
228 self.version = Some(version.into());
229 self
230 }
231
232 fn configured_library_path(&self) -> Option<PathBuf> {
233 self.library_path
234 .clone()
235 .or_else(|| env::var_os(PDFIUM_LIBRARY_PATH_ENV).map(PathBuf::from))
236 }
237
238 fn configured_artifact_path(&self) -> Option<PathBuf> {
239 self.artifact_path
240 .clone()
241 .or_else(|| env::var_os(PDFIUM_ARTIFACT_PATH_ENV).map(PathBuf::from))
242 }
243
244 fn configured_version_override(&self) -> Option<String> {
245 self.version
246 .clone()
247 .or_else(|| env::var(PDFIUM_VERSION_ENV).ok())
248 }
249
250 fn configured_version(&self) -> String {
251 self.configured_version_override()
252 .unwrap_or_else(|| pinned_pdfium_profile().version.clone())
253 }
254
255 pub fn geometry_probe(
261 &self,
262 pdf_bytes: &[u8],
263 config: &ParseConfig,
264 ) -> Result<GeometryProbeReport, EthosError> {
265 validate_pdf_header(pdf_bytes)?;
266 let _guard = PDFIUM_LOCK.lock().unwrap_or_else(|e| e.into_inner());
267 let runtime = PdfiumRuntime::load(self)?;
268 let doc = runtime.load_document(pdf_bytes)?;
269 let page_count = doc.page_count()?;
270 if page_count > config.limits.max_pages {
271 return Err(EthosError::new(
272 ErrorCode::PageLimitExceeded,
273 "page count exceeds configured limit",
274 ));
275 }
276 validate_page_selection(&config.pages, page_count)?;
277
278 let mut pages = Vec::new();
279 for page_index in 0..page_count {
280 let original_page = page_index + 1;
281 if !config.pages.contains(original_page) {
282 continue;
283 }
284 let page = doc.load_page(page_index)?;
285 pages.push(page.geometry_probe_page(original_page)?);
286 }
287
288 Ok(GeometryProbeReport {
289 schema_version: "ethos-pdfium-geometry-probe-v1".to_string(),
290 quantum_per_point: QUANTUM_PER_POINT,
291 backend: self.manifest(),
292 pages,
293 })
294 }
295
296 pub fn render_crop_raw(
302 &self,
303 pdf_bytes: &[u8],
304 page_index: u32,
305 bbox: QRect,
306 ) -> Result<RawCrop, EthosError> {
307 validate_pdf_header(pdf_bytes)?;
308 if page_index == 0 {
309 return Err(EthosError::new(
310 ErrorCode::PageLimitExceeded,
311 "page selection out of document range",
312 ));
313 }
314 let _guard = PDFIUM_LOCK.lock().unwrap_or_else(|e| e.into_inner());
315 let runtime = PdfiumRuntime::load(self)?;
316 let doc = runtime.load_document(pdf_bytes)?;
317 let page_count = doc.page_count()?;
318 if page_index > page_count {
319 return Err(EthosError::new(
320 ErrorCode::PageLimitExceeded,
321 "page selection out of document range",
322 ));
323 }
324 let page = doc.load_page(page_index - 1)?;
325 page.render_crop_raw(page_index, bbox)
326 }
327}
328
329impl EthosPdfBackend for PdfiumBackend {
330 fn manifest(&self) -> BackendManifest {
331 let platform_sha256 = self
332 .configured_library_path()
333 .and_then(|path| std::fs::read(path).ok())
334 .map(|bytes| ethos_core::c14n::sha256_hex_bytes(&bytes))
335 .unwrap_or_else(|| "0".repeat(64));
336 BackendManifest {
337 id: "pdfium".to_string(),
338 phase: 1,
339 version: self.configured_version(),
340 platform_sha256,
341 }
342 }
343
344 fn page_count(&self, pdf_bytes: &[u8]) -> Result<u32, EthosError> {
345 validate_pdf_header(pdf_bytes)?;
346 let _guard = PDFIUM_LOCK.lock().unwrap_or_else(|e| e.into_inner());
347 let runtime = PdfiumRuntime::load(self)?;
348 let doc = runtime.load_document(pdf_bytes)?;
349 doc.page_count()
350 }
351
352 fn extract(&self, pdf_bytes: &[u8], config: &ParseConfig) -> Result<Extraction, EthosError> {
353 validate_pdf_header(pdf_bytes)?;
354 let _guard = PDFIUM_LOCK.lock().unwrap_or_else(|e| e.into_inner());
355 let runtime = PdfiumRuntime::load(self)?;
356 let doc = runtime.load_document(pdf_bytes)?;
357 let page_count = doc.page_count()?;
358 if page_count > config.limits.max_pages {
359 return Err(EthosError::new(
360 ErrorCode::PageLimitExceeded,
361 "page count exceeds configured limit",
362 ));
363 }
364 validate_page_selection(&config.pages, page_count)?;
365
366 let mut pages = Vec::new();
367 let mut spans = Vec::new();
368 let mut warnings = Vec::new();
369 let mut next_span = 1u32;
370 let mut next_warning = 1u32;
371
372 for page_index in 0..page_count {
373 let original_page = page_index + 1;
374 if !config.pages.contains(original_page) {
375 continue;
376 }
377 let page = doc.load_page(page_index)?;
378 let page_model = page.model_page(original_page)?;
379 let span_count_before = spans.len();
380 page.extract_text_spans(&page_model, &mut next_span, &mut spans)?;
381 if spans.len() == span_count_before {
382 warnings.push(Warning {
383 id: warning_id(next_warning)?,
384 code: WarningCode::ImageOnlyPage,
385 message: "page has no extractable text; OCR is required for this page"
386 .to_string(),
387 page: Some(page_model.id.clone()),
388 element_ref: None,
389 span_ref: None,
390 region_ref: None,
391 });
392 next_warning += 1;
393 }
394 pages.push(page_model);
395 }
396
397 if spans.is_empty() {
398 return Err(EthosError::new(
399 ErrorCode::OcrRequired,
400 "no extractable text; OCR is required",
401 ));
402 }
403
404 Ok(Extraction {
405 pages,
406 spans,
407 regions: Vec::new(),
408 warnings,
409 })
410 }
411}
412
413fn validate_page_selection(selection: &PageSelection, page_count: u32) -> Result<(), EthosError> {
414 selection.validate_against(page_count).map_err(|_| {
415 EthosError::new(
416 ErrorCode::PageLimitExceeded,
417 "page selection out of document range",
418 )
419 })
420}
421
422fn validate_pdf_header(pdf_bytes: &[u8]) -> Result<(), EthosError> {
423 let window = &pdf_bytes[..pdf_bytes.len().min(1024)];
424 if window.windows(5).any(|w| w == b"%PDF-") {
425 Ok(())
426 } else {
427 Err(EthosError::new(
428 ErrorCode::InvalidPdf,
429 "input does not contain a PDF header",
430 ))
431 }
432}
433
434fn quantize_coord(value: f64) -> Result<i64, EthosError> {
435 quantize(value, QUANTUM_PER_POINT)
436 .map_err(|_| EthosError::new(ErrorCode::InternalError, "coordinate quantization failed"))
437}
438
439fn pixel_extent(points: f64) -> Result<u32, EthosError> {
440 if !points.is_finite() || points <= 0.0 {
441 return Err(EthosError::new(
442 ErrorCode::CorruptPdf,
443 "PDF page has invalid dimensions",
444 ));
445 }
446 if points.ceil() > f64::from(c_int::MAX) {
447 return Err(EthosError::internal("render bitmap dimension overflow"));
448 }
449 Ok(points.ceil() as u32)
450}
451
452fn floor_quantized_pixel(value: i64) -> i64 {
453 value.div_euclid(i64::from(QUANTUM_PER_POINT))
454}
455
456fn ceil_quantized_pixel(value: i64) -> i64 {
457 let quantum = i64::from(QUANTUM_PER_POINT);
458 value
459 .checked_add(quantum - 1)
460 .unwrap_or(i64::MAX)
461 .div_euclid(quantum)
462}
463
// Clamps `value` into `0..=max` and returns it as a `u32`.
fn clamp_pixel(value: i64, max: u32) -> u32 {
    match u32::try_from(value) {
        Ok(pixel) => pixel.min(max),
        // Conversion fails only for negative values or values above `u32::MAX`.
        Err(_) if value < 0 => 0,
        Err(_) => max,
    }
}
467
468fn crop_window(
469 bbox: QRect,
470 page_width_px: u32,
471 page_height_px: u32,
472) -> Result<(u32, u32, u32, u32), EthosError> {
473 let x0 = clamp_pixel(floor_quantized_pixel(bbox.x0), page_width_px);
474 let y0 = clamp_pixel(floor_quantized_pixel(bbox.y0), page_height_px);
475 let x1 = clamp_pixel(ceil_quantized_pixel(bbox.x1), page_width_px);
476 let y1 = clamp_pixel(ceil_quantized_pixel(bbox.y1), page_height_px);
477 if x0 >= x1 || y0 >= y1 {
478 return Err(EthosError::internal(
479 "crop bbox has no positive pixel extent",
480 ));
481 }
482 Ok((x0, y0, x1 - x0, y1 - y0))
483}
484
485fn qrect_from_pdfium_char_box(
486 page_height_pts: f64,
487 left: f64,
488 right: f64,
489 bottom: f64,
490 top: f64,
491) -> Result<QRect, EthosError> {
492 let x0 = left.min(right);
493 let x1 = left.max(right);
494 let y0 = page_height_pts - top.max(bottom);
495 let y1 = page_height_pts - top.min(bottom);
496 QRect::new(
497 quantize_coord(x0)?,
498 quantize_coord(y0)?,
499 quantize_coord(x1)?,
500 quantize_coord(y1)?,
501 )
502 .map_err(|_| EthosError::internal("malformed character bbox"))
503}
504
505fn union_rect(a: QRect, b: QRect) -> QRect {
506 QRect {
507 x0: a.x0.min(b.x0),
508 y0: a.y0.min(b.y0),
509 x1: a.x1.max(b.x1),
510 y1: a.y1.max(b.y1),
511 }
512}
513
514fn map_pdfium_error(code: c_ulong) -> EthosError {
515 match code {
516 4 => EthosError::new(
517 ErrorCode::PasswordProtected,
518 "document is encrypted or password-protected",
519 ),
520 5 => EthosError::new(
521 ErrorCode::UnsupportedPdfFeature,
522 "document uses a restricted security handler",
523 ),
524 3 => EthosError::new(ErrorCode::CorruptPdf, "PDF structure is corrupt"),
525 6 => EthosError::new(ErrorCode::CorruptPdf, "PDF page tree is corrupt"),
526 2 => EthosError::new(ErrorCode::CorruptPdf, "PDF could not be loaded"),
527 _ => EthosError::new(ErrorCode::CorruptPdf, "PDFium could not load the document"),
528 }
529}
530
// Top-level shape of the embedded deterministic profile JSON; only the `backend`
// section is deserialized here.
#[derive(Debug, Deserialize)]
struct DeterministicProfile {
    // Pinned PDFium backend description.
    backend: PinnedPdfiumBackend,
}
535
// Pinned PDFium backend description as read from the deterministic profile.
// The expected values are enforced by `validate_pinned_pdfium_profile`.
#[derive(Debug, Deserialize)]
struct PinnedPdfiumBackend {
    // Backend id; must be "pdfium".
    id: String,
    // Backend phase; must be 1.
    phase: u8,
    // Pinned version; must be "chromium/7881".
    version: String,
    // Upstream version string; must be "PDFium 151.0.7881.0".
    upstream_version: String,
    // V8 state; must be "disabled".
    v8: String,
    // XFA state; must be "disabled".
    xfa: String,
    // Where the binaries come from.
    distribution: PinnedPdfiumDistribution,
    // Build flags of the pinned binaries.
    build_flags: PinnedPdfiumBuildFlags,
    // SHA-256 of the release artifact, keyed by platform ("macos-arm64", "linux-x64", "windows-x64").
    platform_hashes: BTreeMap<String, String>,
    // Artifact metadata, keyed by the same platform names.
    platform_artifacts: BTreeMap<String, PinnedPdfiumArtifact>,
    // Path of the profile documentation; must be "docs/pdfium-profile.md".
    profile_doc: String,
}
550
// Distribution metadata of the pinned PDFium binaries.
// The expected values are enforced by `validate_pinned_pdfium_distribution`.
#[derive(Debug, Deserialize)]
struct PinnedPdfiumDistribution {
    // Source project; must be "bblanchon/pdfium-binaries".
    source: String,
    // Release URL; must start with the GitHub release-tag prefix of that project.
    release_url: String,
    // Publication timestamp; must end with 'Z'.
    published_at: String,
    // Attestation file accompanying the release.
    attestation: PinnedPdfiumAttestation,
}
558
// Attestation file reference of the pinned PDFium release.
#[derive(Debug, Deserialize)]
struct PinnedPdfiumAttestation {
    // File name; must be "pdfium-attestation.json".
    name: String,
    // SHA-256 as 64 lowercase hex digits.
    sha256: String,
}
564
// Build flags recorded for the pinned PDFium binaries.
// `validate_pinned_pdfium_build_flags` requires `pdf_is_standalone` to be true and every
// other flag to be false.
#[derive(Debug, Deserialize)]
struct PinnedPdfiumBuildFlags {
    // Must be false.
    is_component_build: bool,
    // Must be false.
    is_debug: bool,
    // Must be false.
    pdf_enable_v8: bool,
    // Must be false.
    pdf_enable_xfa: bool,
    // Must be true.
    pdf_is_standalone: bool,
    // Must be false.
    pdf_use_partition_alloc: bool,
}
574
// Per-platform artifact metadata of the pinned PDFium release.
// The expected values are enforced by `validate_pinned_pdfium_platforms`.
#[derive(Debug, Deserialize)]
struct PinnedPdfiumArtifact {
    // Archive file name (a `.tgz` without "-v8-" or "xfa" in its name).
    name: String,
    // Target OS label ("mac", "linux" or "win").
    target_os: String,
    // Target CPU label ("arm64" or "x64").
    target_cpu: String,
    // Path of the runtime library (must be non-empty; presumably relative to the archive root — confirm).
    runtime_library_path: String,
    // SHA-256 of the runtime library as 64 lowercase hex digits; compared against the loaded library file.
    runtime_library_sha256: String,
}
583
584fn pinned_pdfium_profile() -> &'static PinnedPdfiumBackend {
585 PINNED_PDFIUM_PROFILE.get_or_init(|| {
586 let profile: DeterministicProfile = serde_json::from_str(DETERMINISTIC_PROFILE_JSON)
587 .expect("profiles/ethos-deterministic-v1.json is valid JSON");
588 validate_pinned_pdfium_profile(&profile.backend)
589 .expect("profiles/ethos-deterministic-v1.json pins a valid PDFium Phase 1 profile");
590 profile.backend
591 })
592}
593
594fn validate_pinned_pdfium_profile(profile: &PinnedPdfiumBackend) -> Result<(), &'static str> {
595 validate_pinned_pdfium_identity(profile)?;
596 validate_pinned_pdfium_distribution(&profile.distribution)?;
597 validate_pinned_pdfium_build_flags(&profile.build_flags)?;
598 validate_pinned_pdfium_platforms(profile)?;
599 Ok(())
600}
601
602fn validate_pinned_pdfium_identity(profile: &PinnedPdfiumBackend) -> Result<(), &'static str> {
603 if profile.id != "pdfium"
604 || profile.phase != 1
605 || profile.version != "chromium/7881"
606 || profile.upstream_version != "PDFium 151.0.7881.0"
607 || profile.v8 != "disabled"
608 || profile.xfa != "disabled"
609 || profile.profile_doc != "docs/pdfium-profile.md"
610 {
611 return Err("unexpected PDFium profile identity");
612 }
613 Ok(())
614}
615
616fn validate_pinned_pdfium_distribution(
617 distribution: &PinnedPdfiumDistribution,
618) -> Result<(), &'static str> {
619 if distribution.source != "bblanchon/pdfium-binaries"
620 || distribution.attestation.name != "pdfium-attestation.json"
621 || !is_sha256_hex(&distribution.attestation.sha256)
622 || !distribution
623 .release_url
624 .starts_with("https://github.com/bblanchon/pdfium-binaries/releases/tag/")
625 || !distribution.published_at.ends_with('Z')
626 {
627 return Err("unexpected PDFium distribution metadata");
628 }
629 Ok(())
630}
631
632fn validate_pinned_pdfium_build_flags(
633 build_flags: &PinnedPdfiumBuildFlags,
634) -> Result<(), &'static str> {
635 if build_flags.is_component_build
636 || build_flags.is_debug
637 || build_flags.pdf_enable_v8
638 || build_flags.pdf_enable_xfa
639 || !build_flags.pdf_is_standalone
640 || build_flags.pdf_use_partition_alloc
641 {
642 return Err("PDFium Phase 1 must be standalone release with V8/XFA disabled");
643 }
644 Ok(())
645}
646
647fn validate_pinned_pdfium_platforms(profile: &PinnedPdfiumBackend) -> Result<(), &'static str> {
648 for platform in ["macos-arm64", "linux-x64", "windows-x64"] {
649 let artifact_hash = profile
650 .platform_hashes
651 .get(platform)
652 .ok_or("missing PDFium artifact hash")?;
653 if !is_sha256_hex(artifact_hash) {
654 return Err("malformed PDFium artifact hash");
655 }
656 let artifact = profile
657 .platform_artifacts
658 .get(platform)
659 .ok_or("missing PDFium platform artifact metadata")?;
660 if artifact.name.contains("-v8-")
661 || artifact.name.contains("xfa")
662 || !artifact.name.ends_with(".tgz")
663 || artifact.runtime_library_path.is_empty()
664 || !is_sha256_hex(&artifact.runtime_library_sha256)
665 {
666 return Err("malformed PDFium platform artifact metadata");
667 }
668 match platform {
669 "macos-arm64"
670 if artifact.name == "pdfium-mac-arm64.tgz"
671 && artifact.target_os == "mac"
672 && artifact.target_cpu == "arm64" => {}
673 "linux-x64"
674 if artifact.name == "pdfium-linux-x64.tgz"
675 && artifact.target_os == "linux"
676 && artifact.target_cpu == "x64" => {}
677 "windows-x64"
678 if artifact.name == "pdfium-win-x64.tgz"
679 && artifact.target_os == "win"
680 && artifact.target_cpu == "x64" => {}
681 _ => return Err("unexpected PDFium platform artifact"),
682 }
683 }
684 Ok(())
685}
686
// Returns true when `value` is exactly 64 characters of lowercase hexadecimal.
fn is_sha256_hex(value: &str) -> bool {
    if value.len() != 64 {
        return false;
    }
    value
        .bytes()
        .all(|byte| matches!(byte, b'0'..=b'9' | b'a'..=b'f'))
}
693
// Returns the profile key for the compilation target, or `None` for unsupported targets.
fn current_platform_key() -> Option<&'static str> {
    const IS_MACOS_ARM64: bool = cfg!(all(target_os = "macos", target_arch = "aarch64"));
    const IS_LINUX_X64: bool = cfg!(all(target_os = "linux", target_arch = "x86_64"));
    const IS_WINDOWS_X64: bool = cfg!(all(target_os = "windows", target_arch = "x86_64"));

    match (IS_MACOS_ARM64, IS_LINUX_X64, IS_WINDOWS_X64) {
        (true, _, _) => Some("macos-arm64"),
        (_, true, _) => Some("linux-x64"),
        (_, _, true) => Some("windows-x64"),
        _ => None,
    }
}
705
706fn current_pdfium_pins(
707 profile: &PinnedPdfiumBackend,
708) -> Result<(&'static str, &str, &PinnedPdfiumArtifact), EthosError> {
709 let platform = current_platform_key().ok_or_else(|| {
710 EthosError::internal("pdfium phase 1 profile has no hash for this platform")
711 })?;
712 let artifact_hash = profile.platform_hashes.get(platform).ok_or_else(|| {
713 EthosError::internal("pdfium phase 1 profile has no hash for this platform")
714 })?;
715 let artifact = profile.platform_artifacts.get(platform).ok_or_else(|| {
716 EthosError::internal("pdfium phase 1 profile has no artifact for this platform")
717 })?;
718 Ok((platform, artifact_hash.as_str(), artifact))
719}
720
721fn validate_pinned_pdfium_payload(
722 backend: &PdfiumBackend,
723 library_path: &Path,
724) -> Result<(), EthosError> {
725 let profile = pinned_pdfium_profile();
726 if let Some(version) = backend.configured_version_override() {
727 let upstream_number = profile
728 .upstream_version
729 .strip_prefix("PDFium ")
730 .unwrap_or(&profile.upstream_version);
731 if version != profile.version
732 && version != profile.upstream_version
733 && version != upstream_number
734 {
735 return Err(EthosError::internal(
736 "pdfium version does not match pinned phase 1 profile",
737 ));
738 }
739 }
740
741 let (_, artifact_hash, artifact) = current_pdfium_pins(profile)?;
742 if let Some(artifact_path) = backend.configured_artifact_path() {
743 if !artifact_path.is_file() {
744 return Err(EthosError::internal(
745 "pdfium artifact path does not point to a file",
746 ));
747 }
748 let actual_artifact_hash = sha256_file(&artifact_path)?;
749 if actual_artifact_hash != artifact_hash {
750 return Err(EthosError::internal(
751 "pdfium artifact does not match pinned phase 1 profile",
752 ));
753 }
754 }
755
756 let library_hash = sha256_file(library_path)?;
757 if library_hash != artifact.runtime_library_sha256 {
758 return Err(EthosError::internal(
759 "pdfium library does not match pinned phase 1 profile",
760 ));
761 }
762
763 Ok(())
764}
765
766fn sha256_file(path: &Path) -> Result<String, EthosError> {
767 let bytes =
768 std::fs::read(path).map_err(|_| EthosError::internal("failed to read pdfium payload"))?;
769 Ok(ethos_core::c14n::sha256_hex_bytes(&bytes))
770}
771
// Opaque PDFium handles. They are plain untyped pointers on the Rust side, so the
// aliases document intent only and do not prevent mixing handle kinds.
type FpdfDocument = *mut c_void;
type FpdfPage = *mut c_void;
type FpdfTextPage = *mut c_void;
type FpdfBitmap = *mut c_void;
776
// Function-pointer types for the PDFium entry points resolved in `PdfiumFunctions::load`.
// Each alias is declared twice: with the "C" ABI on non-Windows targets and with the
// "system" ABI on Windows. The symbol named in each comment is the one the alias is
// bound to in `PdfiumFunctions::load`.

// FPDF_InitLibrary
#[cfg(not(windows))]
type FpdfInitLibrary = unsafe extern "C" fn();
#[cfg(windows)]
type FpdfInitLibrary = unsafe extern "system" fn();
// FPDF_DestroyLibrary
#[cfg(not(windows))]
type FpdfDestroyLibrary = unsafe extern "C" fn();
#[cfg(windows)]
type FpdfDestroyLibrary = unsafe extern "system" fn();
// FPDF_LoadMemDocument64: (buffer, buffer length, password) -> document handle
#[cfg(not(windows))]
type FpdfLoadMemDocument64 =
    unsafe extern "C" fn(*const c_void, usize, *const c_char) -> FpdfDocument;
#[cfg(windows)]
type FpdfLoadMemDocument64 =
    unsafe extern "system" fn(*const c_void, usize, *const c_char) -> FpdfDocument;
// FPDF_CloseDocument
#[cfg(not(windows))]
type FpdfCloseDocument = unsafe extern "C" fn(FpdfDocument);
#[cfg(windows)]
type FpdfCloseDocument = unsafe extern "system" fn(FpdfDocument);
// FPDF_GetLastError
#[cfg(not(windows))]
type FpdfGetLastError = unsafe extern "C" fn() -> c_ulong;
#[cfg(windows)]
type FpdfGetLastError = unsafe extern "system" fn() -> c_ulong;
// FPDF_GetPageCount
#[cfg(not(windows))]
type FpdfGetPageCount = unsafe extern "C" fn(FpdfDocument) -> c_int;
#[cfg(windows)]
type FpdfGetPageCount = unsafe extern "system" fn(FpdfDocument) -> c_int;
// FPDF_LoadPage: (document, zero-based page index) -> page handle
#[cfg(not(windows))]
type FpdfLoadPage = unsafe extern "C" fn(FpdfDocument, c_int) -> FpdfPage;
#[cfg(windows)]
type FpdfLoadPage = unsafe extern "system" fn(FpdfDocument, c_int) -> FpdfPage;
// FPDF_ClosePage
#[cfg(not(windows))]
type FpdfClosePage = unsafe extern "C" fn(FpdfPage);
#[cfg(windows)]
type FpdfClosePage = unsafe extern "system" fn(FpdfPage);
// FPDF_GetPageWidthF
#[cfg(not(windows))]
type FpdfGetPageWidthF = unsafe extern "C" fn(FpdfPage) -> f32;
#[cfg(windows)]
type FpdfGetPageWidthF = unsafe extern "system" fn(FpdfPage) -> f32;
// FPDF_GetPageHeightF
#[cfg(not(windows))]
type FpdfGetPageHeightF = unsafe extern "C" fn(FpdfPage) -> f32;
#[cfg(windows)]
type FpdfGetPageHeightF = unsafe extern "system" fn(FpdfPage) -> f32;
// FPDFPage_GetRotation (optional symbol)
#[cfg(not(windows))]
type FpdfPageGetRotation = unsafe extern "C" fn(FpdfPage) -> c_int;
#[cfg(windows)]
type FpdfPageGetRotation = unsafe extern "system" fn(FpdfPage) -> c_int;
// FPDFText_LoadPage
#[cfg(not(windows))]
type FpdfTextLoadPage = unsafe extern "C" fn(FpdfPage) -> FpdfTextPage;
#[cfg(windows)]
type FpdfTextLoadPage = unsafe extern "system" fn(FpdfPage) -> FpdfTextPage;
// FPDFText_ClosePage
#[cfg(not(windows))]
type FpdfTextClosePage = unsafe extern "C" fn(FpdfTextPage);
#[cfg(windows)]
type FpdfTextClosePage = unsafe extern "system" fn(FpdfTextPage);
// FPDFText_CountChars
#[cfg(not(windows))]
type FpdfTextCountChars = unsafe extern "C" fn(FpdfTextPage, c_int) -> c_int;
#[cfg(windows)]
type FpdfTextCountChars = unsafe extern "system" fn(FpdfTextPage) -> c_int;
// FPDFText_GetUnicode
#[cfg(not(windows))]
type FpdfTextGetUnicode = unsafe extern "C" fn(FpdfTextPage, c_int) -> u32;
#[cfg(windows)]
type FpdfTextGetUnicode = unsafe extern "system" fn(FpdfTextPage, c_int) -> u32;
// FPDFText_GetCharBox: four out-pointers receive the box edges
#[cfg(not(windows))]
type FpdfTextGetCharBox =
    unsafe extern "C" fn(FpdfTextPage, c_int, *mut f64, *mut f64, *mut f64, *mut f64) -> c_int;
#[cfg(windows)]
type FpdfTextGetCharBox =
    unsafe extern "system" fn(FpdfTextPage, c_int, *mut f64, *mut f64, *mut f64, *mut f64) -> c_int;
// FPDFText_GetLooseCharBox (optional symbol)
#[cfg(not(windows))]
type FpdfTextGetLooseCharBox = unsafe extern "C" fn(FpdfTextPage, c_int, *mut FsRectF) -> c_int;
#[cfg(windows)]
type FpdfTextGetLooseCharBox =
    unsafe extern "system" fn(FpdfTextPage, c_int, *mut FsRectF) -> c_int;
// FPDFText_GetCharOrigin (optional symbol)
#[cfg(not(windows))]
type FpdfTextGetCharOrigin = unsafe extern "C" fn(FpdfTextPage, c_int, *mut f64, *mut f64) -> c_int;
#[cfg(windows)]
type FpdfTextGetCharOrigin =
    unsafe extern "system" fn(FpdfTextPage, c_int, *mut f64, *mut f64) -> c_int;
// FPDFText_CountRects (optional symbol)
#[cfg(not(windows))]
type FpdfTextCountRects = unsafe extern "C" fn(FpdfTextPage, c_int, c_int) -> c_int;
#[cfg(windows)]
type FpdfTextCountRects = unsafe extern "system" fn(FpdfTextPage, c_int, c_int) -> c_int;
// FPDFText_GetRect (optional symbol)
#[cfg(not(windows))]
type FpdfTextGetRect =
    unsafe extern "C" fn(FpdfTextPage, c_int, *mut f64, *mut f64, *mut f64, *mut f64) -> c_int;
#[cfg(windows)]
type FpdfTextGetRect =
    unsafe extern "system" fn(FpdfTextPage, c_int, *mut f64, *mut f64, *mut f64, *mut f64) -> c_int;
// FPDFText_GetFontSize
#[cfg(not(windows))]
type FpdfTextGetFontSize = unsafe extern "C" fn(FpdfTextPage, c_int) -> f64;
#[cfg(windows)]
type FpdfTextGetFontSize = unsafe extern "system" fn(FpdfTextPage, c_int) -> f64;
// FPDFText_GetFontInfo (optional symbol)
#[cfg(not(windows))]
type FpdfTextGetFontInfo =
    unsafe extern "C" fn(FpdfTextPage, c_int, *mut c_void, c_ulong, *mut c_int) -> c_ulong;
#[cfg(windows)]
type FpdfTextGetFontInfo =
    unsafe extern "system" fn(FpdfTextPage, c_int, *mut c_void, c_ulong, *mut c_int) -> c_ulong;
// FPDFText_IsGenerated (optional symbol)
#[cfg(not(windows))]
type FpdfTextIsGenerated = unsafe extern "C" fn(FpdfTextPage, c_int) -> c_int;
#[cfg(windows)]
type FpdfTextIsGenerated = unsafe extern "system" fn(FpdfTextPage, c_int) -> c_int;
// FPDFText_IsHyphen (optional symbol)
#[cfg(not(windows))]
type FpdfTextIsHyphen = unsafe extern "C" fn(FpdfTextPage, c_int) -> c_int;
#[cfg(windows)]
type FpdfTextIsHyphen = unsafe extern "system" fn(FpdfTextPage, c_int) -> c_int;
// FPDFBitmap_Create (optional symbol)
#[cfg(not(windows))]
type FpdfBitmapCreate = unsafe extern "C" fn(c_int, c_int, c_int) -> FpdfBitmap;
#[cfg(windows)]
type FpdfBitmapCreate = unsafe extern "system" fn(c_int, c_int, c_int) -> FpdfBitmap;
// FPDFBitmap_Destroy (optional symbol)
#[cfg(not(windows))]
type FpdfBitmapDestroy = unsafe extern "C" fn(FpdfBitmap);
#[cfg(windows)]
type FpdfBitmapDestroy = unsafe extern "system" fn(FpdfBitmap);
// FPDFBitmap_FillRect (optional symbol)
#[cfg(not(windows))]
type FpdfBitmapFillRect = unsafe extern "C" fn(FpdfBitmap, c_int, c_int, c_int, c_int, c_ulong);
#[cfg(windows)]
type FpdfBitmapFillRect =
    unsafe extern "system" fn(FpdfBitmap, c_int, c_int, c_int, c_int, c_ulong);
// FPDFBitmap_GetBuffer (optional symbol)
#[cfg(not(windows))]
type FpdfBitmapGetBuffer = unsafe extern "C" fn(FpdfBitmap) -> *mut c_void;
#[cfg(windows)]
type FpdfBitmapGetBuffer = unsafe extern "system" fn(FpdfBitmap) -> *mut c_void;
// FPDFBitmap_GetStride (optional symbol)
#[cfg(not(windows))]
type FpdfBitmapGetStride = unsafe extern "C" fn(FpdfBitmap) -> c_int;
#[cfg(windows)]
type FpdfBitmapGetStride = unsafe extern "system" fn(FpdfBitmap) -> c_int;
// FPDF_RenderPageBitmap (optional symbol)
#[cfg(not(windows))]
type FpdfRenderPageBitmap =
    unsafe extern "C" fn(FpdfBitmap, FpdfPage, c_int, c_int, c_int, c_int, c_int, c_int);
#[cfg(windows)]
type FpdfRenderPageBitmap =
    unsafe extern "system" fn(FpdfBitmap, FpdfPage, c_int, c_int, c_int, c_int, c_int, c_int);
910
// C-layout rectangle of four `f32` edges, used as the out-parameter of
// `FPDFText_GetLooseCharBox`. The field order is part of the FFI layout.
#[repr(C)]
#[derive(Clone, Copy, Debug, Default)]
struct FsRectF {
    left: f32,
    top: f32,
    right: f32,
    bottom: f32,
}
919
// Table of PDFium entry points resolved from the loaded shared library.
//
// Plain fields are required symbols (loading fails without them); `Option` fields are
// optional symbols that may be absent from the library.
#[derive(Clone, Copy)]
struct PdfiumFunctions {
    // Library lifecycle and document/page access (all required).
    init_library: FpdfInitLibrary,
    destroy_library: FpdfDestroyLibrary,
    load_mem_document64: FpdfLoadMemDocument64,
    close_document: FpdfCloseDocument,
    get_last_error: FpdfGetLastError,
    get_page_count: FpdfGetPageCount,
    load_page: FpdfLoadPage,
    close_page: FpdfClosePage,
    get_page_width_f: FpdfGetPageWidthF,
    get_page_height_f: FpdfGetPageHeightF,
    // Optional: when missing, `PdfPage::rotation` reports 0.
    page_get_rotation: Option<FpdfPageGetRotation>,
    // Text extraction.
    text_load_page: FpdfTextLoadPage,
    text_close_page: FpdfTextClosePage,
    text_count_chars: FpdfTextCountChars,
    text_get_unicode: FpdfTextGetUnicode,
    text_get_char_box: FpdfTextGetCharBox,
    text_get_loose_char_box: Option<FpdfTextGetLooseCharBox>,
    text_get_char_origin: Option<FpdfTextGetCharOrigin>,
    text_count_rects: Option<FpdfTextCountRects>,
    text_get_rect: Option<FpdfTextGetRect>,
    text_get_font_size: FpdfTextGetFontSize,
    text_get_font_info: Option<FpdfTextGetFontInfo>,
    text_is_generated: Option<FpdfTextIsGenerated>,
    text_is_hyphen: Option<FpdfTextIsHyphen>,
    // Bitmap rendering (all optional).
    bitmap_create: Option<FpdfBitmapCreate>,
    bitmap_destroy: Option<FpdfBitmapDestroy>,
    bitmap_fill_rect: Option<FpdfBitmapFillRect>,
    bitmap_get_buffer: Option<FpdfBitmapGetBuffer>,
    bitmap_get_stride: Option<FpdfBitmapGetStride>,
    render_page_bitmap: Option<FpdfRenderPageBitmap>,
}
953
impl PdfiumFunctions {
    // Resolves every PDFium entry point from `library`.
    //
    // Required symbols propagate an error when missing; optional symbols are stored
    // as whatever `optional_symbol` returns.
    fn load(library: &dylib::Library) -> Result<Self, EthosError> {
        // SAFETY: NOTE(review): `dylib::Library::symbol` / `optional_symbol` are defined
        // outside the visible part of the file. This block presumably relies on each
        // exported symbol having exactly the signature of the alias it is stored as —
        // confirm against the PDFium headers of the pinned release.
        unsafe {
            Ok(PdfiumFunctions {
                init_library: library.symbol(b"FPDF_InitLibrary\0")?,
                destroy_library: library.symbol(b"FPDF_DestroyLibrary\0")?,
                load_mem_document64: library.symbol(b"FPDF_LoadMemDocument64\0")?,
                close_document: library.symbol(b"FPDF_CloseDocument\0")?,
                get_last_error: library.symbol(b"FPDF_GetLastError\0")?,
                get_page_count: library.symbol(b"FPDF_GetPageCount\0")?,
                load_page: library.symbol(b"FPDF_LoadPage\0")?,
                close_page: library.symbol(b"FPDF_ClosePage\0")?,
                get_page_width_f: library.symbol(b"FPDF_GetPageWidthF\0")?,
                get_page_height_f: library.symbol(b"FPDF_GetPageHeightF\0")?,
                page_get_rotation: library.optional_symbol(b"FPDFPage_GetRotation\0"),
                text_load_page: library.symbol(b"FPDFText_LoadPage\0")?,
                text_close_page: library.symbol(b"FPDFText_ClosePage\0")?,
                text_count_chars: library.symbol(b"FPDFText_CountChars\0")?,
                text_get_unicode: library.symbol(b"FPDFText_GetUnicode\0")?,
                text_get_char_box: library.symbol(b"FPDFText_GetCharBox\0")?,
                text_get_loose_char_box: library.optional_symbol(b"FPDFText_GetLooseCharBox\0"),
                text_get_char_origin: library.optional_symbol(b"FPDFText_GetCharOrigin\0"),
                text_count_rects: library.optional_symbol(b"FPDFText_CountRects\0"),
                text_get_rect: library.optional_symbol(b"FPDFText_GetRect\0"),
                text_get_font_size: library.symbol(b"FPDFText_GetFontSize\0")?,
                text_get_font_info: library.optional_symbol(b"FPDFText_GetFontInfo\0"),
                text_is_generated: library.optional_symbol(b"FPDFText_IsGenerated\0"),
                text_is_hyphen: library.optional_symbol(b"FPDFText_IsHyphen\0"),
                bitmap_create: library.optional_symbol(b"FPDFBitmap_Create\0"),
                bitmap_destroy: library.optional_symbol(b"FPDFBitmap_Destroy\0"),
                bitmap_fill_rect: library.optional_symbol(b"FPDFBitmap_FillRect\0"),
                bitmap_get_buffer: library.optional_symbol(b"FPDFBitmap_GetBuffer\0"),
                bitmap_get_stride: library.optional_symbol(b"FPDFBitmap_GetStride\0"),
                render_page_bitmap: library.optional_symbol(b"FPDF_RenderPageBitmap\0"),
            })
        }
    }

    // Reports which optional geometry symbols were resolved. Text rectangles need both
    // the counting and the fetching function.
    fn geometry_probe_symbols(self) -> GeometryProbeSymbols {
        GeometryProbeSymbols {
            char_origin: self.text_get_char_origin.is_some(),
            loose_char_box: self.text_get_loose_char_box.is_some(),
            text_rects: self.text_count_rects.is_some() && self.text_get_rect.is_some(),
        }
    }
}
1002
// A loaded and initialised PDFium library. Dropping it calls `FPDF_DestroyLibrary`
// when `initialized` is set.
struct PdfiumRuntime {
    // Owns the shared library (presumably keeps it mapped so the pointers in `funcs`
    // stay valid — confirm against `dylib::Library`, which is outside this view).
    _library: dylib::Library,
    // Entry points resolved from `_library`.
    funcs: PdfiumFunctions,
    // Whether `FPDF_InitLibrary` has been called and must be balanced on drop.
    initialized: bool,
}
1008
1009impl PdfiumRuntime {
1010 fn load(backend: &PdfiumBackend) -> Result<Self, EthosError> {
1011 let path = backend.configured_library_path().ok_or_else(|| {
1012 EthosError::internal(format!(
1013 "pdfium library path is not configured; set {PDFIUM_LIBRARY_PATH_ENV}"
1014 ))
1015 })?;
1016 if !path.is_file() {
1017 return Err(EthosError::internal(
1018 "pdfium library path does not point to a file",
1019 ));
1020 }
1021 validate_pinned_pdfium_payload(backend, &path)?;
1022
1023 let library = dylib::Library::open(&path)?;
1024 let funcs = PdfiumFunctions::load(&library)?;
1025 unsafe { (funcs.init_library)() };
1028 Ok(PdfiumRuntime {
1029 _library: library,
1030 funcs,
1031 initialized: true,
1032 })
1033 }
1034
1035 fn load_document<'a>(&'a self, pdf_bytes: &[u8]) -> Result<PdfDocument<'a>, EthosError> {
1036 let handle = unsafe {
1039 (self.funcs.load_mem_document64)(
1040 pdf_bytes.as_ptr().cast(),
1041 pdf_bytes.len(),
1042 ptr::null(),
1043 )
1044 };
1045 if handle.is_null() {
1046 let code = unsafe { (self.funcs.get_last_error)() };
1048 Err(map_pdfium_error(code))
1049 } else {
1050 Ok(PdfDocument {
1051 funcs: &self.funcs,
1052 handle,
1053 })
1054 }
1055 }
1056}
1057
1058impl Drop for PdfiumRuntime {
1059 fn drop(&mut self) {
1060 if self.initialized {
1061 unsafe { (self.funcs.destroy_library)() };
1063 }
1064 }
1065}
1066
1067struct PdfDocument<'a> {
1068 funcs: &'a PdfiumFunctions,
1069 handle: FpdfDocument,
1070}
1071
1072impl PdfDocument<'_> {
1073 fn page_count(&self) -> Result<u32, EthosError> {
1074 let count = unsafe { (self.funcs.get_page_count)(self.handle) };
1076 if count <= 0 {
1077 return Err(EthosError::new(
1078 ErrorCode::CorruptPdf,
1079 "PDF has no readable pages",
1080 ));
1081 }
1082 u32::try_from(count).map_err(|_| EthosError::internal("page count overflow"))
1083 }
1084
1085 fn load_page(&self, page_index: u32) -> Result<PdfPage<'_>, EthosError> {
1086 let index =
1087 c_int::try_from(page_index).map_err(|_| EthosError::internal("page overflow"))?;
1088 let handle = unsafe { (self.funcs.load_page)(self.handle, index) };
1090 if handle.is_null() {
1091 let code = unsafe { (self.funcs.get_last_error)() };
1093 Err(map_pdfium_error(code))
1094 } else {
1095 Ok(PdfPage {
1096 funcs: self.funcs,
1097 handle,
1098 })
1099 }
1100 }
1101}
1102
1103impl Drop for PdfDocument<'_> {
1104 fn drop(&mut self) {
1105 unsafe { (self.funcs.close_document)(self.handle) };
1107 }
1108}
1109
1110struct PdfPage<'a> {
1111 funcs: &'a PdfiumFunctions,
1112 handle: FpdfPage,
1113}
1114
1115impl PdfPage<'_> {
1116 fn width_pts(&self) -> f64 {
1117 unsafe { (self.funcs.get_page_width_f)(self.handle) as f64 }
1119 }
1120
1121 fn height_pts(&self) -> f64 {
1122 unsafe { (self.funcs.get_page_height_f)(self.handle) as f64 }
1124 }
1125
1126 fn rotation(&self) -> u16 {
1127 let Some(get_rotation) = self.funcs.page_get_rotation else {
1128 return 0;
1129 };
1130 match unsafe { get_rotation(self.handle) }.rem_euclid(4) {
1132 1 => 90,
1133 2 => 180,
1134 3 => 270,
1135 _ => 0,
1136 }
1137 }
1138
1139 fn model_page(&self, original_page: u32) -> Result<Page, EthosError> {
1140 Ok(Page {
1141 id: page_id(original_page)?,
1142 index: original_page,
1143 width: quantize_coord(self.width_pts())?,
1144 height: quantize_coord(self.height_pts())?,
1145 rotation: self.rotation(),
1146 })
1147 }
1148
1149 fn geometry_probe_page(&self, original_page: u32) -> Result<GeometryProbePage, EthosError> {
1150 let page = self.model_page(original_page)?;
1151 let text_handle = unsafe { (self.funcs.text_load_page)(self.handle) };
1153 if text_handle.is_null() {
1154 return Ok(GeometryProbePage {
1155 id: page.id,
1156 index: page.index,
1157 width: page.width,
1158 height: page.height,
1159 rotation: page.rotation,
1160 char_count: 0,
1161 symbols: self.funcs.geometry_probe_symbols(),
1162 chars: Vec::new(),
1163 runs: Vec::new(),
1164 });
1165 }
1166 let text_page = PdfTextPage {
1167 funcs: self.funcs,
1168 handle: text_handle,
1169 };
1170 text_page.geometry_probe(&page, self.height_pts())
1171 }
1172
1173 fn extract_text_spans(
1174 &self,
1175 page: &Page,
1176 next_span: &mut u32,
1177 spans: &mut Vec<Span>,
1178 ) -> Result<(), EthosError> {
1179 let text_handle = unsafe { (self.funcs.text_load_page)(self.handle) };
1181 if text_handle.is_null() {
1182 return Ok(());
1183 }
1184 let text_page = PdfTextPage {
1185 funcs: self.funcs,
1186 handle: text_handle,
1187 };
1188 text_page.extract_runs(page, self.height_pts(), next_span, spans)
1189 }
1190
1191 fn render_crop_raw(&self, page_index: u32, bbox: QRect) -> Result<RawCrop, EthosError> {
1192 let bitmap = RenderBitmap::render_page(
1193 self.funcs,
1194 self.handle,
1195 pixel_extent(self.width_pts())?,
1196 pixel_extent(self.height_pts())?,
1197 )?;
1198 let (x0, y0, width_px, height_px) = crop_window(bbox, bitmap.width_px, bitmap.height_px)?;
1199 let bytes = bitmap.crop_bytes(x0, y0, width_px, height_px)?;
1200 Ok(RawCrop {
1201 page_index,
1202 bbox,
1203 width_px,
1204 height_px,
1205 stride: width_px
1206 .checked_mul(4)
1207 .ok_or_else(|| EthosError::internal("crop stride overflow"))?,
1208 pixel_format: "bgra_8u",
1209 sha256: ethos_core::c14n::sha256_hex_bytes(&bytes),
1210 bytes,
1211 })
1212 }
1213}
1214
1215impl Drop for PdfPage<'_> {
1216 fn drop(&mut self) {
1217 unsafe { (self.funcs.close_page)(self.handle) };
1219 }
1220}
1221
1222struct PdfTextPage<'a> {
1223 funcs: &'a PdfiumFunctions,
1224 handle: FpdfTextPage,
1225}
1226
1227struct RenderBitmap<'a> {
1228 funcs: &'a PdfiumFunctions,
1229 handle: FpdfBitmap,
1230 width_px: u32,
1231 height_px: u32,
1232 stride: usize,
1233}
1234
1235impl RenderBitmap<'_> {
1236 fn render_page(
1237 funcs: &PdfiumFunctions,
1238 page: FpdfPage,
1239 width_px: u32,
1240 height_px: u32,
1241 ) -> Result<RenderBitmap<'_>, EthosError> {
1242 let Some(bitmap_create) = funcs.bitmap_create else {
1243 return Err(EthosError::internal(
1244 "pdfium library is missing bitmap render symbols",
1245 ));
1246 };
1247 let Some(bitmap_fill_rect) = funcs.bitmap_fill_rect else {
1248 return Err(EthosError::internal(
1249 "pdfium library is missing bitmap render symbols",
1250 ));
1251 };
1252 let Some(render_page_bitmap) = funcs.render_page_bitmap else {
1253 return Err(EthosError::internal(
1254 "pdfium library is missing bitmap render symbols",
1255 ));
1256 };
1257 let width = c_int::try_from(width_px)
1258 .map_err(|_| EthosError::internal("render bitmap width overflow"))?;
1259 let height = c_int::try_from(height_px)
1260 .map_err(|_| EthosError::internal("render bitmap height overflow"))?;
1261
1262 let handle = unsafe { bitmap_create(width, height, 1) };
1264 if handle.is_null() {
1265 return Err(EthosError::internal(
1266 "pdfium failed to allocate render bitmap",
1267 ));
1268 }
1269 let mut bitmap = RenderBitmap {
1270 funcs,
1271 handle,
1272 width_px,
1273 height_px,
1274 stride: 0,
1275 };
1276 unsafe { bitmap_fill_rect(bitmap.handle, 0, 0, width, height, 0xFFFF_FFFF) };
1278 unsafe { render_page_bitmap(bitmap.handle, page, 0, 0, width, height, 0, 0) };
1280 bitmap.stride = bitmap.read_stride()?;
1281 Ok(bitmap)
1282 }
1283
1284 fn read_stride(&self) -> Result<usize, EthosError> {
1285 let Some(bitmap_get_stride) = self.funcs.bitmap_get_stride else {
1286 return Err(EthosError::internal(
1287 "pdfium library is missing bitmap render symbols",
1288 ));
1289 };
1290 let stride = unsafe { bitmap_get_stride(self.handle) };
1292 if stride <= 0 {
1293 return Err(EthosError::internal(
1294 "pdfium render bitmap has invalid stride",
1295 ));
1296 }
1297 usize::try_from(stride).map_err(|_| EthosError::internal("render bitmap stride overflow"))
1298 }
1299
1300 fn crop_bytes(
1301 &self,
1302 x0: u32,
1303 y0: u32,
1304 width_px: u32,
1305 height_px: u32,
1306 ) -> Result<Vec<u8>, EthosError> {
1307 let Some(bitmap_get_buffer) = self.funcs.bitmap_get_buffer else {
1308 return Err(EthosError::internal(
1309 "pdfium library is missing bitmap render symbols",
1310 ));
1311 };
1312 let ptr = unsafe { bitmap_get_buffer(self.handle) };
1314 if ptr.is_null() {
1315 return Err(EthosError::internal("pdfium render bitmap has null buffer"));
1316 }
1317 let full_len = self
1318 .stride
1319 .checked_mul(
1320 usize::try_from(self.height_px)
1321 .map_err(|_| EthosError::internal("render bitmap height overflow"))?,
1322 )
1323 .ok_or_else(|| EthosError::internal("render bitmap buffer length overflow"))?;
1324 let full = unsafe { slice::from_raw_parts(ptr.cast::<u8>(), full_len) };
1326
1327 let x0 = usize::try_from(x0).map_err(|_| EthosError::internal("crop x overflow"))?;
1328 let y0 = usize::try_from(y0).map_err(|_| EthosError::internal("crop y overflow"))?;
1329 let width =
1330 usize::try_from(width_px).map_err(|_| EthosError::internal("crop width overflow"))?;
1331 let height =
1332 usize::try_from(height_px).map_err(|_| EthosError::internal("crop height overflow"))?;
1333 let row_bytes = width
1334 .checked_mul(4)
1335 .ok_or_else(|| EthosError::internal("crop row width overflow"))?;
1336 let mut out = Vec::with_capacity(
1337 row_bytes
1338 .checked_mul(height)
1339 .ok_or_else(|| EthosError::internal("crop buffer length overflow"))?,
1340 );
1341 for row in 0..height {
1342 let src_start = y0
1343 .checked_add(row)
1344 .and_then(|y| y.checked_mul(self.stride))
1345 .and_then(|base| base.checked_add(x0.checked_mul(4)?))
1346 .ok_or_else(|| EthosError::internal("crop source offset overflow"))?;
1347 let src_end = src_start
1348 .checked_add(row_bytes)
1349 .ok_or_else(|| EthosError::internal("crop source row overflow"))?;
1350 if src_end > full.len() {
1351 return Err(EthosError::internal(
1352 "crop source row exceeds render bitmap",
1353 ));
1354 }
1355 out.extend_from_slice(&full[src_start..src_end]);
1356 }
1357 Ok(out)
1358 }
1359}
1360
1361impl Drop for RenderBitmap<'_> {
1362 fn drop(&mut self) {
1363 if let Some(bitmap_destroy) = self.funcs.bitmap_destroy {
1364 unsafe { bitmap_destroy(self.handle) };
1366 }
1367 }
1368}
1369
1370impl PdfTextPage<'_> {
1371 fn geometry_probe(
1372 &self,
1373 page: &Page,
1374 page_height_pts: f64,
1375 ) -> Result<GeometryProbePage, EthosError> {
1376 let count = unsafe { (self.funcs.text_count_chars)(self.handle) };
1378 if count < 0 {
1379 return Err(EthosError::new(
1380 ErrorCode::CorruptPdf,
1381 "PDF text page could not be read",
1382 ));
1383 }
1384
1385 let mut chars = Vec::new();
1386 let mut run = GeometryRunBuilder::default();
1387 let mut runs = Vec::new();
1388 let mut next_run = 1u32;
1389 for index in 0..count {
1390 let record = self.geometry_probe_char(index, page_height_pts)?;
1391 match record.parser_action.as_str() {
1392 "include" => {
1393 if run.has_style_change(&record.font_id, record.font_size_q, record.font_flags)
1394 {
1395 run.flush(self, page_height_pts, &mut next_run, &mut runs)?;
1396 }
1397 run.push(&record);
1398 }
1399 "skip_generated_hyphen" => {}
1400 _ => run.flush(self, page_height_pts, &mut next_run, &mut runs)?,
1401 }
1402 chars.push(record);
1403 }
1404 run.flush(self, page_height_pts, &mut next_run, &mut runs)?;
1405
1406 Ok(GeometryProbePage {
1407 id: page.id.clone(),
1408 index: page.index,
1409 width: page.width,
1410 height: page.height,
1411 rotation: page.rotation,
1412 char_count: count,
1413 symbols: self.funcs.geometry_probe_symbols(),
1414 chars,
1415 runs,
1416 })
1417 }
1418
1419 fn geometry_probe_char(
1420 &self,
1421 index: c_int,
1422 page_height_pts: f64,
1423 ) -> Result<GeometryProbeChar, EthosError> {
1424 let unicode = unsafe { (self.funcs.text_get_unicode)(self.handle, index) };
1426 let ch = char::from_u32(unicode);
1427 let parser_action = match ch {
1428 None => "break_invalid_unicode",
1429 Some(_) if self.is_generated_hyphen(index) => "skip_generated_hyphen",
1430 Some(ch) if should_break_text_run(ch) => "break_whitespace_or_control",
1431 Some(_) => "include",
1432 };
1433
1434 let font_info = self.font_info(index);
1435 Ok(GeometryProbeChar {
1436 index,
1437 unicode,
1438 text: ch.map(|ch| ch.to_string()),
1439 parser_action: parser_action.to_string(),
1440 char_box: self.char_bbox(index, page_height_pts)?,
1441 loose_char_box: self.loose_char_bbox(index, page_height_pts)?,
1442 char_origin: self.char_origin(index, page_height_pts)?,
1443 font_id: font_info.font_id,
1444 font_flags: font_info.font_flags,
1445 font_size_q: self.font_size_q(index),
1446 })
1447 }
1448
1449 fn extract_runs(
1450 &self,
1451 page: &Page,
1452 page_height_pts: f64,
1453 next_span: &mut u32,
1454 spans: &mut Vec<Span>,
1455 ) -> Result<(), EthosError> {
1456 let count = unsafe { (self.funcs.text_count_chars)(self.handle) };
1458 if count < 0 {
1459 return Err(EthosError::new(
1462 ErrorCode::CorruptPdf,
1463 "PDF text page could not be read",
1464 ));
1465 }
1466 if count == 0 {
1467 return Ok(());
1468 }
1469
1470 let mut run = SpanRun::default();
1471 for index in 0..count {
1472 let codepoint = unsafe { (self.funcs.text_get_unicode)(self.handle, index) };
1474 let Some(ch) = char::from_u32(codepoint) else {
1475 run.flush(page, next_span, spans)?;
1476 continue;
1477 };
1478 if self.is_generated_hyphen(index) {
1479 continue;
1480 }
1481 if should_break_text_run(ch) {
1482 run.flush(page, next_span, spans)?;
1483 continue;
1484 }
1485
1486 let Some(bbox) = self.char_bbox(index, page_height_pts)? else {
1487 run.flush(page, next_span, spans)?;
1488 continue;
1489 };
1490 let font_size_q = self.font_size_q(index);
1491 let font_info = self.font_info(index);
1492 if run.has_style_change(&font_info.font_id, font_size_q) {
1493 run.flush(page, next_span, spans)?;
1494 }
1495 let origin = self.char_origin(index, page_height_pts)?;
1496 run.push(ch, bbox, origin, font_info.font_id, font_size_q);
1497 }
1498 run.flush(page, next_span, spans)
1499 }
1500
1501 fn char_bbox(&self, index: c_int, page_height_pts: f64) -> Result<Option<QRect>, EthosError> {
1502 let mut left = 0.0f64;
1503 let mut right = 0.0f64;
1504 let mut bottom = 0.0f64;
1505 let mut top = 0.0f64;
1506 let ok = unsafe {
1508 (self.funcs.text_get_char_box)(
1509 self.handle,
1510 index,
1511 &mut left,
1512 &mut right,
1513 &mut bottom,
1514 &mut top,
1515 )
1516 };
1517 if ok == 0 {
1518 return Ok(None);
1519 }
1520 Ok(Some(qrect_from_pdfium_char_box(
1521 page_height_pts,
1522 left,
1523 right,
1524 bottom,
1525 top,
1526 )?))
1527 }
1528
1529 fn loose_char_bbox(
1530 &self,
1531 index: c_int,
1532 page_height_pts: f64,
1533 ) -> Result<Option<QRect>, EthosError> {
1534 let Some(get_loose_char_box) = self.funcs.text_get_loose_char_box else {
1535 return Ok(None);
1536 };
1537 let mut rect = FsRectF::default();
1538 let ok = unsafe { get_loose_char_box(self.handle, index, &mut rect) };
1540 if ok == 0 {
1541 return Ok(None);
1542 }
1543 Ok(Some(qrect_from_pdfium_char_box(
1544 page_height_pts,
1545 f64::from(rect.left),
1546 f64::from(rect.right),
1547 f64::from(rect.bottom),
1548 f64::from(rect.top),
1549 )?))
1550 }
1551
1552 fn char_origin(
1553 &self,
1554 index: c_int,
1555 page_height_pts: f64,
1556 ) -> Result<Option<[i64; 2]>, EthosError> {
1557 let Some(get_char_origin) = self.funcs.text_get_char_origin else {
1558 return Ok(None);
1559 };
1560 let mut x = 0.0f64;
1561 let mut y = 0.0f64;
1562 let ok = unsafe { get_char_origin(self.handle, index, &mut x, &mut y) };
1564 if ok == 0 {
1565 return Ok(None);
1566 }
1567 Ok(Some([
1568 quantize_coord(x)?,
1569 quantize_coord(page_height_pts - y)?,
1570 ]))
1571 }
1572
1573 fn text_rects(
1574 &self,
1575 char_start: c_int,
1576 char_count: c_int,
1577 page_height_pts: f64,
1578 ) -> Result<Vec<QRect>, EthosError> {
1579 let (Some(count_rects), Some(get_rect)) =
1580 (self.funcs.text_count_rects, self.funcs.text_get_rect)
1581 else {
1582 return Ok(Vec::new());
1583 };
1584 if char_count <= 0 {
1585 return Ok(Vec::new());
1586 }
1587 let rect_count = unsafe { count_rects(self.handle, char_start, char_count) };
1589 if rect_count <= 0 {
1590 return Ok(Vec::new());
1591 }
1592 let mut rects = Vec::new();
1593 for rect_index in 0..rect_count {
1594 let mut left = 0.0f64;
1595 let mut top = 0.0f64;
1596 let mut right = 0.0f64;
1597 let mut bottom = 0.0f64;
1598 let ok = unsafe {
1600 get_rect(
1601 self.handle,
1602 rect_index,
1603 &mut left,
1604 &mut top,
1605 &mut right,
1606 &mut bottom,
1607 )
1608 };
1609 if ok != 0 {
1610 rects.push(qrect_from_pdfium_char_box(
1611 page_height_pts,
1612 left,
1613 right,
1614 bottom,
1615 top,
1616 )?);
1617 }
1618 }
1619 Ok(rects)
1620 }
1621
1622 fn font_size_q(&self, index: c_int) -> Option<i64> {
1623 let size = unsafe { (self.funcs.text_get_font_size)(self.handle, index) };
1625 if size <= 0.0 {
1626 return None;
1627 }
1628 quantize(size, QUANTUM_PER_POINT).ok()
1629 }
1630
1631 fn font_info(&self, index: c_int) -> PdfFontInfo {
1632 let Some(get_font_info) = self.funcs.text_get_font_info else {
1633 return PdfFontInfo::default();
1634 };
1635 let len =
1637 unsafe { (get_font_info)(self.handle, index, ptr::null_mut(), 0, ptr::null_mut()) };
1638 if len == 0 || len > 4096 {
1639 return PdfFontInfo::default();
1640 }
1641
1642 let Ok(len_usize) = usize::try_from(len) else {
1643 return PdfFontInfo::default();
1644 };
1645 let mut buffer = vec![0u8; len_usize];
1646 let mut flags = 0;
1647 let written = unsafe {
1649 (get_font_info)(
1650 self.handle,
1651 index,
1652 buffer.as_mut_ptr().cast(),
1653 len,
1654 &mut flags,
1655 )
1656 };
1657 if written == 0 || written > len {
1658 return PdfFontInfo::default();
1659 }
1660 let nul = buffer.iter().position(|b| *b == 0).unwrap_or(buffer.len());
1661 let raw = std::str::from_utf8(&buffer[..nul]).ok();
1662 PdfFontInfo {
1663 font_id: raw.and_then(deterministic_font_id),
1664 font_flags: u32::try_from(flags).ok(),
1665 }
1666 }
1667
1668 fn is_generated_hyphen(&self, index: c_int) -> bool {
1669 let (Some(text_is_generated), Some(text_is_hyphen)) =
1670 (self.funcs.text_is_generated, self.funcs.text_is_hyphen)
1671 else {
1672 return false;
1673 };
1674 unsafe {
1676 text_is_generated(self.handle, index) == 1 && text_is_hyphen(self.handle, index) == 1
1677 }
1678 }
1679}
1680
1681impl Drop for PdfTextPage<'_> {
1682 fn drop(&mut self) {
1683 unsafe { (self.funcs.text_close_page)(self.handle) };
1685 }
1686}
1687
/// Returns `true` for characters that end the current text run: NUL, any
/// whitespace and any control character.
fn should_break_text_run(ch: char) -> bool {
    if ch == '\0' {
        return true;
    }
    ch.is_whitespace() || ch.is_control()
}
1691
1692#[derive(Default)]
1693struct SpanRun {
1694 text: String,
1695 bbox: Option<QRect>,
1696 first_origin: Option<[i64; 2]>,
1697 last_origin: Option<[i64; 2]>,
1698 font_id: Option<String>,
1699 font_size_q: Option<i64>,
1700}
1701
1702#[derive(Default)]
1703struct GeometryRunBuilder {
1704 text: String,
1705 char_indices: Vec<i32>,
1706 char_box_union: Option<QRect>,
1707 loose_char_box_union: Option<QRect>,
1708 first_origin: Option<[i64; 2]>,
1709 last_origin: Option<[i64; 2]>,
1710 font_id: Option<String>,
1711 font_size_q: Option<i64>,
1712 font_flags: Option<u32>,
1713}
1714
/// Font details looked up for a single character; both fields are `None`
/// when nothing could be determined.
#[derive(Default)]
struct PdfFontInfo {
    /// Deterministic font id derived from the raw font name, when available.
    font_id: Option<String>,
    /// Font flags reported for the character, when representable as `u32`.
    font_flags: Option<u32>,
}
1720
1721#[derive(Debug, Deserialize)]
1722struct FontSubstitutionTable {
1723 schema_version: String,
1724 table_id: String,
1725 version: String,
1726 default_unresolved_font_id: String,
1727 mappings: Vec<FontSubstitutionMapping>,
1728}
1729
1730#[derive(Debug, Deserialize)]
1731struct FontSubstitutionMapping {
1732 source: String,
1733 font_id: String,
1734}
1735
1736impl SpanRun {
1737 fn has_style_change(&self, font_id: &Option<String>, font_size_q: Option<i64>) -> bool {
1738 !self.text.is_empty() && (self.font_id != *font_id || self.font_size_q != font_size_q)
1739 }
1740
1741 fn push(
1742 &mut self,
1743 ch: char,
1744 bbox: QRect,
1745 origin: Option<[i64; 2]>,
1746 font_id: Option<String>,
1747 font_size_q: Option<i64>,
1748 ) {
1749 self.text.push(ch);
1750 self.bbox = Some(match self.bbox {
1751 Some(existing) => union_rect(existing, bbox),
1752 None => bbox,
1753 });
1754 if self.first_origin.is_none() {
1755 self.first_origin = origin;
1756 }
1757 self.last_origin = origin;
1758 if self.font_id.is_none() {
1759 self.font_id = font_id;
1760 }
1761 if self.font_size_q.is_none() {
1762 self.font_size_q = font_size_q;
1763 }
1764 }
1765
1766 fn flush(
1767 &mut self,
1768 page: &Page,
1769 next_span: &mut u32,
1770 spans: &mut Vec<Span>,
1771 ) -> Result<(), EthosError> {
1772 if self.text.is_empty() {
1773 return Ok(());
1774 }
1775 let bbox = self
1776 .bbox
1777 .ok_or_else(|| EthosError::internal("span run has text without bbox"))?;
1778 let origin_locator = match (self.first_origin.take(), self.last_origin.take()) {
1779 (Some(first_origin), Some(last_origin)) => Some(SpanOriginLocator {
1780 policy: ORIGIN_LOCATOR_POLICY.to_string(),
1781 first_origin,
1782 last_origin,
1783 }),
1784 _ => None,
1785 };
1786 spans.push(Span {
1787 id: span_id(*next_span)?,
1788 page: page.id.clone(),
1789 bbox,
1790 origin_locator,
1791 text: std::mem::take(&mut self.text),
1792 font_id: self.font_id.take(),
1793 font_size_q: self.font_size_q,
1794 char_start: None,
1795 char_end: None,
1796 warning_refs: Vec::new(),
1797 });
1798 *next_span += 1;
1799 self.bbox = None;
1800 self.first_origin = None;
1801 self.last_origin = None;
1802 self.font_id = None;
1803 self.font_size_q = None;
1804 Ok(())
1805 }
1806}
1807
1808impl GeometryRunBuilder {
1809 fn has_style_change(
1810 &self,
1811 font_id: &Option<String>,
1812 font_size_q: Option<i64>,
1813 font_flags: Option<u32>,
1814 ) -> bool {
1815 !self.text.is_empty()
1816 && (self.font_id != *font_id
1817 || self.font_size_q != font_size_q
1818 || self.font_flags != font_flags)
1819 }
1820
1821 fn push(&mut self, ch: &GeometryProbeChar) {
1822 if let Some(text) = &ch.text {
1823 self.text.push_str(text);
1824 }
1825 self.char_indices.push(ch.index);
1826 self.char_box_union = union_option_rect(self.char_box_union, ch.char_box);
1827 self.loose_char_box_union = union_option_rect(self.loose_char_box_union, ch.loose_char_box);
1828 if self.first_origin.is_none() {
1829 self.first_origin = ch.char_origin;
1830 }
1831 self.last_origin = ch.char_origin;
1832 if self.font_id.is_none() {
1833 self.font_id = ch.font_id.clone();
1834 }
1835 if self.font_size_q.is_none() {
1836 self.font_size_q = ch.font_size_q;
1837 }
1838 if self.font_flags.is_none() {
1839 self.font_flags = ch.font_flags;
1840 }
1841 }
1842
1843 fn flush(
1844 &mut self,
1845 text_page: &PdfTextPage<'_>,
1846 page_height_pts: f64,
1847 next_run: &mut u32,
1848 runs: &mut Vec<GeometryProbeRun>,
1849 ) -> Result<(), EthosError> {
1850 if self.text.is_empty() {
1851 return Ok(());
1852 }
1853 let char_start = self.char_indices.first().copied().unwrap_or_default();
1854 let char_end = self
1855 .char_indices
1856 .last()
1857 .copied()
1858 .map(|index| index + 1)
1859 .unwrap_or(char_start);
1860 let text_rects =
1861 text_page.text_rects(char_start, char_end - char_start, page_height_pts)?;
1862 runs.push(GeometryProbeRun {
1863 index: *next_run,
1864 text: std::mem::take(&mut self.text),
1865 char_start,
1866 char_end,
1867 char_indices: std::mem::take(&mut self.char_indices),
1868 char_box_union: self.char_box_union.take(),
1869 loose_char_box_union: self.loose_char_box_union.take(),
1870 text_rect_union: union_rects(text_rects.iter().copied()),
1871 text_rects,
1872 first_origin: self.first_origin.take(),
1873 last_origin: self.last_origin.take(),
1874 font_id: self.font_id.take(),
1875 font_flags: self.font_flags.take(),
1876 font_size_q: self.font_size_q.take(),
1877 });
1878 *next_run += 1;
1879 self.font_size_q = None;
1880 self.font_flags = None;
1881 Ok(())
1882 }
1883}
1884
1885fn union_option_rect(existing: Option<QRect>, next: Option<QRect>) -> Option<QRect> {
1886 match (existing, next) {
1887 (Some(a), Some(b)) => Some(union_rect(a, b)),
1888 (Some(a), None) => Some(a),
1889 (None, Some(b)) => Some(b),
1890 (None, None) => None,
1891 }
1892}
1893
1894fn union_rects(mut rects: impl Iterator<Item = QRect>) -> Option<QRect> {
1895 let first = rects.next()?;
1896 Some(rects.fold(first, union_rect))
1897}
1898
1899fn deterministic_font_id(raw_name: &str) -> Option<String> {
1900 let raw_name = raw_name.trim();
1901 if raw_name.is_empty() {
1902 return None;
1903 }
1904 let (name, subset) = strip_subset_prefix(raw_name);
1905 if subset {
1906 if let Some(normalized) = normalize_font_name(name) {
1907 if is_safe_font_id_suffix(&normalized) {
1908 return Some(format!("embedded:{normalized}"));
1909 }
1910 }
1911 return Some(hashed_embedded_font_id(name));
1912 }
1913 let normalized = normalize_font_name(name)?;
1914 font_substitution(&normalized)
1915 .or_else(|| Some(font_substitution_table().default_unresolved_font_id.clone()))
1916}
1917
/// Splits off a subset tag of the form `ABCDEF+` (six ASCII uppercase
/// letters followed by `+`).
///
/// Returns the remainder and `true` when such a tag is present and followed
/// by at least one more byte; otherwise returns the input unchanged and
/// `false`.
fn strip_subset_prefix(name: &str) -> (&str, bool) {
    let Some((tag, remainder)) = name.split_once('+') else {
        return (name, false);
    };
    let is_subset_tag = tag.len() == 6 && tag.bytes().all(|byte| byte.is_ascii_uppercase());
    if is_subset_tag && !remainder.is_empty() {
        (remainder, true)
    } else {
        (name, false)
    }
}
1926
/// Normalizes a font name for use in ids.
///
/// Whitespace, control characters and the separators `/ \ : , ( ) [ ]` are
/// turned into `-`; consecutive dashes collapse into one; leading and
/// trailing dashes are removed. All other characters are kept unchanged.
/// Returns `None` when nothing remains.
fn normalize_font_name(name: &str) -> Option<String> {
    let mut normalized = String::new();
    for ch in name.trim().chars() {
        let is_separator = ch.is_whitespace()
            || ch.is_control()
            || matches!(ch, '/' | '\\' | ':' | ',' | '(' | ')' | '[' | ']');
        let mapped = if is_separator { '-' } else { ch };
        // Never emit two dashes in a row, whether literal or mapped.
        if mapped == '-' && normalized.ends_with('-') {
            continue;
        }
        normalized.push(mapped);
    }
    let trimmed = normalized.trim_matches('-');
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_owned())
    }
}
1954
/// Whether `name` is non-empty and consists solely of ASCII letters, ASCII
/// digits, `-`, `_` and `.`.
fn is_safe_font_id_suffix(name: &str) -> bool {
    if name.is_empty() {
        return false;
    }
    !name
        .bytes()
        .any(|byte| !(byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'_' | b'.')))
}
1961
1962fn hashed_embedded_font_id(name: &str) -> String {
1963 format!(
1964 "embedded:sha256-{}",
1965 ethos_core::c14n::sha256_hex_bytes(name.as_bytes())
1966 )
1967}
1968
1969fn font_substitution(name: &str) -> Option<String> {
1970 font_substitution_table()
1971 .mappings
1972 .iter()
1973 .find(|mapping| mapping.source == name)
1974 .map(|mapping| mapping.font_id.clone())
1975}
1976
1977fn font_substitution_table() -> &'static FontSubstitutionTable {
1978 FONT_SUBSTITUTION_TABLE.get_or_init(|| {
1979 let table: FontSubstitutionTable = serde_json::from_str(FONT_SUBSTITUTION_TABLE_JSON)
1980 .expect("bundled font-substitution-table.json is valid JSON");
1981 validate_font_substitution_table(&table)
1982 .expect("bundled font-substitution-table.json is internally valid");
1983 table
1984 })
1985}
1986
1987fn validate_font_substitution_table(table: &FontSubstitutionTable) -> Result<(), &'static str> {
1988 if table.schema_version != "1.0.0"
1989 || table.table_id != "ethos-font-substitution-v1"
1990 || table.version != "1.0.0"
1991 || table.default_unresolved_font_id != "subst:liberation-sans-regular"
1992 {
1993 return Err("unexpected font substitution table metadata");
1994 }
1995
1996 let mut seen = HashSet::new();
1997 for mapping in &table.mappings {
1998 if mapping.source.is_empty() || !mapping.font_id.starts_with("subst:") {
1999 return Err("malformed font substitution mapping");
2000 }
2001 if !seen.insert(mapping.source.as_str()) {
2002 return Err("duplicate font substitution mapping source");
2003 }
2004 }
2005
2006 Ok(())
2007}
2008
2009#[cfg(unix)]
2010mod dylib {
2011 use super::*;
2012 use std::os::unix::ffi::OsStrExt;
2013
2014 const RTLD_NOW: c_int = 2;
2015
2016 unsafe extern "C" {
2017 fn dlopen(filename: *const c_char, flag: c_int) -> *mut c_void;
2018 fn dlsym(handle: *mut c_void, symbol: *const c_char) -> *mut c_void;
2019 fn dlclose(handle: *mut c_void) -> c_int;
2020 }
2021
2022 pub(super) struct Library {
2023 handle: *mut c_void,
2024 }
2025
2026 impl Library {
2027 pub(super) fn open(path: &Path) -> Result<Self, EthosError> {
2028 let c_path = CString::new(path.as_os_str().as_bytes()).map_err(|_| {
2029 EthosError::internal("pdfium library path contains an interior NUL byte")
2030 })?;
2031 let handle = unsafe { dlopen(c_path.as_ptr(), RTLD_NOW) };
2033 if handle.is_null() {
2034 Err(EthosError::internal(
2035 "failed to load configured pdfium library",
2036 ))
2037 } else {
2038 Ok(Library { handle })
2039 }
2040 }
2041
2042 pub(super) unsafe fn symbol<T: Copy>(&self, name: &'static [u8]) -> Result<T, EthosError> {
2043 let ptr = self.symbol_ptr(name);
2044 if ptr.is_null() {
2045 return Err(EthosError::internal(format!(
2046 "pdfium library is missing symbol {}",
2047 symbol_name(name)
2048 )));
2049 }
2050 assert_symbol_pointer_size::<T>();
2051 Ok(unsafe { std::mem::transmute_copy::<*mut c_void, T>(&ptr) })
2053 }
2054
2055 pub(super) unsafe fn optional_symbol<T: Copy>(&self, name: &'static [u8]) -> Option<T> {
2056 let ptr = self.symbol_ptr(name);
2057 if ptr.is_null() {
2058 None
2059 } else {
2060 assert_symbol_pointer_size::<T>();
2061 Some(unsafe { std::mem::transmute_copy::<*mut c_void, T>(&ptr) })
2063 }
2064 }
2065
2066 fn symbol_ptr(&self, name: &'static [u8]) -> *mut c_void {
2067 unsafe { dlsym(self.handle, name.as_ptr().cast()) }
2069 }
2070 }
2071
2072 impl Drop for Library {
2073 fn drop(&mut self) {
2074 if !self.handle.is_null() {
2075 unsafe {
2077 let _ = dlclose(self.handle);
2078 }
2079 }
2080 }
2081 }
2082}
2083
/// Minimal dynamic-library loader built directly on the Win32 loader API.
#[cfg(windows)]
mod dylib {
    use super::*;
    use std::ffi::CStr;
    use std::os::windows::ffi::OsStrExt;

    unsafe extern "system" {
        fn LoadLibraryW(lp_lib_file_name: *const u16) -> *mut c_void;
        fn GetProcAddress(h_module: *mut c_void, lp_proc_name: *const c_char) -> *mut c_void;
        fn FreeLibrary(h_lib_module: *mut c_void) -> c_int;
    }

    /// An open dynamic library; freed again when dropped.
    pub(super) struct Library {
        handle: *mut c_void,
    }

    impl Library {
        /// Opens the library at `path`.
        ///
        /// # Errors
        ///
        /// Returns an internal error when the path contains an interior NUL
        /// code unit or the library cannot be loaded. The error message
        /// never includes the path itself.
        pub(super) fn open(path: &Path) -> Result<Self, EthosError> {
            let mut wide_path: Vec<u16> = path.as_os_str().encode_wide().collect();
            if wide_path.contains(&0) {
                return Err(EthosError::internal(
                    "pdfium library path contains an interior NUL code unit",
                ));
            }
            wide_path.push(0);
            // SAFETY: `wide_path` is a NUL-terminated UTF-16 string that
            // outlives the call.
            let handle = unsafe { LoadLibraryW(wide_path.as_ptr()) };
            if handle.is_null() {
                Err(EthosError::internal(
                    "failed to load configured pdfium library",
                ))
            } else {
                Ok(Library { handle })
            }
        }

        /// Resolves a required symbol and reinterprets it as `T`.
        ///
        /// `name` must be a NUL-terminated symbol name; a name that is not
        /// is treated as a missing symbol.
        ///
        /// # Safety
        ///
        /// `T` must be a pointer-sized type whose signature matches the
        /// actual exported symbol.
        ///
        /// # Errors
        ///
        /// Returns an internal error naming the symbol when it cannot be
        /// resolved.
        pub(super) unsafe fn symbol<T: Copy>(&self, name: &'static [u8]) -> Result<T, EthosError> {
            // SAFETY: the caller upholds the contract of `optional_symbol`,
            // which is identical to this function's contract.
            let resolved = unsafe { self.optional_symbol::<T>(name) };
            resolved.ok_or_else(|| {
                EthosError::internal(format!(
                    "pdfium library is missing symbol {}",
                    symbol_name(name)
                ))
            })
        }

        /// Resolves an optional symbol and reinterprets it as `T`, returning
        /// `None` when the library does not export it.
        ///
        /// # Safety
        ///
        /// `T` must be a pointer-sized type whose signature matches the
        /// actual exported symbol.
        pub(super) unsafe fn optional_symbol<T: Copy>(&self, name: &'static [u8]) -> Option<T> {
            let ptr = self.symbol_ptr(name);
            if ptr.is_null() {
                None
            } else {
                assert_symbol_pointer_size::<T>();
                // SAFETY: `T` is pointer-sized (checked at compile time just
                // above) and the caller guarantees it matches the symbol.
                Some(unsafe { std::mem::transmute_copy::<*mut c_void, T>(&ptr) })
            }
        }

        /// Looks up the raw address of `name`, or null when it is absent.
        ///
        /// The name is validated as a C string first: `GetProcAddress` reads
        /// until a NUL byte, so a slice without a terminator must never
        /// reach it.
        fn symbol_ptr(&self, name: &'static [u8]) -> *mut c_void {
            let Ok(c_name) = CStr::from_bytes_with_nul(name) else {
                return std::ptr::null_mut();
            };
            // SAFETY: `handle` came from a successful `LoadLibraryW` and
            // `c_name` is a valid NUL-terminated string.
            unsafe { GetProcAddress(self.handle, c_name.as_ptr()) }
        }
    }

    impl Drop for Library {
        /// Frees the library handle; a failure to free is ignored.
        fn drop(&mut self) {
            if !self.handle.is_null() {
                // SAFETY: `handle` came from `LoadLibraryW` and is freed only
                // here.
                unsafe {
                    let _ = FreeLibrary(self.handle);
                }
            }
        }
    }
}
2160
/// Compile-time check that `T` has exactly the size of a raw pointer, so a
/// resolved symbol address can be reinterpreted as `T`.
fn assert_symbol_pointer_size<T>() {
    const {
        let symbol_size = std::mem::size_of::<T>();
        let pointer_size = std::mem::size_of::<*mut c_void>();
        assert!(
            symbol_size == pointer_size,
            "pdfium symbol pointer size mismatch"
        );
    }
}
2169
/// Renders a symbol name for error messages: drops one trailing NUL byte if
/// present and decodes the rest as UTF-8, replacing invalid sequences.
fn symbol_name(name: &'static [u8]) -> String {
    let printable = match name.split_last() {
        Some((&0, without_nul)) => without_nul,
        _ => name,
    };
    String::from_utf8_lossy(printable).into_owned()
}
2174
2175#[cfg(test)]
2176mod tests {
2177 use super::*;
2178
2179 #[test]
2180 fn invalid_pdf_fails_before_library_load() {
2181 let err = PdfiumBackend::default()
2182 .page_count(b"not a pdf")
2183 .unwrap_err();
2184 assert_eq!(err.code, ErrorCode::InvalidPdf);
2185 }
2186
2187 #[test]
2188 fn text_run_breaks_on_pdfium_control_characters() {
2189 assert!(should_break_text_run('\0'));
2190 assert!(should_break_text_run('\n'));
2191 assert!(should_break_text_run('\u{0002}'));
2192 assert!(!should_break_text_run('-'));
2193 assert!(!should_break_text_run('A'));
2194 }
2195
2196 #[test]
2197 fn missing_library_path_is_stable_error_for_pdf_input() {
2198 let backend = PdfiumBackend::default();
2199 if env::var_os(PDFIUM_LIBRARY_PATH_ENV).is_some() {
2200 return;
2201 }
2202 let err = backend.page_count(b"%PDF-1.7\n").unwrap_err();
2203 assert_eq!(err.code, ErrorCode::InternalError);
2204 assert!(err.message.contains(PDFIUM_LIBRARY_PATH_ENV));
2205 }
2206
/// Requesting page `0` from `render_crop_raw` on a default backend fails with
/// `PageLimitExceeded` and a fixed message.
#[test]
fn render_crop_raw_rejects_zero_page_before_library_load() {
    let bbox = QRect::new(0, 0, 100, 100).unwrap();
    let backend = PdfiumBackend::default();
    let err = backend.render_crop_raw(b"%PDF-1.7\n", 0, bbox).unwrap_err();
    assert_eq!(err.message, "page selection out of document range");
    assert_eq!(err.code, ErrorCode::PageLimitExceeded);
}
2215
/// Pins `crop_window` results for three bounding boxes.
///
/// NOTE(review): the two integer arguments are presumably the page size in
/// pixels — confirm against `crop_window`.
#[test]
fn crop_window_uses_outward_quantized_pixel_bounds() {
    // A bbox that yields a strict sub-window of the 300x144 arguments.
    let interior = QRect::new(7392, 5482, 19378, 7226).unwrap();
    let interior_window = crop_window(interior, 300, 144).unwrap();
    assert_eq!(interior_window, (73, 54, 121, 19));

    // A bbox with negative starts and large ends yields the full 300x144 window.
    let overhanging = QRect::new(-50, -50, 30100, 14500).unwrap();
    let clamped_window = crop_window(overhanging, 300, 144).unwrap();
    assert_eq!(clamped_window, (0, 0, 300, 144));

    // A bbox for which no positive pixel extent remains is an error.
    let outside = QRect::new(100, 100, 101, 101).unwrap();
    let err = crop_window(outside, 1, 1).unwrap_err();
    assert_eq!(err.code, ErrorCode::InternalError);
    assert_eq!(err.message, "crop bbox has no positive pixel extent");
}
2231
/// Renders the same crop twice and checks that both results are equal and
/// match the pinned dimensions, stride, pixel format and hash.
///
/// The test returns early unless the library-path environment variable points
/// at an existing file.
#[test]
fn render_crop_raw_is_deterministic_when_pdfium_is_configured() {
    let configured_library = env::var_os(PDFIUM_LIBRARY_PATH_ENV)
        .map(PathBuf::from)
        .filter(|library| library.is_file());
    if configured_library.is_none() {
        return;
    }

    let fixture_path = Path::new(env!("CARGO_MANIFEST_DIR"))
        .join("../../fixtures/synthetic/simple-text/document.pdf");
    let document = std::fs::read(fixture_path).unwrap();
    let region = QRect::new(7392, 5482, 19378, 7226).unwrap();
    let backend = PdfiumBackend::default();

    let first = backend.render_crop_raw(&document, 1, region).unwrap();
    let second = backend.render_crop_raw(&document, 1, region).unwrap();

    // Two renders of the same region must be equal.
    assert_eq!(first, second);

    // Echoed request parameters and pinned pixel geometry.
    assert_eq!(first.page_index, 1);
    assert_eq!(first.bbox, region);
    assert_eq!(first.width_px, 121);
    assert_eq!(first.height_px, 19);
    assert_eq!(first.stride, first.width_px * 4);
    assert_eq!(first.pixel_format, "bgra_8u");

    // The buffer length is stride * height and the hash covers those bytes.
    let expected_len = usize::try_from(first.stride * first.height_px).unwrap();
    assert_eq!(first.bytes.len(), expected_len);
    let expected_sha256 = ethos_core::c14n::sha256_hex_bytes(&first.bytes);
    assert_eq!(first.sha256, expected_sha256);

    // At least one 4-byte pixel must differ from all-255.
    let all_white = first
        .bytes
        .chunks_exact(4)
        .all(|pixel| pixel == [255, 255, 255, 255]);
    assert!(!all_white);
}
2270
/// A configured library path that is not a file yields a fixed
/// `InternalError` message that does not contain the configured path.
#[test]
fn invalid_configured_library_path_does_not_leak_host_path() {
    // The file name deliberately contains a newline control character.
    let missing = env::temp_dir().join("ethos-missing-libpdfium\nwith-control.dylib");
    let err = PdfiumBackend::from_library_path(&missing)
        .page_count(b"%PDF-1.7\n")
        .unwrap_err();
    assert_eq!(err.code, ErrorCode::InternalError);
    assert_eq!(err.message, "pdfium library path does not point to a file");
    let host_path = missing.to_string_lossy();
    assert!(!err.message.contains(&*host_path));
}
2280
/// An explicitly configured backend reports the caller-supplied version and a
/// `platform_sha256` equal to the SHA-256 of the library file's bytes.
#[test]
fn explicit_manifest_hashes_library_bytes() {
    let path = env::temp_dir().join("ethos-test-libpdfium-hash.bin");
    std::fs::write(&path, b"pdfium bytes").unwrap();
    let backend = PdfiumBackend::from_library_path(&path).with_version("test-version");
    let manifest = backend.manifest();
    // Remove the scratch file before asserting, so a failed assertion cannot
    // leave it behind in the temp directory. Removal is best-effort.
    let _ = std::fs::remove_file(&path);

    assert_eq!(manifest.id, "pdfium");
    assert_eq!(manifest.phase, 1);
    assert_eq!(manifest.version, "test-version");
    assert_eq!(
        manifest.platform_sha256,
        ethos_core::c14n::sha256_hex_bytes(b"pdfium bytes")
    );
}
2296
/// Pins the phase 1 PDFium profile: identity, version strings, V8/XFA
/// settings, distribution attestation, build flags, and the per-platform
/// archive and runtime-library hashes.
#[test]
fn phase1_pdfium_profile_is_pinned_and_v8_xfa_disabled() {
    /// Expected pinned values for one platform entry of the profile.
    struct PinnedArtifact {
        platform: &'static str,
        archive_name: &'static str,
        archive_sha256: &'static str,
        runtime_library_path: &'static str,
        runtime_library_sha256: &'static str,
    }

    let profile = pinned_pdfium_profile();

    // Identity and version strings.
    assert_eq!(profile.id, "pdfium");
    assert_eq!(profile.phase, 1);
    assert_eq!(profile.version, "chromium/7881");
    assert_eq!(profile.upstream_version, "PDFium 151.0.7881.0");

    // V8 and XFA are disabled in both the summary fields and the build flags.
    assert_eq!(profile.v8, "disabled");
    assert_eq!(profile.xfa, "disabled");
    assert!(!profile.build_flags.pdf_enable_v8);
    assert!(!profile.build_flags.pdf_enable_xfa);
    assert!(profile.build_flags.pdf_is_standalone);

    // Distribution source and attestation hash.
    assert_eq!(profile.distribution.source, "bblanchon/pdfium-binaries");
    assert_eq!(
        profile.distribution.attestation.sha256,
        "24dec7cd76acb81106a0c29b908cceceef8215b050f6ff6ffbf875465811ef60"
    );

    let pinned_artifacts = [
        PinnedArtifact {
            platform: "macos-arm64",
            archive_name: "pdfium-mac-arm64.tgz",
            archive_sha256: "52e94ca5aa8847934330daf3f8150c190682c5ca93831468794f8b90d4392e40",
            runtime_library_path: "lib/libpdfium.dylib",
            runtime_library_sha256:
                "1bc45b15466b34cef96641ce25c77a876e70010c6b114f909dda2f5325fc5bd7",
        },
        PinnedArtifact {
            platform: "linux-x64",
            archive_name: "pdfium-linux-x64.tgz",
            archive_sha256: "1470e21b8b4a3b4ad7f85684e2da11d94f3b69a86d81dee11b9b6709d927ac1d",
            runtime_library_path: "lib/libpdfium.so",
            runtime_library_sha256:
                "f728930966f503652b92acc89b9374a2eeca00ce42e26dccd3e4b5c5161b2d64",
        },
        PinnedArtifact {
            platform: "windows-x64",
            archive_name: "pdfium-win-x64.tgz",
            archive_sha256: "73cc0de638ac2095e7445bf56a38200a5b7c7ca0e9f4ba144598f2457377ac08",
            runtime_library_path: "bin/pdfium.dll",
            runtime_library_sha256:
                "79d4676b656cfb1abcea88f9ade3b4b0826c5200382db5f4ec72a636c598c118",
        },
    ];

    for pinned in pinned_artifacts {
        assert_eq!(profile.platform_hashes[pinned.platform], pinned.archive_sha256);
        let artifact = &profile.platform_artifacts[pinned.platform];
        assert_eq!(artifact.name, pinned.archive_name);
        // Archive names must not contain the "-v8-" or "xfa" markers.
        assert!(!artifact.name.contains("-v8-"));
        assert!(!artifact.name.contains("xfa"));
        assert_eq!(artifact.runtime_library_path, pinned.runtime_library_path);
        assert_eq!(artifact.runtime_library_sha256, pinned.runtime_library_sha256);
    }
}
2348
/// A backend configured with a version other than the pinned one fails with a
/// fixed `InternalError` message.
///
/// Skipped when `current_platform_key()` returns `None`.
#[test]
fn mismatched_pdfium_version_is_rejected_before_library_load() {
    if current_platform_key().is_none() {
        return;
    }
    let path = env::temp_dir().join("ethos-test-libpdfium-version-mismatch.bin");
    std::fs::write(&path, b"not the pinned pdfium library").unwrap();
    let backend = PdfiumBackend::from_library_path(&path).with_version("chromium/7869");
    let outcome = backend.page_count(b"%PDF-1.7\n");
    // Remove the scratch file before any check that can panic, so a failing
    // test does not leave it behind in the temp directory. Best-effort.
    let _ = std::fs::remove_file(&path);

    let err = outcome.unwrap_err();
    assert_eq!(err.code, ErrorCode::InternalError);
    assert_eq!(
        err.message,
        "pdfium version does not match pinned phase 1 profile"
    );
}
2365
/// With the version string `"PDFium 151.0.7881.0"`, the reported error is the
/// library-mismatch message rather than a version error.
///
/// Skipped when `current_platform_key()` returns `None`.
#[test]
fn pinned_upstream_pdfium_version_alias_is_accepted() {
    if current_platform_key().is_none() {
        return;
    }
    let path = env::temp_dir().join("ethos-test-libpdfium-upstream-version.bin");
    std::fs::write(&path, b"not the pinned pdfium library").unwrap();
    let backend = PdfiumBackend::from_library_path(&path).with_version("PDFium 151.0.7881.0");
    let outcome = backend.page_count(b"%PDF-1.7\n");
    // Remove the scratch file before any check that can panic, so a failing
    // test does not leave it behind in the temp directory. Best-effort.
    let _ = std::fs::remove_file(&path);

    let err = outcome.unwrap_err();
    assert_eq!(err.code, ErrorCode::InternalError);
    assert_eq!(
        err.message,
        "pdfium library does not match pinned phase 1 profile"
    );
}
2382
/// A backend whose configured artifact file does not match the pinned profile
/// fails with a fixed `InternalError` message.
///
/// Skipped when `current_platform_key()` returns `None`.
#[test]
fn mismatched_pdfium_artifact_is_rejected_with_stable_error() {
    if current_platform_key().is_none() {
        return;
    }
    let library_path = env::temp_dir().join("ethos-test-libpdfium-artifact-mismatch.bin");
    let artifact_path = env::temp_dir().join("ethos-test-pdfium-artifact-mismatch.tgz");
    std::fs::write(&library_path, b"not the pinned pdfium library").unwrap();
    std::fs::write(&artifact_path, b"not the pinned pdfium artifact").unwrap();
    let backend = PdfiumBackend::from_library_path(&library_path)
        .with_version("chromium/7881")
        .with_artifact_path(&artifact_path);
    let outcome = backend.page_count(b"%PDF-1.7\n");
    // Remove both scratch files before any check that can panic, so a failing
    // test does not leave them behind in the temp directory. Best-effort.
    let _ = std::fs::remove_file(&library_path);
    let _ = std::fs::remove_file(&artifact_path);

    let err = outcome.unwrap_err();
    assert_eq!(err.code, ErrorCode::InternalError);
    assert_eq!(
        err.message,
        "pdfium artifact does not match pinned phase 1 profile"
    );
}
2404
/// A library file whose bytes do not match the pinned profile fails with a
/// fixed `InternalError` message, even when the version string is
/// `"chromium/7881"`.
///
/// Skipped when `current_platform_key()` returns `None`.
#[test]
fn mismatched_pdfium_library_is_rejected_before_dynamic_load() {
    if current_platform_key().is_none() {
        return;
    }
    let path = env::temp_dir().join("ethos-test-libpdfium-library-mismatch.bin");
    std::fs::write(&path, b"not the pinned pdfium library").unwrap();
    let backend = PdfiumBackend::from_library_path(&path).with_version("chromium/7881");
    let outcome = backend.page_count(b"%PDF-1.7\n");
    // Remove the scratch file before any check that can panic, so a failing
    // test does not leave it behind in the temp directory. Best-effort.
    let _ = std::fs::remove_file(&path);

    let err = outcome.unwrap_err();
    assert_eq!(err.code, ErrorCode::InternalError);
    assert_eq!(
        err.message,
        "pdfium library does not match pinned phase 1 profile"
    );
}
2421
/// Pins `deterministic_font_id` for a subset-prefixed embedded font, several
/// standard font names, an unknown name, and a blank name.
#[test]
fn deterministic_font_ids_strip_subset_prefixes() {
    // (font name as given, expected deterministic id)
    let cases = [
        ("ABCDEF+MinionPro-Regular", "embedded:MinionPro-Regular"),
        ("Helvetica-Bold", "subst:liberation-sans-bold"),
        ("Helvetica", "subst:liberation-sans-regular"),
        ("Helvetica-Oblique", "subst:liberation-sans-italic"),
        ("Helvetica-BoldOblique", "subst:liberation-sans-bold-italic"),
        ("Courier", "subst:liberation-mono-regular"),
        ("Times-Roman", "subst:liberation-serif-regular"),
        ("Custom Font/Regular", "subst:liberation-sans-regular"),
    ];
    for (font_name, expected_id) in cases {
        let actual = deterministic_font_id(font_name);
        assert_eq!(actual.as_deref(), Some(expected_id));
    }

    // A whitespace-only name has no id at all.
    assert_eq!(deterministic_font_id(" "), None);
}
2458
/// Subset-prefixed names whose remainder is non-ASCII, contains `+`, or is
/// only separators get the id from `hashed_embedded_font_id`, which is ASCII.
/// A non-ASCII name without a subset prefix maps to the default substitution.
#[test]
fn deterministic_font_ids_keep_embedded_ids_ascii_only() {
    // (subset-prefixed font name, the part passed to `hashed_embedded_font_id`)
    let hashed_cases = [
        ("ABCDEF+明朝", "明朝"),
        ("ABCDEF+Fixture+Font", "Fixture+Font"),
        ("ABCDEF+///", "///"),
    ];
    for (font_name, hashed_part) in hashed_cases {
        let font_id = deterministic_font_id(font_name).unwrap();
        assert_eq!(font_id, hashed_embedded_font_id(hashed_part));
        assert!(font_id.is_ascii());
    }

    let unprefixed = deterministic_font_id("明朝");
    assert_eq!(unprefixed.as_deref(), Some("subst:liberation-sans-regular"));
}
2478
/// Checks the font substitution table's header fields, that every mapping
/// has a non-empty, unique source and a `subst:` id, and that there are
/// exactly 14 mappings.
#[test]
fn font_substitution_table_is_well_formed() {
    use std::collections::HashSet;

    let table = font_substitution_table();

    // Header fields.
    assert_eq!(table.schema_version, "1.0.0");
    assert_eq!(table.table_id, "ethos-font-substitution-v1");
    assert_eq!(table.version, "1.0.0");
    assert_eq!(
        table.default_unresolved_font_id,
        "subst:liberation-sans-regular"
    );

    // Per-mapping invariants; `sources` tracks names already encountered.
    let mut sources = HashSet::new();
    for entry in &table.mappings {
        assert!(!entry.source.is_empty());
        assert!(entry.font_id.starts_with("subst:"));
        let first_occurrence = sources.insert(entry.source.as_str());
        assert!(
            first_occurrence,
            "duplicate font substitution source {}",
            entry.source
        );
    }

    assert_eq!(table.mappings.len(), 14);
}
2504
/// The deterministic profile JSON must record the substitution table's path
/// and the SHA-256 of the table bytes embedded in this crate.
#[test]
fn profile_pins_font_substitution_table_bytes() {
    const FONT_SUBSTITUTION_TABLE_PATH: &str =
        "crates/ethos-pdf/assets/font-substitution-table.json";
    // Profile JSON read at compile time, relative to this crate's manifest directory.
    const PROFILE_JSON: &str = include_str!(concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../profiles/ethos-deterministic-v1.json"
    ));

    let profile: serde_json::Value = serde_json::from_str(PROFILE_JSON).unwrap();
    let pin = &profile["font_policy"]["substitution_table"];
    let embedded_table_sha256 =
        ethos_core::c14n::sha256_hex_bytes(FONT_SUBSTITUTION_TABLE_JSON.as_bytes());

    assert_eq!(pin["path"], FONT_SUBSTITUTION_TABLE_PATH);
    assert_eq!(pin["sha256"], embedded_table_sha256);
}
2521}