1#![deny(unsafe_op_in_unsafe_fn)]
28#![warn(missing_docs)]
29
30use std::collections::{BTreeMap, HashSet};
31use std::env;
32use std::ffi::{c_char, c_int, c_ulong, c_void, CString};
33use std::path::{Path, PathBuf};
34use std::ptr;
35use std::slice;
36use std::sync::{Mutex, OnceLock};
37
38use ethos_core::codes::WarningCode;
39use ethos_core::config::{PageSelection, ParseConfig};
40use ethos_core::error::{ErrorCode, EthosError};
41use ethos_core::geom::{quantize, QRect};
42use ethos_core::ids::{page_id, span_id, warning_id};
43use ethos_core::model::{Page, Span, SpanOriginLocator, Warning};
44use ethos_core::traits::{BackendManifest, EthosPdfBackend, Extraction};
45use serde::{Deserialize, Serialize};
46
/// Environment variable consulted for the PDFium dynamic library path when
/// the backend was not given one explicitly.
pub const PDFIUM_LIBRARY_PATH_ENV: &str = "ETHOS_PDFIUM_LIBRARY_PATH";

/// Environment variable consulted for a PDFium version override when the
/// backend was not given one explicitly.
pub const PDFIUM_VERSION_ENV: &str = "ETHOS_PDFIUM_VERSION";

/// Environment variable consulted for the PDFium release artifact path when
/// the backend was not given one explicitly.
pub const PDFIUM_ARTIFACT_PATH_ENV: &str = "ETHOS_PDFIUM_ARTIFACT_PATH";

// Appended to the setup-related error messages produced while loading the
// PDFium runtime.
const PDFIUM_SETUP_GUIDANCE: &str =
    "Run ethos doctor for setup diagnostics, run ethos doctor --require-pdfium after setting it, and see docs/pdfium-manual-setup.md.";

/// Scale passed to `quantize` for every coordinate in this backend; quantized
/// values are divided by it again to obtain whole pixel positions.
pub const QUANTUM_PER_POINT: u32 = 100;
// NOTE(review): not referenced in the visible part of this file; presumably
// names the policy recorded with span origin locators - confirm at the use site.
const ORIGIN_LOCATOR_POLICY: &str = "origin-run-locator-v1";

// Embedded deterministic profile; parsed lazily by `pinned_pdfium_profile`.
const DETERMINISTIC_PROFILE_JSON: &str = include_str!("../assets/ethos-deterministic-v1.json");
// Embedded font substitution table. NOTE(review): its consumer is outside the
// visible part of this file.
const FONT_SUBSTITUTION_TABLE_JSON: &str = include_str!("../assets/font-substitution-table.json");

// Taken by every backend entry point before the PDFium library is loaded, so
// that only one PDFium session is active at a time in this process.
static PDFIUM_LOCK: Mutex<()> = Mutex::new(());
// Cache for the parsed and validated `backend` section of the profile.
static PINNED_PDFIUM_PROFILE: OnceLock<PinnedPdfiumBackend> = OnceLock::new();
// Cache for the font substitution table. NOTE(review): the initialiser is
// outside the visible part of this file.
static FONT_SUBSTITUTION_TABLE: OnceLock<FontSubstitutionTable> = OnceLock::new();
70
/// PDF backend that drives a dynamically loaded PDFium library.
///
/// Every setting is optional. Unset settings are resolved from the
/// `ETHOS_PDFIUM_*` environment variables when they are needed.
#[derive(Debug, Clone, Default)]
pub struct PdfiumBackend {
    // Explicit library path; falls back to `PDFIUM_LIBRARY_PATH_ENV`.
    library_path: Option<PathBuf>,
    // Explicit release artifact path; falls back to `PDFIUM_ARTIFACT_PATH_ENV`.
    artifact_path: Option<PathBuf>,
    // Explicit version; falls back to `PDFIUM_VERSION_ENV`, then to the
    // version pinned in the deterministic profile.
    version: Option<String>,
}
78
/// Report returned by [`PdfiumBackend::geometry_probe`].
#[derive(Debug, Serialize)]
pub struct GeometryProbeReport {
    /// Schema identifier; `geometry_probe` sets it to
    /// `"ethos-pdfium-geometry-probe-v1"`.
    pub schema_version: String,
    /// Coordinate scale; `geometry_probe` sets it to [`QUANTUM_PER_POINT`].
    pub quantum_per_point: u32,
    /// Manifest of the backend that produced the report.
    pub backend: BackendManifest,
    /// One entry per selected page, in document order.
    pub pages: Vec<GeometryProbePage>,
}
95
/// Per-page entry of a [`GeometryProbeReport`].
///
/// NOTE(review): instances are built by `geometry_probe_page`, which lies
/// outside the visible part of this file. The field notes below are inferred
/// from names and types and should be confirmed against that code.
#[derive(Debug, Serialize)]
pub struct GeometryProbePage {
    /// Page identifier.
    pub id: String,
    /// Page number (presumably the 1-based number passed in by
    /// `geometry_probe` - confirm).
    pub index: u32,
    /// Page width (presumably quantized - confirm).
    pub width: i64,
    /// Page height (presumably quantized - confirm).
    pub height: i64,
    /// Page rotation (presumably in degrees - confirm).
    pub rotation: u16,
    /// Character count reported for the page.
    pub char_count: i32,
    /// Which optional PDFium text symbols were available.
    pub symbols: GeometryProbeSymbols,
    /// Per-character records.
    pub chars: Vec<GeometryProbeChar>,
    /// Per-run records.
    pub runs: Vec<GeometryProbeRun>,
}
118
/// Availability of the optional PDFium text-geometry symbols, as reported by
/// `PdfiumFunctions::geometry_probe_symbols`.
#[derive(Debug, Serialize)]
pub struct GeometryProbeSymbols {
    /// `true` when `FPDFText_GetCharOrigin` was resolved.
    pub char_origin: bool,
    /// `true` when `FPDFText_GetLooseCharBox` was resolved.
    pub loose_char_box: bool,
    /// `true` when both `FPDFText_CountRects` and `FPDFText_GetRect` were
    /// resolved.
    pub text_rects: bool,
}
129
/// Per-character record of a [`GeometryProbePage`].
///
/// NOTE(review): populated outside the visible part of this file; the field
/// notes below are inferred from names and types - confirm against the
/// producer.
#[derive(Debug, Serialize)]
pub struct GeometryProbeChar {
    /// Character index within the page's text.
    pub index: i32,
    /// Unicode value reported for the character.
    pub unicode: u32,
    /// Character as text, when available.
    pub text: Option<String>,
    /// Label describing how the parser treats this character (the set of
    /// values is defined by the producer).
    pub parser_action: String,
    /// Character box, when available.
    pub char_box: Option<QRect>,
    /// Loose character box, when available.
    pub loose_char_box: Option<QRect>,
    /// Character origin as a two-element array (presumably `[x, y]`,
    /// quantized - confirm), when available.
    pub char_origin: Option<[i64; 2]>,
    /// Font identifier, when available.
    pub font_id: Option<String>,
    /// Font flags, when available.
    pub font_flags: Option<u32>,
    /// Font size (presumably quantized - confirm), when available.
    pub font_size_q: Option<i64>,
}
154
/// Per-run record of a [`GeometryProbePage`].
///
/// NOTE(review): populated outside the visible part of this file; the field
/// notes below are inferred from names and types - confirm against the
/// producer (in particular whether `char_end` is inclusive).
#[derive(Debug, Serialize)]
pub struct GeometryProbeRun {
    /// Run index within the page.
    pub index: u32,
    /// Text of the run.
    pub text: String,
    /// Index of the first character of the run.
    pub char_start: i32,
    /// Index marking the end of the run.
    pub char_end: i32,
    /// Indices of the characters that belong to the run.
    pub char_indices: Vec<i32>,
    /// Union of the run's character boxes, when available.
    pub char_box_union: Option<QRect>,
    /// Union of the run's loose character boxes, when available.
    pub loose_char_box_union: Option<QRect>,
    /// Text rectangles reported for the run.
    pub text_rects: Vec<QRect>,
    /// Union of `text_rects`, when available.
    pub text_rect_union: Option<QRect>,
    /// Origin of the first character, when available.
    pub first_origin: Option<[i64; 2]>,
    /// Origin of the last character, when available.
    pub last_origin: Option<[i64; 2]>,
    /// Font identifier, when available.
    pub font_id: Option<String>,
    /// Font flags, when available.
    pub font_flags: Option<u32>,
    /// Font size (presumably quantized - confirm), when available.
    pub font_size_q: Option<i64>,
}
187
/// Raw bitmap crop returned by [`PdfiumBackend::render_crop_raw`].
///
/// NOTE(review): the value is assembled by `PdfPage::render_crop_raw`, which
/// is outside the visible part of this file; the field notes below are
/// inferred from names and types - confirm against that code.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RawCrop {
    /// Page number the crop was taken from (`render_crop_raw` passes its
    /// 1-based `page_index` through).
    pub page_index: u32,
    /// Bounding box associated with the crop.
    pub bbox: QRect,
    /// Crop width in pixels.
    pub width_px: u32,
    /// Crop height in pixels.
    pub height_px: u32,
    /// Row stride of `bytes` (presumably in bytes - confirm).
    pub stride: u32,
    /// Name of the pixel format of `bytes`.
    pub pixel_format: &'static str,
    /// SHA-256 digest as a string (presumably of `bytes` - confirm).
    pub sha256: String,
    /// Raw pixel data.
    pub bytes: Vec<u8>,
}
212
213impl PdfiumBackend {
214 pub fn from_library_path(path: impl Into<PathBuf>) -> Self {
216 PdfiumBackend {
217 library_path: Some(path.into()),
218 artifact_path: None,
219 version: None,
220 }
221 }
222
223 pub fn with_artifact_path(mut self, path: impl Into<PathBuf>) -> Self {
225 self.artifact_path = Some(path.into());
226 self
227 }
228
229 pub fn with_version(mut self, version: impl Into<String>) -> Self {
231 self.version = Some(version.into());
232 self
233 }
234
235 fn configured_library_path(&self) -> Option<PathBuf> {
236 self.library_path
237 .clone()
238 .or_else(|| env::var_os(PDFIUM_LIBRARY_PATH_ENV).map(PathBuf::from))
239 }
240
241 fn configured_artifact_path(&self) -> Option<PathBuf> {
242 self.artifact_path
243 .clone()
244 .or_else(|| env::var_os(PDFIUM_ARTIFACT_PATH_ENV).map(PathBuf::from))
245 }
246
247 fn configured_version_override(&self) -> Option<String> {
248 self.version
249 .clone()
250 .or_else(|| env::var(PDFIUM_VERSION_ENV).ok())
251 }
252
253 fn configured_version(&self) -> String {
254 self.configured_version_override()
255 .unwrap_or_else(|| pinned_pdfium_profile().version.clone())
256 }
257
258 pub fn probe_library(&self) -> Result<BackendManifest, EthosError> {
264 let _guard = PDFIUM_LOCK.lock().unwrap_or_else(|e| e.into_inner());
265 let runtime = PdfiumRuntime::load(self)?;
266 drop(runtime);
267 Ok(self.manifest())
268 }
269
270 pub fn geometry_probe(
276 &self,
277 pdf_bytes: &[u8],
278 config: &ParseConfig,
279 ) -> Result<GeometryProbeReport, EthosError> {
280 validate_pdf_header(pdf_bytes)?;
281 let _guard = PDFIUM_LOCK.lock().unwrap_or_else(|e| e.into_inner());
282 let runtime = PdfiumRuntime::load(self)?;
283 let doc = runtime.load_document(pdf_bytes)?;
284 let page_count = doc.page_count()?;
285 if page_count > config.limits.max_pages {
286 return Err(EthosError::new(
287 ErrorCode::PageLimitExceeded,
288 "page count exceeds configured limit",
289 ));
290 }
291 validate_page_selection(&config.pages, page_count)?;
292
293 let mut pages = Vec::new();
294 for page_index in 0..page_count {
295 let original_page = page_index + 1;
296 if !config.pages.contains(original_page) {
297 continue;
298 }
299 let page = doc.load_page(page_index)?;
300 pages.push(page.geometry_probe_page(original_page)?);
301 }
302
303 Ok(GeometryProbeReport {
304 schema_version: "ethos-pdfium-geometry-probe-v1".to_string(),
305 quantum_per_point: QUANTUM_PER_POINT,
306 backend: self.manifest(),
307 pages,
308 })
309 }
310
311 pub fn render_crop_raw(
317 &self,
318 pdf_bytes: &[u8],
319 page_index: u32,
320 bbox: QRect,
321 ) -> Result<RawCrop, EthosError> {
322 validate_pdf_header(pdf_bytes)?;
323 if page_index == 0 {
324 return Err(EthosError::new(
325 ErrorCode::PageLimitExceeded,
326 "page selection out of document range",
327 ));
328 }
329 let _guard = PDFIUM_LOCK.lock().unwrap_or_else(|e| e.into_inner());
330 let runtime = PdfiumRuntime::load(self)?;
331 let doc = runtime.load_document(pdf_bytes)?;
332 let page_count = doc.page_count()?;
333 if page_index > page_count {
334 return Err(EthosError::new(
335 ErrorCode::PageLimitExceeded,
336 "page selection out of document range",
337 ));
338 }
339 let page = doc.load_page(page_index - 1)?;
340 page.render_crop_raw(page_index, bbox)
341 }
342}
343
344impl EthosPdfBackend for PdfiumBackend {
345 fn manifest(&self) -> BackendManifest {
346 let platform_sha256 = self
347 .configured_library_path()
348 .and_then(|path| std::fs::read(path).ok())
349 .map(|bytes| ethos_core::c14n::sha256_hex_bytes(&bytes))
350 .unwrap_or_else(|| "0".repeat(64));
351 BackendManifest {
352 id: "pdfium".to_string(),
353 phase: 1,
354 version: self.configured_version(),
355 platform_sha256,
356 }
357 }
358
359 fn page_count(&self, pdf_bytes: &[u8]) -> Result<u32, EthosError> {
360 validate_pdf_header(pdf_bytes)?;
361 let _guard = PDFIUM_LOCK.lock().unwrap_or_else(|e| e.into_inner());
362 let runtime = PdfiumRuntime::load(self)?;
363 let doc = runtime.load_document(pdf_bytes)?;
364 doc.page_count()
365 }
366
367 fn extract(&self, pdf_bytes: &[u8], config: &ParseConfig) -> Result<Extraction, EthosError> {
368 validate_pdf_header(pdf_bytes)?;
369 let _guard = PDFIUM_LOCK.lock().unwrap_or_else(|e| e.into_inner());
370 let runtime = PdfiumRuntime::load(self)?;
371 let doc = runtime.load_document(pdf_bytes)?;
372 let page_count = doc.page_count()?;
373 if page_count > config.limits.max_pages {
374 return Err(EthosError::new(
375 ErrorCode::PageLimitExceeded,
376 "page count exceeds configured limit",
377 ));
378 }
379 validate_page_selection(&config.pages, page_count)?;
380
381 let mut pages = Vec::new();
382 let mut spans = Vec::new();
383 let mut warnings = Vec::new();
384 let mut next_span = 1u32;
385 let mut next_warning = 1u32;
386
387 for page_index in 0..page_count {
388 let original_page = page_index + 1;
389 if !config.pages.contains(original_page) {
390 continue;
391 }
392 let page = doc.load_page(page_index)?;
393 let page_model = page.model_page(original_page)?;
394 let span_count_before = spans.len();
395 page.extract_text_spans(&page_model, &mut next_span, &mut spans)?;
396 if spans.len() == span_count_before {
397 warnings.push(Warning {
398 id: warning_id(next_warning)?,
399 code: WarningCode::ImageOnlyPage,
400 message: "page has no extractable text; OCR is required for this page"
401 .to_string(),
402 page: Some(page_model.id.clone()),
403 element_ref: None,
404 span_ref: None,
405 region_ref: None,
406 });
407 next_warning += 1;
408 }
409 pages.push(page_model);
410 }
411
412 if spans.is_empty() {
413 return Err(EthosError::new(
414 ErrorCode::OcrRequired,
415 "no extractable text; OCR is required",
416 ));
417 }
418
419 Ok(Extraction {
420 pages,
421 spans,
422 regions: Vec::new(),
423 warnings,
424 })
425 }
426}
427
428fn validate_page_selection(selection: &PageSelection, page_count: u32) -> Result<(), EthosError> {
429 selection.validate_against(page_count).map_err(|_| {
430 EthosError::new(
431 ErrorCode::PageLimitExceeded,
432 "page selection out of document range",
433 )
434 })
435}
436
437fn validate_pdf_header(pdf_bytes: &[u8]) -> Result<(), EthosError> {
438 let window = &pdf_bytes[..pdf_bytes.len().min(1024)];
439 if window.windows(5).any(|w| w == b"%PDF-") {
440 Ok(())
441 } else {
442 Err(EthosError::new(
443 ErrorCode::InvalidPdf,
444 "input does not contain a PDF header",
445 ))
446 }
447}
448
449fn quantize_coord(value: f64) -> Result<i64, EthosError> {
450 quantize(value, QUANTUM_PER_POINT)
451 .map_err(|_| EthosError::new(ErrorCode::InternalError, "coordinate quantization failed"))
452}
453
454fn pixel_extent(points: f64) -> Result<u32, EthosError> {
455 if !points.is_finite() || points <= 0.0 {
456 return Err(EthosError::new(
457 ErrorCode::CorruptPdf,
458 "PDF page has invalid dimensions",
459 ));
460 }
461 if points.ceil() > f64::from(c_int::MAX) {
462 return Err(EthosError::internal("render bitmap dimension overflow"));
463 }
464 Ok(points.ceil() as u32)
465}
466
467fn floor_quantized_pixel(value: i64) -> i64 {
468 value.div_euclid(i64::from(QUANTUM_PER_POINT))
469}
470
471fn ceil_quantized_pixel(value: i64) -> i64 {
472 let quantum = i64::from(QUANTUM_PER_POINT);
473 value
474 .checked_add(quantum - 1)
475 .unwrap_or(i64::MAX)
476 .div_euclid(quantum)
477}
478
/// Limits `value` to the range `0..=max` and returns it as `u32`.
fn clamp_pixel(value: i64, max: u32) -> u32 {
    if value <= 0 {
        return 0;
    }
    // A positive value too large for `u32` is necessarily above `max`.
    u32::try_from(value).map_or(max, |pixel| pixel.min(max))
}
482
483fn crop_window(
484 bbox: QRect,
485 page_width_px: u32,
486 page_height_px: u32,
487) -> Result<(u32, u32, u32, u32), EthosError> {
488 let x0 = clamp_pixel(floor_quantized_pixel(bbox.x0), page_width_px);
489 let y0 = clamp_pixel(floor_quantized_pixel(bbox.y0), page_height_px);
490 let x1 = clamp_pixel(ceil_quantized_pixel(bbox.x1), page_width_px);
491 let y1 = clamp_pixel(ceil_quantized_pixel(bbox.y1), page_height_px);
492 if x0 >= x1 || y0 >= y1 {
493 return Err(EthosError::internal(
494 "crop bbox has no positive pixel extent",
495 ));
496 }
497 Ok((x0, y0, x1 - x0, y1 - y0))
498}
499
500fn qrect_from_pdfium_char_box(
501 page_height_pts: f64,
502 left: f64,
503 right: f64,
504 bottom: f64,
505 top: f64,
506) -> Result<QRect, EthosError> {
507 let x0 = left.min(right);
508 let x1 = left.max(right);
509 let y0 = page_height_pts - top.max(bottom);
510 let y1 = page_height_pts - top.min(bottom);
511 QRect::new(
512 quantize_coord(x0)?,
513 quantize_coord(y0)?,
514 quantize_coord(x1)?,
515 quantize_coord(y1)?,
516 )
517 .map_err(|_| EthosError::internal("malformed character bbox"))
518}
519
520fn union_rect(a: QRect, b: QRect) -> QRect {
521 QRect {
522 x0: a.x0.min(b.x0),
523 y0: a.y0.min(b.y0),
524 x1: a.x1.max(b.x1),
525 y1: a.y1.max(b.y1),
526 }
527}
528
529fn map_pdfium_error(code: c_ulong) -> EthosError {
530 match code {
531 4 => EthosError::new(
532 ErrorCode::PasswordProtected,
533 "document is encrypted or password-protected",
534 ),
535 5 => EthosError::new(
536 ErrorCode::UnsupportedPdfFeature,
537 "document uses a restricted security handler",
538 ),
539 3 => EthosError::new(ErrorCode::CorruptPdf, "PDF structure is corrupt"),
540 6 => EthosError::new(ErrorCode::CorruptPdf, "PDF page tree is corrupt"),
541 2 => EthosError::new(ErrorCode::CorruptPdf, "PDF could not be loaded"),
542 _ => EthosError::new(ErrorCode::CorruptPdf, "PDFium could not load the document"),
543 }
544}
545
/// Top level of the embedded deterministic profile JSON; only the `backend`
/// section is deserialized here.
#[derive(Debug, Deserialize)]
struct DeterministicProfile {
    // Pinned PDFium backend description.
    backend: PinnedPdfiumBackend,
}
550
/// Pinned PDFium backend description from the deterministic profile.
///
/// `validate_pinned_pdfium_profile` checks every field against the values
/// this crate expects before the profile is used.
#[derive(Debug, Deserialize)]
struct PinnedPdfiumBackend {
    // Backend identifier; must be "pdfium".
    id: String,
    // Profile phase; must be 1.
    phase: u8,
    // Pinned version string; also the default reported backend version.
    version: String,
    // Upstream version string, prefixed with "PDFium ".
    upstream_version: String,
    // V8 status; must be "disabled".
    v8: String,
    // XFA status; must be "disabled".
    xfa: String,
    // Where the binaries were obtained.
    distribution: PinnedPdfiumDistribution,
    // Build flags the binaries must have been produced with.
    build_flags: PinnedPdfiumBuildFlags,
    // Release artifact hash per platform key (for example "linux-x64").
    platform_hashes: BTreeMap<String, String>,
    // Artifact metadata per platform key.
    platform_artifacts: BTreeMap<String, PinnedPdfiumArtifact>,
    // Path of the profile documentation; must be "docs/pdfium-profile.md".
    profile_doc: String,
}
565
/// Distribution metadata of the pinned PDFium binaries, checked by
/// `validate_pinned_pdfium_distribution`.
#[derive(Debug, Deserialize)]
struct PinnedPdfiumDistribution {
    // Source repository; must be "bblanchon/pdfium-binaries".
    source: String,
    // Release URL; must be a release tag URL of that repository.
    release_url: String,
    // Publication timestamp; must end in 'Z'.
    published_at: String,
    // Attestation file shipped with the release.
    attestation: PinnedPdfiumAttestation,
}
573
/// Attestation file reference inside the distribution metadata.
#[derive(Debug, Deserialize)]
struct PinnedPdfiumAttestation {
    // File name; must be "pdfium-attestation.json".
    name: String,
    // Digest of the attestation; must be 64 lowercase hex characters.
    sha256: String,
}
579
/// Build flags recorded for the pinned PDFium binaries.
///
/// `validate_pinned_pdfium_build_flags` requires `pdf_is_standalone` to be
/// `true` and every other flag to be `false`.
#[derive(Debug, Deserialize)]
struct PinnedPdfiumBuildFlags {
    is_component_build: bool,
    is_debug: bool,
    pdf_enable_v8: bool,
    pdf_enable_xfa: bool,
    pdf_is_standalone: bool,
    pdf_use_partition_alloc: bool,
}
589
/// Per-platform artifact metadata, checked by
/// `validate_pinned_pdfium_platforms`.
#[derive(Debug, Deserialize)]
struct PinnedPdfiumArtifact {
    // Artifact file name; a ".tgz" that is neither a V8 nor an XFA build.
    name: String,
    // Target OS label ("mac", "linux" or "win").
    target_os: String,
    // Target CPU label ("arm64" or "x64").
    target_cpu: String,
    // Location of the runtime library; must be non-empty.
    // NOTE(review): presumably relative to the artifact root - confirm.
    runtime_library_path: String,
    // Digest the loaded library file must match; 64 lowercase hex characters.
    runtime_library_sha256: String,
}
598
599fn pinned_pdfium_profile() -> &'static PinnedPdfiumBackend {
600 PINNED_PDFIUM_PROFILE.get_or_init(|| {
601 let profile: DeterministicProfile = serde_json::from_str(DETERMINISTIC_PROFILE_JSON)
602 .expect("profiles/ethos-deterministic-v1.json is valid JSON");
603 validate_pinned_pdfium_profile(&profile.backend)
604 .expect("profiles/ethos-deterministic-v1.json pins a valid PDFium Phase 1 profile");
605 profile.backend
606 })
607}
608
609fn validate_pinned_pdfium_profile(profile: &PinnedPdfiumBackend) -> Result<(), &'static str> {
610 validate_pinned_pdfium_identity(profile)?;
611 validate_pinned_pdfium_distribution(&profile.distribution)?;
612 validate_pinned_pdfium_build_flags(&profile.build_flags)?;
613 validate_pinned_pdfium_platforms(profile)?;
614 Ok(())
615}
616
617fn validate_pinned_pdfium_identity(profile: &PinnedPdfiumBackend) -> Result<(), &'static str> {
618 if profile.id != "pdfium"
619 || profile.phase != 1
620 || profile.version != "chromium/7881"
621 || profile.upstream_version != "PDFium 151.0.7881.0"
622 || profile.v8 != "disabled"
623 || profile.xfa != "disabled"
624 || profile.profile_doc != "docs/pdfium-profile.md"
625 {
626 return Err("unexpected PDFium profile identity");
627 }
628 Ok(())
629}
630
631fn validate_pinned_pdfium_distribution(
632 distribution: &PinnedPdfiumDistribution,
633) -> Result<(), &'static str> {
634 if distribution.source != "bblanchon/pdfium-binaries"
635 || distribution.attestation.name != "pdfium-attestation.json"
636 || !is_sha256_hex(&distribution.attestation.sha256)
637 || !distribution
638 .release_url
639 .starts_with("https://github.com/bblanchon/pdfium-binaries/releases/tag/")
640 || !distribution.published_at.ends_with('Z')
641 {
642 return Err("unexpected PDFium distribution metadata");
643 }
644 Ok(())
645}
646
647fn validate_pinned_pdfium_build_flags(
648 build_flags: &PinnedPdfiumBuildFlags,
649) -> Result<(), &'static str> {
650 if build_flags.is_component_build
651 || build_flags.is_debug
652 || build_flags.pdf_enable_v8
653 || build_flags.pdf_enable_xfa
654 || !build_flags.pdf_is_standalone
655 || build_flags.pdf_use_partition_alloc
656 {
657 return Err("PDFium Phase 1 must be standalone release with V8/XFA disabled");
658 }
659 Ok(())
660}
661
662fn validate_pinned_pdfium_platforms(profile: &PinnedPdfiumBackend) -> Result<(), &'static str> {
663 for platform in ["macos-arm64", "linux-x64", "windows-x64"] {
664 let artifact_hash = profile
665 .platform_hashes
666 .get(platform)
667 .ok_or("missing PDFium artifact hash")?;
668 if !is_sha256_hex(artifact_hash) {
669 return Err("malformed PDFium artifact hash");
670 }
671 let artifact = profile
672 .platform_artifacts
673 .get(platform)
674 .ok_or("missing PDFium platform artifact metadata")?;
675 if artifact.name.contains("-v8-")
676 || artifact.name.contains("xfa")
677 || !artifact.name.ends_with(".tgz")
678 || artifact.runtime_library_path.is_empty()
679 || !is_sha256_hex(&artifact.runtime_library_sha256)
680 {
681 return Err("malformed PDFium platform artifact metadata");
682 }
683 match platform {
684 "macos-arm64"
685 if artifact.name == "pdfium-mac-arm64.tgz"
686 && artifact.target_os == "mac"
687 && artifact.target_cpu == "arm64" => {}
688 "linux-x64"
689 if artifact.name == "pdfium-linux-x64.tgz"
690 && artifact.target_os == "linux"
691 && artifact.target_cpu == "x64" => {}
692 "windows-x64"
693 if artifact.name == "pdfium-win-x64.tgz"
694 && artifact.target_os == "win"
695 && artifact.target_cpu == "x64" => {}
696 _ => return Err("unexpected PDFium platform artifact"),
697 }
698 }
699 Ok(())
700}
701
/// Returns `true` when `value` is exactly 64 lowercase hexadecimal
/// characters.
fn is_sha256_hex(value: &str) -> bool {
    const DIGEST_HEX_LEN: usize = 64;

    if value.len() != DIGEST_HEX_LEN {
        return false;
    }
    value
        .bytes()
        .all(|byte| matches!(byte, b'0'..=b'9' | b'a'..=b'f'))
}
708
/// Returns the profile key of the platform this crate was compiled for, or
/// `None` when the platform is not one of the three pinned ones.
fn current_platform_key() -> Option<&'static str> {
    let candidates = [
        (
            cfg!(all(target_os = "macos", target_arch = "aarch64")),
            "macos-arm64",
        ),
        (
            cfg!(all(target_os = "linux", target_arch = "x86_64")),
            "linux-x64",
        ),
        (
            cfg!(all(target_os = "windows", target_arch = "x86_64")),
            "windows-x64",
        ),
    ];
    candidates
        .into_iter()
        .find_map(|(is_current, key)| is_current.then_some(key))
}
720
721fn current_pdfium_pins(
722 profile: &PinnedPdfiumBackend,
723) -> Result<(&'static str, &str, &PinnedPdfiumArtifact), EthosError> {
724 let platform = current_platform_key().ok_or_else(|| {
725 EthosError::internal("pdfium phase 1 profile has no hash for this platform")
726 })?;
727 let artifact_hash = profile.platform_hashes.get(platform).ok_or_else(|| {
728 EthosError::internal("pdfium phase 1 profile has no hash for this platform")
729 })?;
730 let artifact = profile.platform_artifacts.get(platform).ok_or_else(|| {
731 EthosError::internal("pdfium phase 1 profile has no artifact for this platform")
732 })?;
733 Ok((platform, artifact_hash.as_str(), artifact))
734}
735
736fn validate_pinned_pdfium_payload(
737 backend: &PdfiumBackend,
738 library_path: &Path,
739) -> Result<(), EthosError> {
740 let profile = pinned_pdfium_profile();
741 if let Some(version) = backend.configured_version_override() {
742 let upstream_number = profile
743 .upstream_version
744 .strip_prefix("PDFium ")
745 .unwrap_or(&profile.upstream_version);
746 if version != profile.version
747 && version != profile.upstream_version
748 && version != upstream_number
749 {
750 return Err(EthosError::internal(
751 "pdfium version does not match pinned phase 1 profile",
752 ));
753 }
754 }
755
756 let (_, artifact_hash, artifact) = current_pdfium_pins(profile)?;
757 if let Some(artifact_path) = backend.configured_artifact_path() {
758 if !artifact_path.is_file() {
759 return Err(EthosError::internal(
760 "pdfium artifact path does not point to a file",
761 ));
762 }
763 let actual_artifact_hash = sha256_file(&artifact_path)?;
764 if actual_artifact_hash != artifact_hash {
765 return Err(EthosError::internal(
766 "pdfium artifact does not match pinned phase 1 profile",
767 ));
768 }
769 }
770
771 let library_hash = sha256_file(library_path)?;
772 if library_hash != artifact.runtime_library_sha256 {
773 return Err(EthosError::internal(
774 "pdfium library does not match pinned phase 1 profile",
775 ));
776 }
777
778 Ok(())
779}
780
781fn sha256_file(path: &Path) -> Result<String, EthosError> {
782 let bytes =
783 std::fs::read(path).map_err(|_| EthosError::internal("failed to read pdfium payload"))?;
784 Ok(ethos_core::c14n::sha256_hex_bytes(&bytes))
785}
786
787type FpdfDocument = *mut c_void;
788type FpdfPage = *mut c_void;
789type FpdfTextPage = *mut c_void;
790type FpdfBitmap = *mut c_void;
791
792#[cfg(not(windows))]
793type FpdfInitLibrary = unsafe extern "C" fn();
794#[cfg(windows)]
795type FpdfInitLibrary = unsafe extern "system" fn();
796#[cfg(not(windows))]
797type FpdfDestroyLibrary = unsafe extern "C" fn();
798#[cfg(windows)]
799type FpdfDestroyLibrary = unsafe extern "system" fn();
800#[cfg(not(windows))]
801type FpdfLoadMemDocument64 =
802 unsafe extern "C" fn(*const c_void, usize, *const c_char) -> FpdfDocument;
803#[cfg(windows)]
804type FpdfLoadMemDocument64 =
805 unsafe extern "system" fn(*const c_void, usize, *const c_char) -> FpdfDocument;
806#[cfg(not(windows))]
807type FpdfCloseDocument = unsafe extern "C" fn(FpdfDocument);
808#[cfg(windows)]
809type FpdfCloseDocument = unsafe extern "system" fn(FpdfDocument);
810#[cfg(not(windows))]
811type FpdfGetLastError = unsafe extern "C" fn() -> c_ulong;
812#[cfg(windows)]
813type FpdfGetLastError = unsafe extern "system" fn() -> c_ulong;
814#[cfg(not(windows))]
815type FpdfGetPageCount = unsafe extern "C" fn(FpdfDocument) -> c_int;
816#[cfg(windows)]
817type FpdfGetPageCount = unsafe extern "system" fn(FpdfDocument) -> c_int;
818#[cfg(not(windows))]
819type FpdfLoadPage = unsafe extern "C" fn(FpdfDocument, c_int) -> FpdfPage;
820#[cfg(windows)]
821type FpdfLoadPage = unsafe extern "system" fn(FpdfDocument, c_int) -> FpdfPage;
822#[cfg(not(windows))]
823type FpdfClosePage = unsafe extern "C" fn(FpdfPage);
824#[cfg(windows)]
825type FpdfClosePage = unsafe extern "system" fn(FpdfPage);
826#[cfg(not(windows))]
827type FpdfGetPageWidthF = unsafe extern "C" fn(FpdfPage) -> f32;
828#[cfg(windows)]
829type FpdfGetPageWidthF = unsafe extern "system" fn(FpdfPage) -> f32;
830#[cfg(not(windows))]
831type FpdfGetPageHeightF = unsafe extern "C" fn(FpdfPage) -> f32;
832#[cfg(windows)]
833type FpdfGetPageHeightF = unsafe extern "system" fn(FpdfPage) -> f32;
834#[cfg(not(windows))]
835type FpdfPageGetRotation = unsafe extern "C" fn(FpdfPage) -> c_int;
836#[cfg(windows)]
837type FpdfPageGetRotation = unsafe extern "system" fn(FpdfPage) -> c_int;
838#[cfg(not(windows))]
839type FpdfTextLoadPage = unsafe extern "C" fn(FpdfPage) -> FpdfTextPage;
840#[cfg(windows)]
841type FpdfTextLoadPage = unsafe extern "system" fn(FpdfPage) -> FpdfTextPage;
842#[cfg(not(windows))]
843type FpdfTextClosePage = unsafe extern "C" fn(FpdfTextPage);
844#[cfg(windows)]
845type FpdfTextClosePage = unsafe extern "system" fn(FpdfTextPage);
846#[cfg(not(windows))]
847type FpdfTextCountChars = unsafe extern "C" fn(FpdfTextPage) -> c_int;
848#[cfg(windows)]
849type FpdfTextCountChars = unsafe extern "system" fn(FpdfTextPage) -> c_int;
850#[cfg(not(windows))]
851type FpdfTextGetUnicode = unsafe extern "C" fn(FpdfTextPage, c_int) -> u32;
852#[cfg(windows)]
853type FpdfTextGetUnicode = unsafe extern "system" fn(FpdfTextPage, c_int) -> u32;
854#[cfg(not(windows))]
855type FpdfTextGetCharBox =
856 unsafe extern "C" fn(FpdfTextPage, c_int, *mut f64, *mut f64, *mut f64, *mut f64) -> c_int;
857#[cfg(windows)]
858type FpdfTextGetCharBox =
859 unsafe extern "system" fn(FpdfTextPage, c_int, *mut f64, *mut f64, *mut f64, *mut f64) -> c_int;
860#[cfg(not(windows))]
861type FpdfTextGetLooseCharBox = unsafe extern "C" fn(FpdfTextPage, c_int, *mut FsRectF) -> c_int;
862#[cfg(windows)]
863type FpdfTextGetLooseCharBox =
864 unsafe extern "system" fn(FpdfTextPage, c_int, *mut FsRectF) -> c_int;
865#[cfg(not(windows))]
866type FpdfTextGetCharOrigin = unsafe extern "C" fn(FpdfTextPage, c_int, *mut f64, *mut f64) -> c_int;
867#[cfg(windows)]
868type FpdfTextGetCharOrigin =
869 unsafe extern "system" fn(FpdfTextPage, c_int, *mut f64, *mut f64) -> c_int;
870#[cfg(not(windows))]
871type FpdfTextCountRects = unsafe extern "C" fn(FpdfTextPage, c_int, c_int) -> c_int;
872#[cfg(windows)]
873type FpdfTextCountRects = unsafe extern "system" fn(FpdfTextPage, c_int, c_int) -> c_int;
874#[cfg(not(windows))]
875type FpdfTextGetRect =
876 unsafe extern "C" fn(FpdfTextPage, c_int, *mut f64, *mut f64, *mut f64, *mut f64) -> c_int;
877#[cfg(windows)]
878type FpdfTextGetRect =
879 unsafe extern "system" fn(FpdfTextPage, c_int, *mut f64, *mut f64, *mut f64, *mut f64) -> c_int;
880#[cfg(not(windows))]
881type FpdfTextGetFontSize = unsafe extern "C" fn(FpdfTextPage, c_int) -> f64;
882#[cfg(windows)]
883type FpdfTextGetFontSize = unsafe extern "system" fn(FpdfTextPage, c_int) -> f64;
884#[cfg(not(windows))]
885type FpdfTextGetFontInfo =
886 unsafe extern "C" fn(FpdfTextPage, c_int, *mut c_void, c_ulong, *mut c_int) -> c_ulong;
887#[cfg(windows)]
888type FpdfTextGetFontInfo =
889 unsafe extern "system" fn(FpdfTextPage, c_int, *mut c_void, c_ulong, *mut c_int) -> c_ulong;
890#[cfg(not(windows))]
891type FpdfTextIsGenerated = unsafe extern "C" fn(FpdfTextPage, c_int) -> c_int;
892#[cfg(windows)]
893type FpdfTextIsGenerated = unsafe extern "system" fn(FpdfTextPage, c_int) -> c_int;
894#[cfg(not(windows))]
895type FpdfTextIsHyphen = unsafe extern "C" fn(FpdfTextPage, c_int) -> c_int;
896#[cfg(windows)]
897type FpdfTextIsHyphen = unsafe extern "system" fn(FpdfTextPage, c_int) -> c_int;
898#[cfg(not(windows))]
899type FpdfBitmapCreate = unsafe extern "C" fn(c_int, c_int, c_int) -> FpdfBitmap;
900#[cfg(windows)]
901type FpdfBitmapCreate = unsafe extern "system" fn(c_int, c_int, c_int) -> FpdfBitmap;
902#[cfg(not(windows))]
903type FpdfBitmapDestroy = unsafe extern "C" fn(FpdfBitmap);
904#[cfg(windows)]
905type FpdfBitmapDestroy = unsafe extern "system" fn(FpdfBitmap);
906#[cfg(not(windows))]
907type FpdfBitmapFillRect = unsafe extern "C" fn(FpdfBitmap, c_int, c_int, c_int, c_int, c_ulong);
908#[cfg(windows)]
909type FpdfBitmapFillRect =
910 unsafe extern "system" fn(FpdfBitmap, c_int, c_int, c_int, c_int, c_ulong);
911#[cfg(not(windows))]
912type FpdfBitmapGetBuffer = unsafe extern "C" fn(FpdfBitmap) -> *mut c_void;
913#[cfg(windows)]
914type FpdfBitmapGetBuffer = unsafe extern "system" fn(FpdfBitmap) -> *mut c_void;
915#[cfg(not(windows))]
916type FpdfBitmapGetStride = unsafe extern "C" fn(FpdfBitmap) -> c_int;
917#[cfg(windows)]
918type FpdfBitmapGetStride = unsafe extern "system" fn(FpdfBitmap) -> c_int;
919#[cfg(not(windows))]
920type FpdfRenderPageBitmap =
921 unsafe extern "C" fn(FpdfBitmap, FpdfPage, c_int, c_int, c_int, c_int, c_int, c_int);
922#[cfg(windows)]
923type FpdfRenderPageBitmap =
924 unsafe extern "system" fn(FpdfBitmap, FpdfPage, c_int, c_int, c_int, c_int, c_int, c_int);
925
// C-layout rectangle of four `f32` edges, used as the out-parameter of
// `FPDFText_GetLooseCharBox` (see `FpdfTextGetLooseCharBox`).
// NOTE(review): presumably mirrors PDFium's `FS_RECTF`; confirm the field
// order against `fpdfview.h`.
#[repr(C)]
#[derive(Clone, Copy, Debug, Default)]
struct FsRectF {
    left: f32,
    top: f32,
    right: f32,
    bottom: f32,
}
934
// Table of PDFium entry points resolved from the loaded library.
//
// Plain fields are required: `PdfiumFunctions::load` fails when one of them
// is missing. `Option` fields are looked up with `optional_symbol` and are
// `None` when the library does not export them.
#[derive(Clone, Copy)]
struct PdfiumFunctions {
    // Library and document lifecycle.
    init_library: FpdfInitLibrary,
    destroy_library: FpdfDestroyLibrary,
    load_mem_document64: FpdfLoadMemDocument64,
    close_document: FpdfCloseDocument,
    get_last_error: FpdfGetLastError,
    // Pages.
    get_page_count: FpdfGetPageCount,
    load_page: FpdfLoadPage,
    close_page: FpdfClosePage,
    get_page_width_f: FpdfGetPageWidthF,
    get_page_height_f: FpdfGetPageHeightF,
    page_get_rotation: Option<FpdfPageGetRotation>,
    // Text extraction.
    text_load_page: FpdfTextLoadPage,
    text_close_page: FpdfTextClosePage,
    text_count_chars: FpdfTextCountChars,
    text_get_unicode: FpdfTextGetUnicode,
    text_get_char_box: FpdfTextGetCharBox,
    text_get_loose_char_box: Option<FpdfTextGetLooseCharBox>,
    text_get_char_origin: Option<FpdfTextGetCharOrigin>,
    text_count_rects: Option<FpdfTextCountRects>,
    text_get_rect: Option<FpdfTextGetRect>,
    text_get_font_size: FpdfTextGetFontSize,
    text_get_font_info: Option<FpdfTextGetFontInfo>,
    text_is_generated: Option<FpdfTextIsGenerated>,
    text_is_hyphen: Option<FpdfTextIsHyphen>,
    // Bitmap rendering; every entry is optional.
    bitmap_create: Option<FpdfBitmapCreate>,
    bitmap_destroy: Option<FpdfBitmapDestroy>,
    bitmap_fill_rect: Option<FpdfBitmapFillRect>,
    bitmap_get_buffer: Option<FpdfBitmapGetBuffer>,
    bitmap_get_stride: Option<FpdfBitmapGetStride>,
    render_page_bitmap: Option<FpdfRenderPageBitmap>,
}
968
impl PdfiumFunctions {
    // Resolves every PDFium entry point from `library`.
    //
    // Required symbols are resolved with `symbol` and abort the load with an
    // error when missing; optional ones are resolved with `optional_symbol`.
    // The returned pointers are only valid while `library` stays loaded, so
    // the caller has to keep it alive alongside this table.
    fn load(library: &dylib::Library) -> Result<Self, EthosError> {
        // SAFETY: NOTE(review) - relies on the contract of
        // `dylib::Library::symbol` / `optional_symbol`, which is outside the
        // visible part of this file. Each name must resolve to a function
        // whose signature and ABI match the type alias of the field it is
        // stored in.
        unsafe {
            Ok(PdfiumFunctions {
                init_library: library.symbol(b"FPDF_InitLibrary\0")?,
                destroy_library: library.symbol(b"FPDF_DestroyLibrary\0")?,
                load_mem_document64: library.symbol(b"FPDF_LoadMemDocument64\0")?,
                close_document: library.symbol(b"FPDF_CloseDocument\0")?,
                get_last_error: library.symbol(b"FPDF_GetLastError\0")?,
                get_page_count: library.symbol(b"FPDF_GetPageCount\0")?,
                load_page: library.symbol(b"FPDF_LoadPage\0")?,
                close_page: library.symbol(b"FPDF_ClosePage\0")?,
                get_page_width_f: library.symbol(b"FPDF_GetPageWidthF\0")?,
                get_page_height_f: library.symbol(b"FPDF_GetPageHeightF\0")?,
                page_get_rotation: library.optional_symbol(b"FPDFPage_GetRotation\0"),
                text_load_page: library.symbol(b"FPDFText_LoadPage\0")?,
                text_close_page: library.symbol(b"FPDFText_ClosePage\0")?,
                text_count_chars: library.symbol(b"FPDFText_CountChars\0")?,
                text_get_unicode: library.symbol(b"FPDFText_GetUnicode\0")?,
                text_get_char_box: library.symbol(b"FPDFText_GetCharBox\0")?,
                text_get_loose_char_box: library.optional_symbol(b"FPDFText_GetLooseCharBox\0"),
                text_get_char_origin: library.optional_symbol(b"FPDFText_GetCharOrigin\0"),
                text_count_rects: library.optional_symbol(b"FPDFText_CountRects\0"),
                text_get_rect: library.optional_symbol(b"FPDFText_GetRect\0"),
                text_get_font_size: library.symbol(b"FPDFText_GetFontSize\0")?,
                text_get_font_info: library.optional_symbol(b"FPDFText_GetFontInfo\0"),
                text_is_generated: library.optional_symbol(b"FPDFText_IsGenerated\0"),
                text_is_hyphen: library.optional_symbol(b"FPDFText_IsHyphen\0"),
                bitmap_create: library.optional_symbol(b"FPDFBitmap_Create\0"),
                bitmap_destroy: library.optional_symbol(b"FPDFBitmap_Destroy\0"),
                bitmap_fill_rect: library.optional_symbol(b"FPDFBitmap_FillRect\0"),
                bitmap_get_buffer: library.optional_symbol(b"FPDFBitmap_GetBuffer\0"),
                bitmap_get_stride: library.optional_symbol(b"FPDFBitmap_GetStride\0"),
                render_page_bitmap: library.optional_symbol(b"FPDF_RenderPageBitmap\0"),
            })
        }
    }

    // Reports which optional text-geometry symbols were resolved. Text rects
    // need both the count and the getter function to be usable.
    fn geometry_probe_symbols(self) -> GeometryProbeSymbols {
        GeometryProbeSymbols {
            char_origin: self.text_get_char_origin.is_some(),
            loose_char_box: self.text_get_loose_char_box.is_some(),
            text_rects: self.text_count_rects.is_some() && self.text_get_rect.is_some(),
        }
    }
}
1017
// A loaded and initialised PDFium library.
//
// Dropping the runtime calls `FPDF_DestroyLibrary` (see the `Drop` impl).
struct PdfiumRuntime {
    // Keeps the dynamic library loaded for as long as `funcs` is in use.
    _library: dylib::Library,
    // Entry points resolved from `_library`.
    funcs: PdfiumFunctions,
    // Whether `FPDF_InitLibrary` was called; `load` always sets it to `true`.
    initialized: bool,
}
1023
1024impl PdfiumRuntime {
1025 fn load(backend: &PdfiumBackend) -> Result<Self, EthosError> {
1026 let path = backend.configured_library_path().ok_or_else(|| {
1027 EthosError::internal(format!(
1028 "PDFium not found: set {PDFIUM_LIBRARY_PATH_ENV} to the caller-provided PDFium dynamic library path. {PDFIUM_SETUP_GUIDANCE}"
1029 ))
1030 })?;
1031 if !path.is_file() {
1032 return Err(EthosError::internal(format!(
1033 "pdfium library path does not point to a file. {PDFIUM_SETUP_GUIDANCE}"
1034 )));
1035 }
1036 validate_pinned_pdfium_payload(backend, &path)?;
1037
1038 let library = dylib::Library::open(&path)?;
1039 let funcs = PdfiumFunctions::load(&library)?;
1040 unsafe { (funcs.init_library)() };
1043 Ok(PdfiumRuntime {
1044 _library: library,
1045 funcs,
1046 initialized: true,
1047 })
1048 }
1049
1050 fn load_document<'a>(&'a self, pdf_bytes: &[u8]) -> Result<PdfDocument<'a>, EthosError> {
1051 let handle = unsafe {
1054 (self.funcs.load_mem_document64)(
1055 pdf_bytes.as_ptr().cast(),
1056 pdf_bytes.len(),
1057 ptr::null(),
1058 )
1059 };
1060 if handle.is_null() {
1061 let code = unsafe { (self.funcs.get_last_error)() };
1063 Err(map_pdfium_error(code))
1064 } else {
1065 Ok(PdfDocument {
1066 funcs: &self.funcs,
1067 handle,
1068 })
1069 }
1070 }
1071}
1072
1073impl Drop for PdfiumRuntime {
1074 fn drop(&mut self) {
1075 if self.initialized {
1076 unsafe { (self.funcs.destroy_library)() };
1078 }
1079 }
1080}
1081
/// An open PDFium document; the handle is passed to `close_document` when dropped.
struct PdfDocument<'a> {
    /// Function table borrowed from the runtime that opened the document.
    funcs: &'a PdfiumFunctions,
    /// Raw PDFium document handle.
    handle: FpdfDocument,
}
1086
1087impl PdfDocument<'_> {
1088 fn page_count(&self) -> Result<u32, EthosError> {
1089 let count = unsafe { (self.funcs.get_page_count)(self.handle) };
1091 if count <= 0 {
1092 return Err(EthosError::new(
1093 ErrorCode::CorruptPdf,
1094 "PDF has no readable pages",
1095 ));
1096 }
1097 u32::try_from(count).map_err(|_| EthosError::internal("page count overflow"))
1098 }
1099
1100 fn load_page(&self, page_index: u32) -> Result<PdfPage<'_>, EthosError> {
1101 let index =
1102 c_int::try_from(page_index).map_err(|_| EthosError::internal("page overflow"))?;
1103 let handle = unsafe { (self.funcs.load_page)(self.handle, index) };
1105 if handle.is_null() {
1106 let code = unsafe { (self.funcs.get_last_error)() };
1108 Err(map_pdfium_error(code))
1109 } else {
1110 Ok(PdfPage {
1111 funcs: self.funcs,
1112 handle,
1113 })
1114 }
1115 }
1116}
1117
1118impl Drop for PdfDocument<'_> {
1119 fn drop(&mut self) {
1120 unsafe { (self.funcs.close_document)(self.handle) };
1122 }
1123}
1124
/// A loaded PDFium page; the handle is passed to `close_page` when dropped.
struct PdfPage<'a> {
    /// Function table borrowed from the owning runtime.
    funcs: &'a PdfiumFunctions,
    /// Raw PDFium page handle.
    handle: FpdfPage,
}
1129
1130impl PdfPage<'_> {
1131 fn width_pts(&self) -> f64 {
1132 unsafe { (self.funcs.get_page_width_f)(self.handle) as f64 }
1134 }
1135
1136 fn height_pts(&self) -> f64 {
1137 unsafe { (self.funcs.get_page_height_f)(self.handle) as f64 }
1139 }
1140
1141 fn rotation(&self) -> u16 {
1142 let Some(get_rotation) = self.funcs.page_get_rotation else {
1143 return 0;
1144 };
1145 match unsafe { get_rotation(self.handle) }.rem_euclid(4) {
1147 1 => 90,
1148 2 => 180,
1149 3 => 270,
1150 _ => 0,
1151 }
1152 }
1153
1154 fn model_page(&self, original_page: u32) -> Result<Page, EthosError> {
1155 Ok(Page {
1156 id: page_id(original_page)?,
1157 index: original_page,
1158 width: quantize_coord(self.width_pts())?,
1159 height: quantize_coord(self.height_pts())?,
1160 rotation: self.rotation(),
1161 })
1162 }
1163
1164 fn geometry_probe_page(&self, original_page: u32) -> Result<GeometryProbePage, EthosError> {
1165 let page = self.model_page(original_page)?;
1166 let text_handle = unsafe { (self.funcs.text_load_page)(self.handle) };
1168 if text_handle.is_null() {
1169 return Ok(GeometryProbePage {
1170 id: page.id,
1171 index: page.index,
1172 width: page.width,
1173 height: page.height,
1174 rotation: page.rotation,
1175 char_count: 0,
1176 symbols: self.funcs.geometry_probe_symbols(),
1177 chars: Vec::new(),
1178 runs: Vec::new(),
1179 });
1180 }
1181 let text_page = PdfTextPage {
1182 funcs: self.funcs,
1183 handle: text_handle,
1184 };
1185 text_page.geometry_probe(&page, self.height_pts())
1186 }
1187
1188 fn extract_text_spans(
1189 &self,
1190 page: &Page,
1191 next_span: &mut u32,
1192 spans: &mut Vec<Span>,
1193 ) -> Result<(), EthosError> {
1194 let text_handle = unsafe { (self.funcs.text_load_page)(self.handle) };
1196 if text_handle.is_null() {
1197 return Ok(());
1198 }
1199 let text_page = PdfTextPage {
1200 funcs: self.funcs,
1201 handle: text_handle,
1202 };
1203 text_page.extract_runs(page, self.height_pts(), next_span, spans)
1204 }
1205
1206 fn render_crop_raw(&self, page_index: u32, bbox: QRect) -> Result<RawCrop, EthosError> {
1207 let bitmap = RenderBitmap::render_page(
1208 self.funcs,
1209 self.handle,
1210 pixel_extent(self.width_pts())?,
1211 pixel_extent(self.height_pts())?,
1212 )?;
1213 let (x0, y0, width_px, height_px) = crop_window(bbox, bitmap.width_px, bitmap.height_px)?;
1214 let bytes = bitmap.crop_bytes(x0, y0, width_px, height_px)?;
1215 Ok(RawCrop {
1216 page_index,
1217 bbox,
1218 width_px,
1219 height_px,
1220 stride: width_px
1221 .checked_mul(4)
1222 .ok_or_else(|| EthosError::internal("crop stride overflow"))?,
1223 pixel_format: "bgra_8u",
1224 sha256: ethos_core::c14n::sha256_hex_bytes(&bytes),
1225 bytes,
1226 })
1227 }
1228}
1229
1230impl Drop for PdfPage<'_> {
1231 fn drop(&mut self) {
1232 unsafe { (self.funcs.close_page)(self.handle) };
1234 }
1235}
1236
/// A loaded PDFium text page; the handle is passed to `text_close_page` when dropped.
struct PdfTextPage<'a> {
    /// Function table borrowed from the owning runtime.
    funcs: &'a PdfiumFunctions,
    /// Raw PDFium text-page handle.
    handle: FpdfTextPage,
}
1241
/// A PDFium bitmap holding a rendered page; destroyed on drop when the destroy symbol exists.
struct RenderBitmap<'a> {
    /// Function table borrowed from the owning runtime.
    funcs: &'a PdfiumFunctions,
    /// Raw PDFium bitmap handle.
    handle: FpdfBitmap,
    /// Bitmap width in pixels, as requested at creation.
    width_px: u32,
    /// Bitmap height in pixels, as requested at creation.
    height_px: u32,
    /// Row stride reported by `bitmap_get_stride`; filled in after rendering.
    stride: usize,
}
1249
1250impl RenderBitmap<'_> {
1251 fn render_page(
1252 funcs: &PdfiumFunctions,
1253 page: FpdfPage,
1254 width_px: u32,
1255 height_px: u32,
1256 ) -> Result<RenderBitmap<'_>, EthosError> {
1257 let Some(bitmap_create) = funcs.bitmap_create else {
1258 return Err(EthosError::internal(
1259 "pdfium library is missing bitmap render symbols",
1260 ));
1261 };
1262 let Some(bitmap_fill_rect) = funcs.bitmap_fill_rect else {
1263 return Err(EthosError::internal(
1264 "pdfium library is missing bitmap render symbols",
1265 ));
1266 };
1267 let Some(render_page_bitmap) = funcs.render_page_bitmap else {
1268 return Err(EthosError::internal(
1269 "pdfium library is missing bitmap render symbols",
1270 ));
1271 };
1272 let width = c_int::try_from(width_px)
1273 .map_err(|_| EthosError::internal("render bitmap width overflow"))?;
1274 let height = c_int::try_from(height_px)
1275 .map_err(|_| EthosError::internal("render bitmap height overflow"))?;
1276
1277 let handle = unsafe { bitmap_create(width, height, 1) };
1279 if handle.is_null() {
1280 return Err(EthosError::internal(
1281 "pdfium failed to allocate render bitmap",
1282 ));
1283 }
1284 let mut bitmap = RenderBitmap {
1285 funcs,
1286 handle,
1287 width_px,
1288 height_px,
1289 stride: 0,
1290 };
1291 unsafe { bitmap_fill_rect(bitmap.handle, 0, 0, width, height, 0xFFFF_FFFF) };
1293 unsafe { render_page_bitmap(bitmap.handle, page, 0, 0, width, height, 0, 0) };
1295 bitmap.stride = bitmap.read_stride()?;
1296 Ok(bitmap)
1297 }
1298
1299 fn read_stride(&self) -> Result<usize, EthosError> {
1300 let Some(bitmap_get_stride) = self.funcs.bitmap_get_stride else {
1301 return Err(EthosError::internal(
1302 "pdfium library is missing bitmap render symbols",
1303 ));
1304 };
1305 let stride = unsafe { bitmap_get_stride(self.handle) };
1307 if stride <= 0 {
1308 return Err(EthosError::internal(
1309 "pdfium render bitmap has invalid stride",
1310 ));
1311 }
1312 usize::try_from(stride).map_err(|_| EthosError::internal("render bitmap stride overflow"))
1313 }
1314
1315 fn crop_bytes(
1316 &self,
1317 x0: u32,
1318 y0: u32,
1319 width_px: u32,
1320 height_px: u32,
1321 ) -> Result<Vec<u8>, EthosError> {
1322 let Some(bitmap_get_buffer) = self.funcs.bitmap_get_buffer else {
1323 return Err(EthosError::internal(
1324 "pdfium library is missing bitmap render symbols",
1325 ));
1326 };
1327 let ptr = unsafe { bitmap_get_buffer(self.handle) };
1329 if ptr.is_null() {
1330 return Err(EthosError::internal("pdfium render bitmap has null buffer"));
1331 }
1332 let full_len = self
1333 .stride
1334 .checked_mul(
1335 usize::try_from(self.height_px)
1336 .map_err(|_| EthosError::internal("render bitmap height overflow"))?,
1337 )
1338 .ok_or_else(|| EthosError::internal("render bitmap buffer length overflow"))?;
1339 let full = unsafe { slice::from_raw_parts(ptr.cast::<u8>(), full_len) };
1341
1342 let x0 = usize::try_from(x0).map_err(|_| EthosError::internal("crop x overflow"))?;
1343 let y0 = usize::try_from(y0).map_err(|_| EthosError::internal("crop y overflow"))?;
1344 let width =
1345 usize::try_from(width_px).map_err(|_| EthosError::internal("crop width overflow"))?;
1346 let height =
1347 usize::try_from(height_px).map_err(|_| EthosError::internal("crop height overflow"))?;
1348 let row_bytes = width
1349 .checked_mul(4)
1350 .ok_or_else(|| EthosError::internal("crop row width overflow"))?;
1351 let mut out = Vec::with_capacity(
1352 row_bytes
1353 .checked_mul(height)
1354 .ok_or_else(|| EthosError::internal("crop buffer length overflow"))?,
1355 );
1356 for row in 0..height {
1357 let src_start = y0
1358 .checked_add(row)
1359 .and_then(|y| y.checked_mul(self.stride))
1360 .and_then(|base| base.checked_add(x0.checked_mul(4)?))
1361 .ok_or_else(|| EthosError::internal("crop source offset overflow"))?;
1362 let src_end = src_start
1363 .checked_add(row_bytes)
1364 .ok_or_else(|| EthosError::internal("crop source row overflow"))?;
1365 if src_end > full.len() {
1366 return Err(EthosError::internal(
1367 "crop source row exceeds render bitmap",
1368 ));
1369 }
1370 out.extend_from_slice(&full[src_start..src_end]);
1371 }
1372 Ok(out)
1373 }
1374}
1375
1376impl Drop for RenderBitmap<'_> {
1377 fn drop(&mut self) {
1378 if let Some(bitmap_destroy) = self.funcs.bitmap_destroy {
1379 unsafe { bitmap_destroy(self.handle) };
1381 }
1382 }
1383}
1384
1385impl PdfTextPage<'_> {
1386 fn geometry_probe(
1387 &self,
1388 page: &Page,
1389 page_height_pts: f64,
1390 ) -> Result<GeometryProbePage, EthosError> {
1391 let count = unsafe { (self.funcs.text_count_chars)(self.handle) };
1393 if count < 0 {
1394 return Err(EthosError::new(
1395 ErrorCode::CorruptPdf,
1396 "PDF text page could not be read",
1397 ));
1398 }
1399
1400 let mut chars = Vec::new();
1401 let mut run = GeometryRunBuilder::default();
1402 let mut runs = Vec::new();
1403 let mut next_run = 1u32;
1404 for index in 0..count {
1405 let record = self.geometry_probe_char(index, page_height_pts)?;
1406 match record.parser_action.as_str() {
1407 "include" => {
1408 if run.has_style_change(&record.font_id, record.font_size_q, record.font_flags)
1409 {
1410 run.flush(self, page_height_pts, &mut next_run, &mut runs)?;
1411 }
1412 run.push(&record);
1413 }
1414 "skip_generated_hyphen" => {}
1415 _ => run.flush(self, page_height_pts, &mut next_run, &mut runs)?,
1416 }
1417 chars.push(record);
1418 }
1419 run.flush(self, page_height_pts, &mut next_run, &mut runs)?;
1420
1421 Ok(GeometryProbePage {
1422 id: page.id.clone(),
1423 index: page.index,
1424 width: page.width,
1425 height: page.height,
1426 rotation: page.rotation,
1427 char_count: count,
1428 symbols: self.funcs.geometry_probe_symbols(),
1429 chars,
1430 runs,
1431 })
1432 }
1433
1434 fn geometry_probe_char(
1435 &self,
1436 index: c_int,
1437 page_height_pts: f64,
1438 ) -> Result<GeometryProbeChar, EthosError> {
1439 let unicode = unsafe { (self.funcs.text_get_unicode)(self.handle, index) };
1441 let ch = char::from_u32(unicode);
1442 let parser_action = match ch {
1443 None => "break_invalid_unicode",
1444 Some(_) if self.is_generated_hyphen(index) => "skip_generated_hyphen",
1445 Some(ch) if should_break_text_run(ch) => "break_whitespace_or_control",
1446 Some(_) => "include",
1447 };
1448
1449 let font_info = self.font_info(index);
1450 Ok(GeometryProbeChar {
1451 index,
1452 unicode,
1453 text: ch.map(|ch| ch.to_string()),
1454 parser_action: parser_action.to_string(),
1455 char_box: self.char_bbox(index, page_height_pts)?,
1456 loose_char_box: self.loose_char_bbox(index, page_height_pts)?,
1457 char_origin: self.char_origin(index, page_height_pts)?,
1458 font_id: font_info.font_id,
1459 font_flags: font_info.font_flags,
1460 font_size_q: self.font_size_q(index),
1461 })
1462 }
1463
1464 fn extract_runs(
1465 &self,
1466 page: &Page,
1467 page_height_pts: f64,
1468 next_span: &mut u32,
1469 spans: &mut Vec<Span>,
1470 ) -> Result<(), EthosError> {
1471 let count = unsafe { (self.funcs.text_count_chars)(self.handle) };
1473 if count < 0 {
1474 return Err(EthosError::new(
1477 ErrorCode::CorruptPdf,
1478 "PDF text page could not be read",
1479 ));
1480 }
1481 if count == 0 {
1482 return Ok(());
1483 }
1484
1485 let mut run = SpanRun::default();
1486 for index in 0..count {
1487 let codepoint = unsafe { (self.funcs.text_get_unicode)(self.handle, index) };
1489 let Some(ch) = char::from_u32(codepoint) else {
1490 run.flush(page, next_span, spans)?;
1491 continue;
1492 };
1493 if self.is_generated_hyphen(index) {
1494 continue;
1495 }
1496 if should_break_text_run(ch) {
1497 run.flush(page, next_span, spans)?;
1498 continue;
1499 }
1500
1501 let Some(bbox) = self.char_bbox(index, page_height_pts)? else {
1502 run.flush(page, next_span, spans)?;
1503 continue;
1504 };
1505 let font_size_q = self.font_size_q(index);
1506 let font_info = self.font_info(index);
1507 if run.has_style_change(&font_info.font_id, font_size_q) {
1508 run.flush(page, next_span, spans)?;
1509 }
1510 let origin = self.char_origin(index, page_height_pts)?;
1511 run.push(ch, bbox, origin, font_info.font_id, font_size_q);
1512 }
1513 run.flush(page, next_span, spans)
1514 }
1515
1516 fn char_bbox(&self, index: c_int, page_height_pts: f64) -> Result<Option<QRect>, EthosError> {
1517 let mut left = 0.0f64;
1518 let mut right = 0.0f64;
1519 let mut bottom = 0.0f64;
1520 let mut top = 0.0f64;
1521 let ok = unsafe {
1523 (self.funcs.text_get_char_box)(
1524 self.handle,
1525 index,
1526 &mut left,
1527 &mut right,
1528 &mut bottom,
1529 &mut top,
1530 )
1531 };
1532 if ok == 0 {
1533 return Ok(None);
1534 }
1535 Ok(Some(qrect_from_pdfium_char_box(
1536 page_height_pts,
1537 left,
1538 right,
1539 bottom,
1540 top,
1541 )?))
1542 }
1543
1544 fn loose_char_bbox(
1545 &self,
1546 index: c_int,
1547 page_height_pts: f64,
1548 ) -> Result<Option<QRect>, EthosError> {
1549 let Some(get_loose_char_box) = self.funcs.text_get_loose_char_box else {
1550 return Ok(None);
1551 };
1552 let mut rect = FsRectF::default();
1553 let ok = unsafe { get_loose_char_box(self.handle, index, &mut rect) };
1555 if ok == 0 {
1556 return Ok(None);
1557 }
1558 Ok(Some(qrect_from_pdfium_char_box(
1559 page_height_pts,
1560 f64::from(rect.left),
1561 f64::from(rect.right),
1562 f64::from(rect.bottom),
1563 f64::from(rect.top),
1564 )?))
1565 }
1566
1567 fn char_origin(
1568 &self,
1569 index: c_int,
1570 page_height_pts: f64,
1571 ) -> Result<Option<[i64; 2]>, EthosError> {
1572 let Some(get_char_origin) = self.funcs.text_get_char_origin else {
1573 return Ok(None);
1574 };
1575 let mut x = 0.0f64;
1576 let mut y = 0.0f64;
1577 let ok = unsafe { get_char_origin(self.handle, index, &mut x, &mut y) };
1579 if ok == 0 {
1580 return Ok(None);
1581 }
1582 Ok(Some([
1583 quantize_coord(x)?,
1584 quantize_coord(page_height_pts - y)?,
1585 ]))
1586 }
1587
1588 fn text_rects(
1589 &self,
1590 char_start: c_int,
1591 char_count: c_int,
1592 page_height_pts: f64,
1593 ) -> Result<Vec<QRect>, EthosError> {
1594 let (Some(count_rects), Some(get_rect)) =
1595 (self.funcs.text_count_rects, self.funcs.text_get_rect)
1596 else {
1597 return Ok(Vec::new());
1598 };
1599 if char_count <= 0 {
1600 return Ok(Vec::new());
1601 }
1602 let rect_count = unsafe { count_rects(self.handle, char_start, char_count) };
1604 if rect_count <= 0 {
1605 return Ok(Vec::new());
1606 }
1607 let mut rects = Vec::new();
1608 for rect_index in 0..rect_count {
1609 let mut left = 0.0f64;
1610 let mut top = 0.0f64;
1611 let mut right = 0.0f64;
1612 let mut bottom = 0.0f64;
1613 let ok = unsafe {
1615 get_rect(
1616 self.handle,
1617 rect_index,
1618 &mut left,
1619 &mut top,
1620 &mut right,
1621 &mut bottom,
1622 )
1623 };
1624 if ok != 0 {
1625 rects.push(qrect_from_pdfium_char_box(
1626 page_height_pts,
1627 left,
1628 right,
1629 bottom,
1630 top,
1631 )?);
1632 }
1633 }
1634 Ok(rects)
1635 }
1636
1637 fn font_size_q(&self, index: c_int) -> Option<i64> {
1638 let size = unsafe { (self.funcs.text_get_font_size)(self.handle, index) };
1640 if size <= 0.0 {
1641 return None;
1642 }
1643 quantize(size, QUANTUM_PER_POINT).ok()
1644 }
1645
1646 fn font_info(&self, index: c_int) -> PdfFontInfo {
1647 let Some(get_font_info) = self.funcs.text_get_font_info else {
1648 return PdfFontInfo::default();
1649 };
1650 let len =
1652 unsafe { (get_font_info)(self.handle, index, ptr::null_mut(), 0, ptr::null_mut()) };
1653 if len == 0 || len > 4096 {
1654 return PdfFontInfo::default();
1655 }
1656
1657 let Ok(len_usize) = usize::try_from(len) else {
1658 return PdfFontInfo::default();
1659 };
1660 let mut buffer = vec![0u8; len_usize];
1661 let mut flags = 0;
1662 let written = unsafe {
1664 (get_font_info)(
1665 self.handle,
1666 index,
1667 buffer.as_mut_ptr().cast(),
1668 len,
1669 &mut flags,
1670 )
1671 };
1672 if written == 0 || written > len {
1673 return PdfFontInfo::default();
1674 }
1675 let nul = buffer.iter().position(|b| *b == 0).unwrap_or(buffer.len());
1676 let raw = std::str::from_utf8(&buffer[..nul]).ok();
1677 PdfFontInfo {
1678 font_id: raw.and_then(deterministic_font_id),
1679 font_flags: u32::try_from(flags).ok(),
1680 }
1681 }
1682
1683 fn is_generated_hyphen(&self, index: c_int) -> bool {
1684 let (Some(text_is_generated), Some(text_is_hyphen)) =
1685 (self.funcs.text_is_generated, self.funcs.text_is_hyphen)
1686 else {
1687 return false;
1688 };
1689 unsafe {
1691 text_is_generated(self.handle, index) == 1 && text_is_hyphen(self.handle, index) == 1
1692 }
1693 }
1694}
1695
1696impl Drop for PdfTextPage<'_> {
1697 fn drop(&mut self) {
1698 unsafe { (self.funcs.text_close_page)(self.handle) };
1700 }
1701}
1702
/// Returns true for characters that terminate a text run: whitespace and control
/// characters.
fn should_break_text_run(ch: char) -> bool {
    // NUL (U+0000) is itself a control character, so it needs no separate test.
    ch.is_whitespace() || ch.is_control()
}
1706
/// Accumulator for the characters of one text span while a page is being walked.
#[derive(Default)]
struct SpanRun {
    /// Characters collected so far; an empty string means there is nothing to flush.
    text: String,
    /// Union of the boxes of the collected characters.
    bbox: Option<QRect>,
    /// Origin of the first collected character, when one was reported.
    first_origin: Option<[i64; 2]>,
    /// Origin of the most recently collected character.
    last_origin: Option<[i64; 2]>,
    /// Font id adopted from the first character that supplied one.
    font_id: Option<String>,
    /// Quantized font size adopted from the first character that supplied one.
    font_size_q: Option<i64>,
}
1716
/// Accumulator for one geometry-probe run while a text page is being walked.
#[derive(Default)]
struct GeometryRunBuilder {
    /// Text collected so far; an empty string means there is nothing to flush.
    text: String,
    /// Indices of the collected characters, in push order.
    char_indices: Vec<i32>,
    /// Union of the collected characters' boxes.
    char_box_union: Option<QRect>,
    /// Union of the collected characters' loose boxes.
    loose_char_box_union: Option<QRect>,
    /// Origin of the first collected character, when one was reported.
    first_origin: Option<[i64; 2]>,
    /// Origin of the most recently collected character.
    last_origin: Option<[i64; 2]>,
    /// Font id adopted from the first character that supplied one.
    font_id: Option<String>,
    /// Quantized font size adopted from the first character that supplied one.
    font_size_q: Option<i64>,
    /// Font flags adopted from the first character that supplied them.
    font_flags: Option<u32>,
}
1729
/// Font details looked up for a single character; both fields default to `None`.
#[derive(Default)]
struct PdfFontInfo {
    /// Deterministic font id derived from the font name PDFium reported.
    font_id: Option<String>,
    /// Font flags reported by PDFium, when they fit in `u32`.
    font_flags: Option<u32>,
}
1735
/// Deserialized form of the bundled font substitution table.
#[derive(Debug, Deserialize)]
struct FontSubstitutionTable {
    /// Schema version of the table file; validation requires `"1.0.0"`.
    schema_version: String,
    /// Table identifier; validation requires `"ethos-font-substitution-v1"`.
    table_id: String,
    /// Table content version; validation requires `"1.0.0"`.
    version: String,
    /// Font id used when a font name has no mapping.
    default_unresolved_font_id: String,
    /// Source-name to font-id mappings; sources must be unique.
    mappings: Vec<FontSubstitutionMapping>,
}
1744
/// One entry of the font substitution table.
#[derive(Debug, Deserialize)]
struct FontSubstitutionMapping {
    /// Font name to match; compared for exact equality against the normalized name.
    source: String,
    /// Substitute font id; validation requires the `subst:` prefix.
    font_id: String,
}
1750
1751impl SpanRun {
1752 fn has_style_change(&self, font_id: &Option<String>, font_size_q: Option<i64>) -> bool {
1753 !self.text.is_empty() && (self.font_id != *font_id || self.font_size_q != font_size_q)
1754 }
1755
1756 fn push(
1757 &mut self,
1758 ch: char,
1759 bbox: QRect,
1760 origin: Option<[i64; 2]>,
1761 font_id: Option<String>,
1762 font_size_q: Option<i64>,
1763 ) {
1764 self.text.push(ch);
1765 self.bbox = Some(match self.bbox {
1766 Some(existing) => union_rect(existing, bbox),
1767 None => bbox,
1768 });
1769 if self.first_origin.is_none() {
1770 self.first_origin = origin;
1771 }
1772 self.last_origin = origin;
1773 if self.font_id.is_none() {
1774 self.font_id = font_id;
1775 }
1776 if self.font_size_q.is_none() {
1777 self.font_size_q = font_size_q;
1778 }
1779 }
1780
1781 fn flush(
1782 &mut self,
1783 page: &Page,
1784 next_span: &mut u32,
1785 spans: &mut Vec<Span>,
1786 ) -> Result<(), EthosError> {
1787 if self.text.is_empty() {
1788 return Ok(());
1789 }
1790 let bbox = self
1791 .bbox
1792 .ok_or_else(|| EthosError::internal("span run has text without bbox"))?;
1793 let origin_locator = match (self.first_origin.take(), self.last_origin.take()) {
1794 (Some(first_origin), Some(last_origin)) => Some(SpanOriginLocator {
1795 policy: ORIGIN_LOCATOR_POLICY.to_string(),
1796 first_origin,
1797 last_origin,
1798 }),
1799 _ => None,
1800 };
1801 spans.push(Span {
1802 id: span_id(*next_span)?,
1803 page: page.id.clone(),
1804 bbox,
1805 origin_locator,
1806 text: std::mem::take(&mut self.text),
1807 font_id: self.font_id.take(),
1808 font_size_q: self.font_size_q,
1809 char_start: None,
1810 char_end: None,
1811 warning_refs: Vec::new(),
1812 });
1813 *next_span += 1;
1814 self.bbox = None;
1815 self.first_origin = None;
1816 self.last_origin = None;
1817 self.font_id = None;
1818 self.font_size_q = None;
1819 Ok(())
1820 }
1821}
1822
1823impl GeometryRunBuilder {
1824 fn has_style_change(
1825 &self,
1826 font_id: &Option<String>,
1827 font_size_q: Option<i64>,
1828 font_flags: Option<u32>,
1829 ) -> bool {
1830 !self.text.is_empty()
1831 && (self.font_id != *font_id
1832 || self.font_size_q != font_size_q
1833 || self.font_flags != font_flags)
1834 }
1835
1836 fn push(&mut self, ch: &GeometryProbeChar) {
1837 if let Some(text) = &ch.text {
1838 self.text.push_str(text);
1839 }
1840 self.char_indices.push(ch.index);
1841 self.char_box_union = union_option_rect(self.char_box_union, ch.char_box);
1842 self.loose_char_box_union = union_option_rect(self.loose_char_box_union, ch.loose_char_box);
1843 if self.first_origin.is_none() {
1844 self.first_origin = ch.char_origin;
1845 }
1846 self.last_origin = ch.char_origin;
1847 if self.font_id.is_none() {
1848 self.font_id = ch.font_id.clone();
1849 }
1850 if self.font_size_q.is_none() {
1851 self.font_size_q = ch.font_size_q;
1852 }
1853 if self.font_flags.is_none() {
1854 self.font_flags = ch.font_flags;
1855 }
1856 }
1857
1858 fn flush(
1859 &mut self,
1860 text_page: &PdfTextPage<'_>,
1861 page_height_pts: f64,
1862 next_run: &mut u32,
1863 runs: &mut Vec<GeometryProbeRun>,
1864 ) -> Result<(), EthosError> {
1865 if self.text.is_empty() {
1866 return Ok(());
1867 }
1868 let char_start = self.char_indices.first().copied().unwrap_or_default();
1869 let char_end = self
1870 .char_indices
1871 .last()
1872 .copied()
1873 .map(|index| index + 1)
1874 .unwrap_or(char_start);
1875 let text_rects =
1876 text_page.text_rects(char_start, char_end - char_start, page_height_pts)?;
1877 runs.push(GeometryProbeRun {
1878 index: *next_run,
1879 text: std::mem::take(&mut self.text),
1880 char_start,
1881 char_end,
1882 char_indices: std::mem::take(&mut self.char_indices),
1883 char_box_union: self.char_box_union.take(),
1884 loose_char_box_union: self.loose_char_box_union.take(),
1885 text_rect_union: union_rects(text_rects.iter().copied()),
1886 text_rects,
1887 first_origin: self.first_origin.take(),
1888 last_origin: self.last_origin.take(),
1889 font_id: self.font_id.take(),
1890 font_flags: self.font_flags.take(),
1891 font_size_q: self.font_size_q.take(),
1892 });
1893 *next_run += 1;
1894 self.font_size_q = None;
1895 self.font_flags = None;
1896 Ok(())
1897 }
1898}
1899
1900fn union_option_rect(existing: Option<QRect>, next: Option<QRect>) -> Option<QRect> {
1901 match (existing, next) {
1902 (Some(a), Some(b)) => Some(union_rect(a, b)),
1903 (Some(a), None) => Some(a),
1904 (None, Some(b)) => Some(b),
1905 (None, None) => None,
1906 }
1907}
1908
1909fn union_rects(mut rects: impl Iterator<Item = QRect>) -> Option<QRect> {
1910 let first = rects.next()?;
1911 Some(rects.fold(first, union_rect))
1912}
1913
1914fn deterministic_font_id(raw_name: &str) -> Option<String> {
1915 let raw_name = raw_name.trim();
1916 if raw_name.is_empty() {
1917 return None;
1918 }
1919 let (name, subset) = strip_subset_prefix(raw_name);
1920 if subset {
1921 if let Some(normalized) = normalize_font_name(name) {
1922 if is_safe_font_id_suffix(&normalized) {
1923 return Some(format!("embedded:{normalized}"));
1924 }
1925 }
1926 return Some(hashed_embedded_font_id(name));
1927 }
1928 let normalized = normalize_font_name(name)?;
1929 font_substitution(&normalized)
1930 .or_else(|| Some(font_substitution_table().default_unresolved_font_id.clone()))
1931}
1932
/// Splits off a subset tag of the form `ABCDEF+` (six ASCII uppercase letters followed
/// by `+`) when something follows it.
///
/// Returns the remaining name and `true` when the tag was present, otherwise the input
/// unchanged and `false`.
fn strip_subset_prefix(name: &str) -> (&str, bool) {
    match name.split_once('+') {
        Some((tag, rest))
            if tag.len() == 6
                && !rest.is_empty()
                && tag.bytes().all(|byte| byte.is_ascii_uppercase()) =>
        {
            (rest, true)
        }
        _ => (name, false),
    }
}
1941
/// Normalizes a font name for use in an id.
///
/// Whitespace, control characters and the punctuation `/ \ : , ( ) [ ]` become `-`,
/// consecutive dashes collapse into one, and leading/trailing dashes are removed. All
/// other characters are kept as-is. Returns `None` when nothing remains.
fn normalize_font_name(name: &str) -> Option<String> {
    let is_separator = |ch: char| {
        ch.is_whitespace()
            || ch.is_control()
            || matches!(ch, '/' | '\\' | ':' | ',' | '(' | ')' | '[' | ']')
    };

    let mut normalized = String::new();
    for ch in name.trim().chars() {
        let mapped = if is_separator(ch) { '-' } else { ch };
        // Collapse dash runs: skip a dash when the output already ends in one.
        if mapped == '-' && normalized.ends_with('-') {
            continue;
        }
        normalized.push(mapped);
    }

    let trimmed = normalized.trim_matches('-');
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}
1969
/// Returns true when `name` is non-empty and consists solely of ASCII letters, ASCII
/// digits, `-`, `_` and `.`.
fn is_safe_font_id_suffix(name: &str) -> bool {
    if name.is_empty() {
        return false;
    }
    name.chars()
        .all(|ch| ch.is_ascii_alphanumeric() || ch == '-' || ch == '_' || ch == '.')
}
1976
1977fn hashed_embedded_font_id(name: &str) -> String {
1978 format!(
1979 "embedded:sha256-{}",
1980 ethos_core::c14n::sha256_hex_bytes(name.as_bytes())
1981 )
1982}
1983
1984fn font_substitution(name: &str) -> Option<String> {
1985 font_substitution_table()
1986 .mappings
1987 .iter()
1988 .find(|mapping| mapping.source == name)
1989 .map(|mapping| mapping.font_id.clone())
1990}
1991
1992fn font_substitution_table() -> &'static FontSubstitutionTable {
1993 FONT_SUBSTITUTION_TABLE.get_or_init(|| {
1994 let table: FontSubstitutionTable = serde_json::from_str(FONT_SUBSTITUTION_TABLE_JSON)
1995 .expect("bundled font-substitution-table.json is valid JSON");
1996 validate_font_substitution_table(&table)
1997 .expect("bundled font-substitution-table.json is internally valid");
1998 table
1999 })
2000}
2001
2002fn validate_font_substitution_table(table: &FontSubstitutionTable) -> Result<(), &'static str> {
2003 if table.schema_version != "1.0.0"
2004 || table.table_id != "ethos-font-substitution-v1"
2005 || table.version != "1.0.0"
2006 || table.default_unresolved_font_id != "subst:liberation-sans-regular"
2007 {
2008 return Err("unexpected font substitution table metadata");
2009 }
2010
2011 let mut seen = HashSet::new();
2012 for mapping in &table.mappings {
2013 if mapping.source.is_empty() || !mapping.font_id.starts_with("subst:") {
2014 return Err("malformed font substitution mapping");
2015 }
2016 if !seen.insert(mapping.source.as_str()) {
2017 return Err("duplicate font substitution mapping source");
2018 }
2019 }
2020
2021 Ok(())
2022}
2023
2024#[cfg(unix)]
2025mod dylib {
2026 use super::*;
2027 use std::os::unix::ffi::OsStrExt;
2028
2029 const RTLD_NOW: c_int = 2;
2030
2031 unsafe extern "C" {
2032 fn dlopen(filename: *const c_char, flag: c_int) -> *mut c_void;
2033 fn dlsym(handle: *mut c_void, symbol: *const c_char) -> *mut c_void;
2034 fn dlclose(handle: *mut c_void) -> c_int;
2035 }
2036
2037 pub(super) struct Library {
2038 handle: *mut c_void,
2039 }
2040
2041 impl Library {
2042 pub(super) fn open(path: &Path) -> Result<Self, EthosError> {
2043 let c_path = CString::new(path.as_os_str().as_bytes()).map_err(|_| {
2044 EthosError::internal("pdfium library path contains an interior NUL byte")
2045 })?;
2046 let handle = unsafe { dlopen(c_path.as_ptr(), RTLD_NOW) };
2048 if handle.is_null() {
2049 Err(EthosError::internal(
2050 "failed to load configured pdfium library",
2051 ))
2052 } else {
2053 Ok(Library { handle })
2054 }
2055 }
2056
2057 pub(super) unsafe fn symbol<T: Copy>(&self, name: &'static [u8]) -> Result<T, EthosError> {
2058 let ptr = self.symbol_ptr(name);
2059 if ptr.is_null() {
2060 return Err(EthosError::internal(format!(
2061 "pdfium library is missing symbol {}",
2062 symbol_name(name)
2063 )));
2064 }
2065 assert_symbol_pointer_size::<T>();
2066 Ok(unsafe { std::mem::transmute_copy::<*mut c_void, T>(&ptr) })
2068 }
2069
2070 pub(super) unsafe fn optional_symbol<T: Copy>(&self, name: &'static [u8]) -> Option<T> {
2071 let ptr = self.symbol_ptr(name);
2072 if ptr.is_null() {
2073 None
2074 } else {
2075 assert_symbol_pointer_size::<T>();
2076 Some(unsafe { std::mem::transmute_copy::<*mut c_void, T>(&ptr) })
2078 }
2079 }
2080
2081 fn symbol_ptr(&self, name: &'static [u8]) -> *mut c_void {
2082 unsafe { dlsym(self.handle, name.as_ptr().cast()) }
2084 }
2085 }
2086
2087 impl Drop for Library {
2088 fn drop(&mut self) {
2089 if !self.handle.is_null() {
2090 unsafe {
2092 let _ = dlclose(self.handle);
2093 }
2094 }
2095 }
2096 }
2097}
2098
/// Minimal dynamic-library loader for Windows built directly on
/// `LoadLibraryW`/`GetProcAddress`/`FreeLibrary`.
#[cfg(windows)]
mod dylib {
    use super::*;
    use std::os::windows::ffi::OsStrExt;

    unsafe extern "system" {
        fn LoadLibraryW(lp_lib_file_name: *const u16) -> *mut c_void;
        fn GetProcAddress(h_module: *mut c_void, lp_proc_name: *const c_char) -> *mut c_void;
        fn FreeLibrary(h_lib_module: *mut c_void) -> c_int;
    }

    /// An open dynamic library; the handle is passed to `FreeLibrary` on drop.
    pub(super) struct Library {
        handle: *mut c_void,
    }

    impl Library {
        /// Opens the library at `path`.
        ///
        /// # Errors
        /// Returns an internal error when the path contains an interior NUL code unit
        /// or when `LoadLibraryW` fails.
        pub(super) fn open(path: &Path) -> Result<Self, EthosError> {
            let mut wide_path: Vec<u16> = path.as_os_str().encode_wide().collect();
            if wide_path.contains(&0) {
                return Err(EthosError::internal(
                    "pdfium library path contains an interior NUL code unit",
                ));
            }
            wide_path.push(0);
            // SAFETY: `wide_path` is a NUL-terminated UTF-16 string that outlives the
            // call.
            let handle = unsafe { LoadLibraryW(wide_path.as_ptr()) };
            if handle.is_null() {
                Err(EthosError::internal(
                    "failed to load configured pdfium library",
                ))
            } else {
                Ok(Library { handle })
            }
        }

        /// Resolves a required symbol and reinterprets it as `T`.
        ///
        /// # Safety
        /// `T` must be a pointer-sized type (checked at compile time) whose actual
        /// signature matches the symbol exported under `name`.
        ///
        /// # Errors
        /// Returns an internal error naming the symbol when it cannot be resolved.
        pub(super) unsafe fn symbol<T: Copy>(&self, name: &'static [u8]) -> Result<T, EthosError> {
            // SAFETY: the caller upholds the contract documented above, which is the
            // same contract `optional_symbol` requires.
            let resolved = unsafe { self.optional_symbol::<T>(name) };
            resolved.ok_or_else(|| {
                EthosError::internal(format!(
                    "pdfium library is missing symbol {}",
                    symbol_name(name)
                ))
            })
        }

        /// Resolves an optional symbol and reinterprets it as `T`, or returns `None`
        /// when the library does not export it.
        ///
        /// # Safety
        /// `T` must be a pointer-sized type (checked at compile time) whose actual
        /// signature matches the symbol exported under `name`.
        pub(super) unsafe fn optional_symbol<T: Copy>(&self, name: &'static [u8]) -> Option<T> {
            let raw = self.symbol_ptr(name);
            if raw.is_null() {
                return None;
            }
            assert_symbol_pointer_size::<T>();
            // SAFETY: `raw` is non-null, `T` has the size of a pointer, and the caller
            // guarantees `T` matches the symbol's real type.
            Some(unsafe { std::mem::transmute_copy::<*mut c_void, T>(&raw) })
        }

        /// Looks up `name` with `GetProcAddress`, returning null when it is not found.
        ///
        /// `name` must be a single NUL-terminated byte string. Anything else is treated
        /// as "not found" instead of being handed to `GetProcAddress`, which would read
        /// past the end of the slice looking for a terminator.
        fn symbol_ptr(&self, name: &'static [u8]) -> *mut c_void {
            let Ok(c_name) = std::ffi::CStr::from_bytes_with_nul(name) else {
                return std::ptr::null_mut();
            };
            // SAFETY: `handle` came from a successful `LoadLibraryW` and is not freed
            // until drop; `c_name` is a validated NUL-terminated string.
            unsafe { GetProcAddress(self.handle, c_name.as_ptr()) }
        }
    }

    impl Drop for Library {
        /// Frees the library handle; the result of `FreeLibrary` is ignored.
        fn drop(&mut self) {
            if !self.handle.is_null() {
                // SAFETY: `handle` came from a successful `LoadLibraryW` and is freed
                // exactly once, here.
                unsafe {
                    let _ = FreeLibrary(self.handle);
                }
            }
        }
    }
}
2175
/// Compile-time check that `T` has exactly the size of a raw pointer, so a resolved
/// symbol address can be reinterpreted as `T`.
fn assert_symbol_pointer_size<T>() {
    use std::mem::size_of;

    const {
        assert!(
            size_of::<T>() == size_of::<*mut c_void>(),
            "pdfium symbol pointer size mismatch"
        )
    };
}
2184
/// Renders a symbol name for error messages: drops one trailing NUL byte, if present,
/// and converts the rest lossily to UTF-8.
fn symbol_name(name: &'static [u8]) -> String {
    let printable = match name.split_last() {
        Some((&0, head)) => head,
        _ => name,
    };
    String::from_utf8_lossy(printable).into_owned()
}
2189
2190#[cfg(test)]
2191mod tests {
2192 use super::*;
2193
2194 #[test]
2195 fn invalid_pdf_fails_before_library_load() {
2196 let err = PdfiumBackend::default()
2197 .page_count(b"not a pdf")
2198 .unwrap_err();
2199 assert_eq!(err.code, ErrorCode::InvalidPdf);
2200 }
2201
2202 #[test]
2203 fn text_run_breaks_on_pdfium_control_characters() {
2204 assert!(should_break_text_run('\0'));
2205 assert!(should_break_text_run('\n'));
2206 assert!(should_break_text_run('\u{0002}'));
2207 assert!(!should_break_text_run('-'));
2208 assert!(!should_break_text_run('A'));
2209 }
2210
/// With no library path configured, PDF-looking input fails with an internal
/// error whose message names the environment variable and the setup guidance.
#[test]
fn missing_library_path_is_stable_error_for_pdf_input() {
    // The scenario only exists when the environment variable is unset.
    if env::var_os(PDFIUM_LIBRARY_PATH_ENV).is_some() {
        return;
    }

    let err = PdfiumBackend::default()
        .page_count(b"%PDF-1.7\n")
        .unwrap_err();
    assert_eq!(err.code, ErrorCode::InternalError);

    let required_fragments = [
        PDFIUM_LIBRARY_PATH_ENV,
        "ethos doctor",
        "ethos doctor --require-pdfium",
        "docs/pdfium-manual-setup.md",
    ];
    for fragment in required_fragments {
        assert!(
            err.message.contains(fragment),
            "error message is missing {fragment:?}"
        );
    }
}
2224
/// Requesting page 0 is rejected as an out-of-range page selection.
#[test]
fn render_crop_raw_rejects_zero_page_before_library_load() {
    let bbox = QRect::new(0, 0, 100, 100).unwrap();
    let backend = PdfiumBackend::default();

    let err = backend
        .render_crop_raw(b"%PDF-1.7\n", 0, bbox)
        .unwrap_err();

    assert_eq!(err.code, ErrorCode::PageLimitExceeded);
    assert_eq!(err.message, "page selection out of document range");
}
2233
/// Pins the pixel window `crop_window` derives from quantized rectangles.
#[test]
fn crop_window_uses_outward_quantized_pixel_bounds() {
    // A rectangle inside a 300x144 pixel page.
    let interior = QRect::new(7392, 5482, 19378, 7226).unwrap();
    assert_eq!(
        crop_window(interior, 300, 144).unwrap(),
        (73, 54, 121, 19)
    );

    // A rectangle extending past every edge yields the full 300x144 window.
    let oversized = QRect::new(-50, -50, 30100, 14500).unwrap();
    assert_eq!(
        crop_window(oversized, 300, 144).unwrap(),
        (0, 0, 300, 144)
    );

    // A rectangle that covers no pixel of a 1x1 page is an error.
    let outside = QRect::new(100, 100, 101, 101).unwrap();
    let err = crop_window(outside, 1, 1).unwrap_err();
    assert_eq!(err.code, ErrorCode::InternalError);
    assert_eq!(err.message, "crop bbox has no positive pixel extent");
}
2249
/// Renders the same crop twice and checks that the output is identical and
/// has the expected shape.
///
/// Runs only when the library-path environment variable points at a file.
#[test]
fn render_crop_raw_is_deterministic_when_pdfium_is_configured() {
    let configured_library = env::var_os(PDFIUM_LIBRARY_PATH_ENV)
        .map(PathBuf::from)
        .filter(|candidate| candidate.is_file());
    if configured_library.is_none() {
        return;
    }

    let mut fixture = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    fixture.push("../../fixtures/synthetic/simple-text/document.pdf");
    let pdf_bytes = std::fs::read(fixture).unwrap();
    let bbox = QRect::new(7392, 5482, 19378, 7226).unwrap();
    let backend = PdfiumBackend::default();

    let first = backend.render_crop_raw(&pdf_bytes, 1, bbox).unwrap();
    let second = backend.render_crop_raw(&pdf_bytes, 1, bbox).unwrap();

    // Determinism: both renders must compare equal.
    assert_eq!(first, second);

    // Shape of the crop.
    assert_eq!(first.page_index, 1);
    assert_eq!(first.bbox, bbox);
    assert_eq!(first.width_px, 121);
    assert_eq!(first.height_px, 19);
    assert_eq!(first.stride, first.width_px * 4);
    assert_eq!(first.pixel_format, "bgra_8u");

    // Buffer length and digest must agree with the reported metadata.
    let expected_len = usize::try_from(first.stride * first.height_px).unwrap();
    assert_eq!(first.bytes.len(), expected_len);
    let expected_digest = ethos_core::c14n::sha256_hex_bytes(&first.bytes);
    assert_eq!(first.sha256, expected_digest);

    // At least one 4-byte pixel must differ from all-255.
    let every_pixel_is_white = first
        .bytes
        .chunks_exact(4)
        .all(|pixel| pixel == [255, 255, 255, 255]);
    assert!(!every_pixel_is_white);
}
2288
/// A configured library path that is not a file produces the setup guidance
/// without echoing the host path back in the message.
#[test]
fn invalid_configured_library_path_does_not_leak_host_path() {
    // The file name embeds a newline and is never created.
    let path = env::temp_dir().join("ethos-missing-libpdfium\nwith-control.dylib");

    let err = PdfiumBackend::from_library_path(&path)
        .page_count(b"%PDF-1.7\n")
        .unwrap_err();
    assert_eq!(err.code, ErrorCode::InternalError);

    let required_fragments = [
        "pdfium library path does not point to a file",
        "ethos doctor",
        "ethos doctor --require-pdfium",
        "docs/pdfium-manual-setup.md",
    ];
    for fragment in required_fragments {
        assert!(
            err.message.contains(fragment),
            "error message is missing {fragment:?}"
        );
    }

    let host_path = path.to_string_lossy();
    assert!(!err.message.contains(&*host_path));
}
2303
/// The manifest of an explicitly configured backend carries the supplied
/// version and the SHA-256 of the bytes at the library path.
#[test]
fn explicit_manifest_hashes_library_bytes() {
    const LIBRARY_BYTES: &[u8] = b"pdfium bytes";

    let path = env::temp_dir().join("ethos-test-libpdfium-hash.bin");
    std::fs::write(&path, LIBRARY_BYTES).unwrap();

    let backend = PdfiumBackend::from_library_path(&path).with_version("test-version");
    let manifest = backend.manifest();

    assert_eq!(manifest.id, "pdfium");
    assert_eq!(manifest.phase, 1);
    assert_eq!(manifest.version, "test-version");
    let expected_digest = ethos_core::c14n::sha256_hex_bytes(LIBRARY_BYTES);
    assert_eq!(manifest.platform_sha256, expected_digest);

    // Best-effort cleanup of the scratch file.
    let _ = std::fs::remove_file(&path);
}
2319
/// Pins the phase 1 PDFium profile: version, distribution attestation, build
/// flags, and the per-platform archive and runtime-library hashes.
#[test]
fn phase1_pdfium_profile_is_pinned_and_v8_xfa_disabled() {
    /// Expected values for one platform entry of the pinned profile.
    struct PinnedArtifact {
        platform: &'static str,
        archive_name: &'static str,
        archive_sha256: &'static str,
        runtime_path: &'static str,
        runtime_sha256: &'static str,
    }

    let profile = pinned_pdfium_profile();

    // Identity and version pins.
    assert_eq!(profile.id, "pdfium");
    assert_eq!(profile.phase, 1);
    assert_eq!(profile.version, "chromium/7881");
    assert_eq!(profile.upstream_version, "PDFium 151.0.7881.0");
    assert_eq!(profile.v8, "disabled");
    assert_eq!(profile.xfa, "disabled");

    // Distribution source and attestation.
    assert_eq!(profile.distribution.source, "bblanchon/pdfium-binaries");
    assert_eq!(
        profile.distribution.attestation.sha256,
        "24dec7cd76acb81106a0c29b908cceceef8215b050f6ff6ffbf875465811ef60"
    );

    // Build flags.
    assert!(!profile.build_flags.pdf_enable_v8);
    assert!(!profile.build_flags.pdf_enable_xfa);
    assert!(profile.build_flags.pdf_is_standalone);

    let pinned_artifacts = [
        PinnedArtifact {
            platform: "macos-arm64",
            archive_name: "pdfium-mac-arm64.tgz",
            archive_sha256: "52e94ca5aa8847934330daf3f8150c190682c5ca93831468794f8b90d4392e40",
            runtime_path: "lib/libpdfium.dylib",
            runtime_sha256: "1bc45b15466b34cef96641ce25c77a876e70010c6b114f909dda2f5325fc5bd7",
        },
        PinnedArtifact {
            platform: "linux-x64",
            archive_name: "pdfium-linux-x64.tgz",
            archive_sha256: "1470e21b8b4a3b4ad7f85684e2da11d94f3b69a86d81dee11b9b6709d927ac1d",
            runtime_path: "lib/libpdfium.so",
            runtime_sha256: "f728930966f503652b92acc89b9374a2eeca00ce42e26dccd3e4b5c5161b2d64",
        },
        PinnedArtifact {
            platform: "windows-x64",
            archive_name: "pdfium-win-x64.tgz",
            archive_sha256: "73cc0de638ac2095e7445bf56a38200a5b7c7ca0e9f4ba144598f2457377ac08",
            runtime_path: "bin/pdfium.dll",
            runtime_sha256: "79d4676b656cfb1abcea88f9ade3b4b0826c5200382db5f4ec72a636c598c118",
        },
    ];

    for pinned in pinned_artifacts {
        assert_eq!(
            profile.platform_hashes[pinned.platform],
            pinned.archive_sha256
        );

        let artifact = &profile.platform_artifacts[pinned.platform];
        assert_eq!(artifact.name, pinned.archive_name);
        // The archive name must not carry the V8 or XFA markers.
        assert!(!artifact.name.contains("-v8-"));
        assert!(!artifact.name.contains("xfa"));
        assert_eq!(artifact.runtime_library_path, pinned.runtime_path);
        assert_eq!(artifact.runtime_library_sha256, pinned.runtime_sha256);
    }
}
2371
/// A backend configured with version `chromium/7869` is rejected with the
/// stable version-mismatch message.
#[test]
fn mismatched_pdfium_version_is_rejected_before_library_load() {
    // Skip when `current_platform_key` reports no key for this host.
    if current_platform_key().is_none() {
        return;
    }

    let path = env::temp_dir().join("ethos-test-libpdfium-version-mismatch.bin");
    std::fs::write(&path, b"not the pinned pdfium library").unwrap();

    let err = PdfiumBackend::from_library_path(&path)
        .with_version("chromium/7869")
        .page_count(b"%PDF-1.7\n")
        .unwrap_err();

    assert_eq!(err.code, ErrorCode::InternalError);
    assert_eq!(
        err.message,
        "pdfium version does not match pinned phase 1 profile"
    );

    // Best-effort cleanup of the scratch file.
    let _ = std::fs::remove_file(&path);
}
2388
/// The upstream spelling `PDFium 151.0.7881.0` does not trigger the
/// version-mismatch error; the failure reported is the library mismatch for
/// the placeholder file instead.
#[test]
fn pinned_upstream_pdfium_version_alias_is_accepted() {
    // Skip when `current_platform_key` reports no key for this host.
    if current_platform_key().is_none() {
        return;
    }

    let path = env::temp_dir().join("ethos-test-libpdfium-upstream-version.bin");
    std::fs::write(&path, b"not the pinned pdfium library").unwrap();

    let err = PdfiumBackend::from_library_path(&path)
        .with_version("PDFium 151.0.7881.0")
        .page_count(b"%PDF-1.7\n")
        .unwrap_err();

    assert_eq!(err.code, ErrorCode::InternalError);
    assert_eq!(
        err.message,
        "pdfium library does not match pinned phase 1 profile"
    );

    // Best-effort cleanup of the scratch file.
    let _ = std::fs::remove_file(&path);
}
2405
/// With the pinned version but placeholder artifact bytes, the backend
/// reports the stable artifact-mismatch message.
#[test]
fn mismatched_pdfium_artifact_is_rejected_with_stable_error() {
    // Skip when `current_platform_key` reports no key for this host.
    if current_platform_key().is_none() {
        return;
    }

    let scratch_dir = env::temp_dir();
    let library_path = scratch_dir.join("ethos-test-libpdfium-artifact-mismatch.bin");
    let artifact_path = scratch_dir.join("ethos-test-pdfium-artifact-mismatch.tgz");
    std::fs::write(&library_path, b"not the pinned pdfium library").unwrap();
    std::fs::write(&artifact_path, b"not the pinned pdfium artifact").unwrap();

    let err = PdfiumBackend::from_library_path(&library_path)
        .with_version("chromium/7881")
        .with_artifact_path(&artifact_path)
        .page_count(b"%PDF-1.7\n")
        .unwrap_err();

    assert_eq!(err.code, ErrorCode::InternalError);
    assert_eq!(
        err.message,
        "pdfium artifact does not match pinned phase 1 profile"
    );

    // Best-effort cleanup of both scratch files.
    for leftover in [library_path, artifact_path] {
        let _ = std::fs::remove_file(leftover);
    }
}
2427
/// With the pinned version but placeholder library bytes, the backend reports
/// the stable library-mismatch message.
#[test]
fn mismatched_pdfium_library_is_rejected_before_dynamic_load() {
    // Skip when `current_platform_key` reports no key for this host.
    if current_platform_key().is_none() {
        return;
    }

    let path = env::temp_dir().join("ethos-test-libpdfium-library-mismatch.bin");
    std::fs::write(&path, b"not the pinned pdfium library").unwrap();

    let err = PdfiumBackend::from_library_path(&path)
        .with_version("chromium/7881")
        .page_count(b"%PDF-1.7\n")
        .unwrap_err();

    assert_eq!(err.code, ErrorCode::InternalError);
    assert_eq!(
        err.message,
        "pdfium library does not match pinned phase 1 profile"
    );

    // Best-effort cleanup of the scratch file.
    let _ = std::fs::remove_file(&path);
}
2444
/// Pins the deterministic font id produced for a set of raw font names.
#[test]
fn deterministic_font_ids_strip_subset_prefixes() {
    // (raw font name, expected deterministic id)
    let expectations = [
        ("ABCDEF+MinionPro-Regular", "embedded:MinionPro-Regular"),
        ("Helvetica-Bold", "subst:liberation-sans-bold"),
        ("Helvetica", "subst:liberation-sans-regular"),
        ("Helvetica-Oblique", "subst:liberation-sans-italic"),
        ("Helvetica-BoldOblique", "subst:liberation-sans-bold-italic"),
        ("Courier", "subst:liberation-mono-regular"),
        ("Times-Roman", "subst:liberation-serif-regular"),
        ("Custom Font/Regular", "subst:liberation-sans-regular"),
    ];
    for (raw_name, expected_id) in expectations {
        assert_eq!(
            deterministic_font_id(raw_name).as_deref(),
            Some(expected_id),
            "unexpected font id for {raw_name:?}"
        );
    }

    // A whitespace-only name yields no id at all.
    assert_eq!(deterministic_font_id(" "), None);
}
2481
/// Subset-prefixed names with non-ASCII or unsafe remainders map to the hashed
/// embedded id, which must be ASCII; an unprefixed non-ASCII name maps to the
/// default substitution id.
#[test]
fn deterministic_font_ids_keep_embedded_ids_ascii_only() {
    // (raw font name, remainder passed to `hashed_embedded_font_id`)
    let hashed_cases = [
        ("ABCDEF+明朝", "明朝"),
        ("ABCDEF+Fixture+Font", "Fixture+Font"),
        ("ABCDEF+///", "///"),
    ];
    for (raw_name, remainder) in hashed_cases {
        let font_id = deterministic_font_id(raw_name).unwrap();
        assert_eq!(
            font_id,
            hashed_embedded_font_id(remainder),
            "unexpected font id for {raw_name:?}"
        );
        assert!(
            font_id.is_ascii(),
            "font id for {raw_name:?} is not ASCII"
        );
    }

    assert_eq!(
        deterministic_font_id("明朝").as_deref(),
        Some("subst:liberation-sans-regular")
    );
}
2501
/// Checks the header fields of the font substitution table, that every
/// mapping has a non-empty, unique source and a `subst:` font id, and that
/// there are exactly 14 mappings.
#[test]
fn font_substitution_table_is_well_formed() {
    use std::collections::HashSet;

    let table = font_substitution_table();

    // Header fields.
    assert_eq!(table.schema_version, "1.0.0");
    assert_eq!(table.table_id, "ethos-font-substitution-v1");
    assert_eq!(table.version, "1.0.0");
    assert_eq!(
        table.default_unresolved_font_id,
        "subst:liberation-sans-regular"
    );

    // Per-mapping invariants; `insert` returns false for a repeated source.
    let mut sources_seen: HashSet<&str> = HashSet::new();
    for mapping in &table.mappings {
        let source = mapping.source.as_str();
        assert!(!source.is_empty());
        assert!(mapping.font_id.starts_with("subst:"));
        let is_new = sources_seen.insert(source);
        assert!(
            is_new,
            "duplicate font substitution source {}",
            mapping.source
        );
    }

    assert_eq!(table.mappings.len(), 14);
}
2527
/// The repository-level deterministic profile must record the path of the
/// font substitution table and the SHA-256 of the bytes embedded in this
/// crate.
#[test]
fn profile_pins_font_substitution_table_bytes() {
    const FONT_SUBSTITUTION_TABLE_PATH: &str =
        "crates/ethos-pdf/assets/font-substitution-table.json";
    // Profile text is embedded at compile time, relative to the crate manifest.
    const PROFILE_JSON: &str = include_str!(concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../profiles/ethos-deterministic-v1.json"
    ));

    let profile: serde_json::Value = serde_json::from_str(PROFILE_JSON).unwrap();
    let pin = &profile["font_policy"]["substitution_table"];

    assert_eq!(pin["path"], FONT_SUBSTITUTION_TABLE_PATH);
    let embedded_digest =
        ethos_core::c14n::sha256_hex_bytes(FONT_SUBSTITUTION_TABLE_JSON.as_bytes());
    assert_eq!(pin["sha256"], embedded_digest);
}
2544}