Skip to main content

ethos_core/
traits.rs

1/*
2 * Copyright 2026 The Ethos maintainers
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17//! Crate boundaries (Milestone A skeleton): the backend trait (`EthosPdfBackend`) and the
18//! layout trait. Invariant 3: only `ethos-pdf` implements the backend; public schemas and
19//! APIs never expose PDFium types — everything crossing this boundary is already
20//! normalized and quantized (invariant 1).
21
22use serde::{Deserialize, Serialize};
23
24use crate::config::ParseConfig;
25use crate::error::EthosError;
26use crate::model::{Element, Page, Region, Span, Warning};
27
28/// Backend build identity — pinned into the deterministic profile (ADR-0002) and thereby
29/// into every document fingerprint.
  ///
  /// Returned by value from [`EthosPdfBackend::manifest`]. Every field is a `String` or an
  /// integer, so the type derives `Eq` as well as `PartialEq`, plus serde
  /// `Serialize`/`Deserialize`.
30#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
31pub struct BackendManifest {
32    /// Backend id (`"pdfium"`).
33    pub id: String,
34    /// Distribution phase (1 = pinned bblanchon binaries, 2 = project-maintained builds).
      // NOTE(review): stored as a bare `u8`; nothing in this type rejects values other
      // than 1 or 2 — confirm whether validation happens where the manifest is built.
35    pub phase: u8,
36    /// Backend version/release string.
37    pub version: String,
38    /// Per-platform artifact sha256 for the running platform.
      // NOTE(review): the text encoding of the digest (presumably lowercase hex) is not
      // stated here — confirm against the code that fills this field.
39    pub platform_sha256: String,
40}
41
42/// Extraction output: the normalized, quantized data that leaves the backend.
43/// No raw `f64` geometry, no backend-native types (invariants 1 + 3).
  ///
  /// Produced by [`EthosPdfBackend::extract`] and consumed by reference by
  /// [`LayoutEngine::layout`]. Only `PartialEq` is derived here (no `Eq`), alongside
  /// `Debug`, `Clone` and serde `Serialize`/`Deserialize`.
  // NOTE(review): the missing `Eq` is presumably dictated by one of the `crate::model`
  // element types — confirm before adding it.
44#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
45pub struct Extraction {
46    /// Pages in ascending original index (already filtered by page selection —
47    /// filtering happens at the backend boundary, PRD §16).
48    pub pages: Vec<Page>,
49    /// Spans in normalized content-stream order, quantized.
50    pub spans: Vec<Span>,
51    /// Raw non-text regions (pre-classification), stable coordinates.
52    pub regions: Vec<Region>,
53    /// Warnings emitted during extraction (numbered later per contract §5).
54    pub warnings: Vec<Warning>,
55}
56
57/// The sole PDF backend boundary. Implementations live in `ethos-pdf` only.
  ///
  /// Every method takes `&self`, and the document is handed over as raw bytes on each
  /// call; the trait signature exposes no per-document handle or associated types.
58pub trait EthosPdfBackend {
59    /// Build identity for the profile manifest.
      ///
      /// Returns an owned [`BackendManifest`].
60    fn manifest(&self) -> BackendManifest;
61
62    /// Page count after ingest validation (encryption/corruption checks happen here
63    /// and fail with stable codes).
      ///
      /// # Errors
      ///
      /// Returns an [`EthosError`] when ingest validation rejects `pdf_bytes`.
64    fn page_count(&self, pdf_bytes: &[u8]) -> Result<u32, EthosError>;
65
66    /// Extract pages/spans/regions under the given config. Page-range filtering is a
67    /// backend responsibility (`config.pages`); geometry arrives quantized.
      ///
      /// # Errors
      ///
      /// Returns an [`EthosError`] if extraction fails.
      // NOTE(review): this signature does not say whether `extract` repeats the ingest
      // validation performed by `page_count`, nor which error codes it can yield —
      // confirm against the `ethos-pdf` implementation.
68    fn extract(&self, pdf_bytes: &[u8], config: &ParseConfig) -> Result<Extraction, EthosError>;
69}
70
71/// Layout output: the element graph in reading order (Milestone B fills this in).
  ///
  /// This is the success value of [`LayoutEngine::layout`]. It derives `Debug`, `Clone`,
  /// `PartialEq` (no `Eq`) and serde `Serialize`/`Deserialize`.
72#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
73pub struct LayoutOutput {
74    /// Elements in reading order.
75    pub elements: Vec<Element>,
76    /// Layout-stage warnings.
      // NOTE(review): whether these are merged with `Extraction::warnings` downstream is
      // not visible in this file — confirm at the call site.
77    pub warnings: Vec<Warning>,
78}
79
80/// The layout boundary: consumes extraction (already quantized), produces the element
81/// graph. Implementations live in `ethos-layout` (Milestone B, WS-LAYOUT).
82pub trait LayoutEngine {
83    /// Compute reading order, blocks, headings, lists.
      ///
      /// `extraction` is taken by shared reference, so the caller keeps ownership of the
      /// backend output; the result is a freshly owned [`LayoutOutput`].
      ///
      /// # Errors
      ///
      /// Returns an [`EthosError`] if layout fails.
      // NOTE(review): the concrete failure conditions are defined by the implementation
      // in `ethos-layout`, which is not visible here — confirm there.
84    fn layout(&self, extraction: &Extraction) -> Result<LayoutOutput, EthosError>;
85}