// spdfdiff_types/lib.rs

1use serde::{Deserialize, Serialize};
2use thiserror::Error;
3
4pub const DIFF_SCHEMA_VERSION: &str = "0.1.0";
5
6#[derive(Debug, Error, Clone, PartialEq, Eq)]
7pub enum PdfDiffError {
8    #[error("input exceeds configured resource limit: {0}")]
9    ResourceLimitExceeded(String),
10    #[error("input is not a supported PDF: {0}")]
11    UnsupportedPdf(String),
12    #[error("invalid input: {0}")]
13    InvalidInput(String),
14    #[error("internal invariant failed: {0}")]
15    InternalInvariant(String),
16}
17
18#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
19pub struct ObjectId {
20    pub number: u32,
21    pub generation: u16,
22}
23
24#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
25pub struct ByteRange {
26    pub start: usize,
27    pub end: usize,
28}
29
30impl ByteRange {
31    #[must_use]
32    pub const fn new(start: usize, end: usize) -> Self {
33        Self { start, end }
34    }
35}
36
37#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
38pub enum FileRole {
39    Old,
40    New,
41}
42
43#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
44pub struct Provenance {
45    pub file_role: Option<FileRole>,
46    pub object_id: Option<ObjectId>,
47    pub page_index: Option<usize>,
48    pub stream_object_id: Option<ObjectId>,
49    pub content_op_index: Option<usize>,
50    pub byte_range: Option<ByteRange>,
51}
52
53impl Provenance {
54    #[must_use]
55    pub const fn unknown() -> Self {
56        Self {
57            file_role: None,
58            object_id: None,
59            page_index: None,
60            stream_object_id: None,
61            content_op_index: None,
62            byte_range: None,
63        }
64    }
65}
66
67#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
68pub struct Point {
69    pub x: f32,
70    pub y: f32,
71}
72
73#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
74pub struct Rect {
75    pub x0: f32,
76    pub y0: f32,
77    pub x1: f32,
78    pub y1: f32,
79}
80
81impl Rect {
82    #[must_use]
83    pub fn width(self) -> f32 {
84        self.x1 - self.x0
85    }
86
87    #[must_use]
88    pub fn height(self) -> f32 {
89        self.y1 - self.y0
90    }
91}
92
93#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
94pub struct Matrix {
95    pub a: f32,
96    pub b: f32,
97    pub c: f32,
98    pub d: f32,
99    pub e: f32,
100    pub f: f32,
101}
102
103impl Matrix {
104    pub const IDENTITY: Self = Self {
105        a: 1.0,
106        b: 0.0,
107        c: 0.0,
108        d: 1.0,
109        e: 0.0,
110        f: 0.0,
111    };
112}
113
114#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
115pub struct LineSegment {
116    pub start: Point,
117    pub end: Point,
118}
119
120#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
121pub enum DiagnosticSeverity {
122    Info,
123    Warning,
124    Error,
125}
126
127#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
128pub struct Diagnostic {
129    pub severity: DiagnosticSeverity,
130    pub code: String,
131    pub message: String,
132    pub object: Option<ObjectId>,
133    pub page_index: Option<usize>,
134}
135
136impl Diagnostic {
137    #[must_use]
138    pub fn new(
139        severity: DiagnosticSeverity,
140        code: impl Into<String>,
141        message: impl Into<String>,
142    ) -> Self {
143        Self {
144            severity,
145            code: code.into(),
146            message: message.into(),
147            object: None,
148            page_index: None,
149        }
150    }
151
152    #[must_use]
153    pub fn info(code: impl Into<String>, message: impl Into<String>) -> Self {
154        Self::new(DiagnosticSeverity::Info, code, message)
155    }
156
157    #[must_use]
158    pub fn warning(code: impl Into<String>, message: impl Into<String>) -> Self {
159        Self::new(DiagnosticSeverity::Warning, code, message)
160    }
161
162    #[must_use]
163    pub fn error(code: impl Into<String>, message: impl Into<String>) -> Self {
164        Self::new(DiagnosticSeverity::Error, code, message)
165    }
166}
167
168#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
169pub struct ResourceLimits {
170    pub max_file_bytes: usize,
171    pub max_objects: usize,
172    pub max_indirect_depth: usize,
173    pub max_stream_bytes: usize,
174    pub max_decoded_stream_bytes: usize,
175    pub max_content_ops_per_page: usize,
176    pub max_pages: usize,
177}
178
179impl Default for ResourceLimits {
180    fn default() -> Self {
181        Self {
182            max_file_bytes: 100 * 1024 * 1024,
183            max_objects: 250_000,
184            max_indirect_depth: 64,
185            max_stream_bytes: 50 * 1024 * 1024,
186            max_decoded_stream_bytes: 200 * 1024 * 1024,
187            max_content_ops_per_page: 1_000_000,
188            max_pages: 10_000,
189        }
190    }
191}
192
193impl ResourceLimits {
194    pub fn check_file_size(self, byte_len: usize) -> Result<(), PdfDiffError> {
195        if byte_len > self.max_file_bytes {
196            return Err(PdfDiffError::ResourceLimitExceeded(format!(
197                "file has {byte_len} bytes, limit is {}",
198                self.max_file_bytes
199            )));
200        }
201
202        Ok(())
203    }
204}
205
206#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
207pub struct ParseConfig {
208    pub limits: ResourceLimits,
209}
210
211#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
212pub enum ChangeKind {
213    Inserted,
214    Deleted,
215    Modified,
216    Moved,
217    LayoutChanged,
218    StyleChanged,
219    MetadataChanged,
220    AnnotationChanged,
221    FormFieldChanged,
222    ObjectChanged,
223    Unknown,
224}
225
226#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
227pub enum ChangeSeverity {
228    Critical,
229    Major,
230    Minor,
231    Info,
232}
233
234#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
235pub struct DiffSummary {
236    pub inserted: usize,
237    pub deleted: usize,
238    pub modified: usize,
239    pub moved: usize,
240    pub layout_changed: usize,
241}
242
243#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
244pub struct SemanticNodeEvidence {
245    pub node_id: String,
246    #[serde(default, skip_serializing_if = "Option::is_none")]
247    pub semantic_role: Option<String>,
248    pub page: usize,
249    pub bbox: Option<Rect>,
250    pub text: Option<String>,
251    pub source: Vec<Provenance>,
252}
253
254#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
255pub struct LayoutDiff {
256    pub old_bbox: Option<Rect>,
257    pub new_bbox: Option<Rect>,
258    #[serde(default, skip_serializing_if = "Option::is_none")]
259    pub delta_x: Option<f32>,
260    #[serde(default, skip_serializing_if = "Option::is_none")]
261    pub delta_y: Option<f32>,
262    #[serde(default, skip_serializing_if = "Option::is_none")]
263    pub delta_width: Option<f32>,
264    #[serde(default, skip_serializing_if = "Option::is_none")]
265    pub delta_height: Option<f32>,
266    pub page_changed: bool,
267    pub reading_order_changed: bool,
268}
269
270#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
271pub enum TextHunkKind {
272    Equal,
273    Inserted,
274    Deleted,
275    Replaced,
276}
277
278#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
279pub struct TextRange {
280    pub start: usize,
281    pub end: usize,
282}
283
284impl TextRange {
285    #[must_use]
286    pub const fn new(start: usize, end: usize) -> Self {
287        Self { start, end }
288    }
289}
290
291#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
292pub enum TextHunkGranularity {
293    Token,
294    Character,
295}
296
297#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
298pub struct TextHunk {
299    pub kind: TextHunkKind,
300    #[serde(default, skip_serializing_if = "Option::is_none")]
301    pub granularity: Option<TextHunkGranularity>,
302    #[serde(default, skip_serializing_if = "Option::is_none")]
303    pub old_range: Option<TextRange>,
304    #[serde(default, skip_serializing_if = "Option::is_none")]
305    pub new_range: Option<TextRange>,
306    pub old_text: Option<String>,
307    pub new_text: Option<String>,
308}
309
310#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
311pub struct SemanticChange {
312    pub id: String,
313    pub kind: ChangeKind,
314    pub severity: ChangeSeverity,
315    pub old_node: Option<SemanticNodeEvidence>,
316    pub new_node: Option<SemanticNodeEvidence>,
317    #[serde(default, skip_serializing_if = "Vec::is_empty")]
318    pub text_hunks: Vec<TextHunk>,
319    #[serde(default, skip_serializing_if = "Option::is_none")]
320    pub layout_diff: Option<LayoutDiff>,
321    pub confidence: f32,
322    pub reason: String,
323}
324
325#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
326pub struct DiffDocument {
327    pub schema_version: String,
328    pub old_fingerprint: String,
329    pub new_fingerprint: String,
330    pub summary: DiffSummary,
331    pub changes: Vec<SemanticChange>,
332    pub diagnostics: Vec<Diagnostic>,
333}
334
335#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
336pub struct AiReviewReport {
337    pub schema_version: String,
338    pub source_schema_version: String,
339    pub old_fingerprint: String,
340    pub new_fingerprint: String,
341    pub summary: AiReviewSummary,
342    pub question_hints: Vec<AiReviewQuestionHint>,
343    pub review_items: Vec<AiReviewItem>,
344    pub diagnostic_summary: Vec<AiDiagnosticCount>,
345}
346
347#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
348pub struct AiReviewSummary {
349    pub total_changes: usize,
350    pub inserted: usize,
351    pub deleted: usize,
352    pub modified: usize,
353    pub moved: usize,
354    pub layout_changed: usize,
355    pub diagnostic_count: usize,
356    pub low_confidence_change_count: usize,
357    pub unsupported_surface_count: usize,
358}
359
360#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
361pub struct AiReviewQuestionHint {
362    pub question: String,
363    pub answer: AiReviewAnswer,
364    pub supporting_change_ids: Vec<String>,
365    pub rationale: String,
366}
367
368#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
369pub enum AiReviewAnswer {
370    Yes,
371    No,
372    Unknown,
373}
374
375#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
376pub struct AiReviewItem {
377    pub change_id: String,
378    pub kind: ChangeKind,
379    pub severity: ChangeSeverity,
380    pub confidence: f32,
381    pub confidence_bucket: AiConfidenceBucket,
382    pub tags: Vec<AiReviewTag>,
383    pub explanation: String,
384    pub evidence: AiEvidenceBundle,
385}
386
387#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
388pub enum AiConfidenceBucket {
389    High,
390    Medium,
391    Low,
392}
393
394#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
395pub enum AiReviewTag {
396    TextChanged,
397    ContentInserted,
398    ContentDeleted,
399    ContentMoved,
400    LayoutOnly,
401    RepeatedPageRegion,
402    PaymentTermsCandidate,
403    DateOrDurationCandidate,
404    PartyNameCandidate,
405    NumericValueChanged,
406    AnnotationOrLinkChanged,
407    FormFieldChanged,
408    MetadataChanged,
409    VisualSurfaceChanged,
410    UnsupportedSurface,
411    LowConfidence,
412}
413
414#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
415pub struct AiEvidenceBundle {
416    pub old_node_id: Option<String>,
417    pub new_node_id: Option<String>,
418    #[serde(default, skip_serializing_if = "Option::is_none")]
419    pub old_semantic_role: Option<String>,
420    #[serde(default, skip_serializing_if = "Option::is_none")]
421    pub new_semantic_role: Option<String>,
422    pub section_hint: Option<String>,
423    pub old_page: Option<usize>,
424    pub new_page: Option<usize>,
425    pub old_bbox: Option<Rect>,
426    pub new_bbox: Option<Rect>,
427    pub old_text: Option<String>,
428    pub new_text: Option<String>,
429    pub text_hunks: Vec<TextHunk>,
430    #[serde(default, skip_serializing_if = "Option::is_none")]
431    pub layout_diff: Option<LayoutDiff>,
432    pub provenance: Vec<Provenance>,
433}
434
435#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
436pub struct AiDiagnosticCount {
437    pub code: String,
438    pub count: usize,
439}
440
441impl DiffDocument {
442    #[must_use]
443    pub fn empty(old_fingerprint: impl Into<String>, new_fingerprint: impl Into<String>) -> Self {
444        Self {
445            schema_version: DIFF_SCHEMA_VERSION.to_owned(),
446            old_fingerprint: old_fingerprint.into(),
447            new_fingerprint: new_fingerprint.into(),
448            summary: DiffSummary::default(),
449            changes: Vec::new(),
450            diagnostics: Vec::new(),
451        }
452    }
453}