1use serde::{Deserialize, Serialize};
2use thiserror::Error;
3
/// Schema version string that [`DiffDocument::empty`] writes into
/// `DiffDocument::schema_version`.
pub const DIFF_SCHEMA_VERSION: &str = "0.1.0";
5
/// Error type for the PDF diff data model.
///
/// Each variant carries a `String` detail that is interpolated after the fixed
/// prefix declared in its `#[error(...)]` attribute.
#[derive(Debug, Error, Clone, PartialEq, Eq)]
pub enum PdfDiffError {
    /// The input exceeded a configured resource limit. Produced by
    /// [`ResourceLimits::check_file_size`].
    #[error("input exceeds configured resource limit: {0}")]
    ResourceLimitExceeded(String),
    /// The input is not a supported PDF.
    #[error("input is not a supported PDF: {0}")]
    UnsupportedPdf(String),
    /// The input was invalid.
    #[error("invalid input: {0}")]
    InvalidInput(String),
    /// An internal invariant did not hold.
    #[error("internal invariant failed: {0}")]
    InternalInvariant(String),
}
17
/// Identifier consisting of an object number and a generation number.
///
/// Derives `Ord` and `Hash`, so it can be used as a key in ordered or hashed
/// collections. The derived ordering compares `number` first, then
/// `generation`.
// NOTE(review): presumably a PDF indirect-object reference — confirm against the parser.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub struct ObjectId {
    /// Object number.
    pub number: u32,
    /// Generation number.
    pub generation: u16,
}
23
/// A pair of byte offsets.
// NOTE(review): whether `end` is inclusive or exclusive is not established by
// this type, and `ByteRange::new` does not check `start <= end` — confirm
// the convention with producers.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct ByteRange {
    /// Offset at which the range starts.
    pub start: usize,
    /// Offset at which the range ends.
    pub end: usize,
}
29
30impl ByteRange {
31 #[must_use]
32 pub const fn new(start: usize, end: usize) -> Self {
33 Self { start, end }
34 }
35}
36
/// Tags one side of a comparison: the old file or the new file.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum FileRole {
    /// The old file.
    Old,
    /// The new file.
    New,
}
42
/// Optional source-location details attached to a piece of evidence.
///
/// Every field may be `None`; [`Provenance::unknown`] builds the all-`None`
/// value.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Provenance {
    /// Which file (old or new) the item came from, if known.
    pub file_role: Option<FileRole>,
    /// Identifier of the originating object, if known.
    pub object_id: Option<ObjectId>,
    /// Index of the page, if known.
    // NOTE(review): zero- vs one-based indexing is not established here — confirm.
    pub page_index: Option<usize>,
    /// Identifier of the stream object, if known.
    pub stream_object_id: Option<ObjectId>,
    /// Index of the content operation, if known.
    // NOTE(review): presumably an index into the page's content-stream operators — confirm.
    pub content_op_index: Option<usize>,
    /// Byte offsets of the item, if known.
    pub byte_range: Option<ByteRange>,
}
52
53impl Provenance {
54 #[must_use]
55 pub const fn unknown() -> Self {
56 Self {
57 file_role: None,
58 object_id: None,
59 page_index: None,
60 stream_object_id: None,
61 content_op_index: None,
62 byte_range: None,
63 }
64 }
65}
66
/// A 2-D point with `f32` coordinates.
///
/// `Eq` is not derived because the fields are floating point.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct Point {
    /// Horizontal coordinate.
    pub x: f32,
    /// Vertical coordinate.
    pub y: f32,
}
72
/// A rectangle described by two corner coordinates, `(x0, y0)` and `(x1, y1)`.
///
/// No ordering between the corners is enforced by this type, so
/// [`Rect::width`] (`x1 - x0`) and [`Rect::height`] (`y1 - y0`) can be
/// negative.
// NOTE(review): coordinate space and units are not established here — confirm with producers.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct Rect {
    /// X coordinate of the first corner.
    pub x0: f32,
    /// Y coordinate of the first corner.
    pub y0: f32,
    /// X coordinate of the second corner.
    pub x1: f32,
    /// Y coordinate of the second corner.
    pub y1: f32,
}
80
81impl Rect {
82 #[must_use]
83 pub fn width(self) -> f32 {
84 self.x1 - self.x0
85 }
86
87 #[must_use]
88 pub fn height(self) -> f32 {
89 self.y1 - self.y0
90 }
91}
92
/// Six `f32` coefficients named `a` through `f`.
///
/// [`Matrix::IDENTITY`] sets `a` and `d` to `1.0` and the rest to `0.0`.
// NOTE(review): presumably the PDF `[a b c d e f]` affine transformation
// matrix (with `e`/`f` as translation) — confirm against the consumer.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct Matrix {
    /// Coefficient `a`.
    pub a: f32,
    /// Coefficient `b`.
    pub b: f32,
    /// Coefficient `c`.
    pub c: f32,
    /// Coefficient `d`.
    pub d: f32,
    /// Coefficient `e`.
    pub e: f32,
    /// Coefficient `f`.
    pub f: f32,
}
102
103impl Matrix {
104 pub const IDENTITY: Self = Self {
105 a: 1.0,
106 b: 0.0,
107 c: 0.0,
108 d: 1.0,
109 e: 0.0,
110 f: 0.0,
111 };
112}
113
/// A straight segment between two [`Point`]s.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct LineSegment {
    /// First endpoint.
    pub start: Point,
    /// Second endpoint.
    pub end: Point,
}
119
/// Severity level attached to a [`Diagnostic`].
///
/// No ordering traits are derived, so levels can be compared for equality
/// only.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum DiagnosticSeverity {
    /// Informational level; used by [`Diagnostic::info`].
    Info,
    /// Warning level; used by [`Diagnostic::warning`].
    Warning,
    /// Error level; used by [`Diagnostic::error`].
    Error,
}
126
/// A diagnostic message with a severity, a code, and optional location.
///
/// The constructors ([`Diagnostic::new`] and the per-severity shortcuts) leave
/// `object` and `page_index` as `None`; set them through the public fields.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Diagnostic {
    /// Severity level of the diagnostic.
    pub severity: DiagnosticSeverity,
    /// Code identifying the kind of diagnostic.
    // NOTE(review): the set of valid codes is not defined in this file — confirm with emitters.
    pub code: String,
    /// Message text.
    pub message: String,
    /// Object the diagnostic refers to, if any.
    pub object: Option<ObjectId>,
    /// Page the diagnostic refers to, if any.
    pub page_index: Option<usize>,
}
135
136impl Diagnostic {
137 #[must_use]
138 pub fn new(
139 severity: DiagnosticSeverity,
140 code: impl Into<String>,
141 message: impl Into<String>,
142 ) -> Self {
143 Self {
144 severity,
145 code: code.into(),
146 message: message.into(),
147 object: None,
148 page_index: None,
149 }
150 }
151
152 #[must_use]
153 pub fn info(code: impl Into<String>, message: impl Into<String>) -> Self {
154 Self::new(DiagnosticSeverity::Info, code, message)
155 }
156
157 #[must_use]
158 pub fn warning(code: impl Into<String>, message: impl Into<String>) -> Self {
159 Self::new(DiagnosticSeverity::Warning, code, message)
160 }
161
162 #[must_use]
163 pub fn error(code: impl Into<String>, message: impl Into<String>) -> Self {
164 Self::new(DiagnosticSeverity::Error, code, message)
165 }
166}
167
/// Upper bounds applied while processing input.
///
/// The values quoted below are those set by the `Default` implementation.
// NOTE(review): of these limits, only `max_file_bytes` has a visible check
// (`check_file_size`); where the others are enforced is not shown here.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct ResourceLimits {
    /// Maximum file size in bytes (default: `100 * 1024 * 1024`).
    pub max_file_bytes: usize,
    /// Maximum number of objects (default: `250_000`).
    pub max_objects: usize,
    /// Maximum indirection depth (default: `64`).
    pub max_indirect_depth: usize,
    /// Maximum stream size in bytes (default: `50 * 1024 * 1024`).
    pub max_stream_bytes: usize,
    /// Maximum decoded stream size in bytes (default: `200 * 1024 * 1024`).
    pub max_decoded_stream_bytes: usize,
    /// Maximum number of content operations per page (default: `1_000_000`).
    pub max_content_ops_per_page: usize,
    /// Maximum number of pages (default: `10_000`).
    pub max_pages: usize,
}
178
179impl Default for ResourceLimits {
180 fn default() -> Self {
181 Self {
182 max_file_bytes: 100 * 1024 * 1024,
183 max_objects: 250_000,
184 max_indirect_depth: 64,
185 max_stream_bytes: 50 * 1024 * 1024,
186 max_decoded_stream_bytes: 200 * 1024 * 1024,
187 max_content_ops_per_page: 1_000_000,
188 max_pages: 10_000,
189 }
190 }
191}
192
193impl ResourceLimits {
194 pub fn check_file_size(self, byte_len: usize) -> Result<(), PdfDiffError> {
195 if byte_len > self.max_file_bytes {
196 return Err(PdfDiffError::ResourceLimitExceeded(format!(
197 "file has {byte_len} bytes, limit is {}",
198 self.max_file_bytes
199 )));
200 }
201
202 Ok(())
203 }
204}
205
/// Parsing configuration; currently holds only the resource limits.
///
/// `Default` is derived, so `ParseConfig::default()` uses
/// `ResourceLimits::default()`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct ParseConfig {
    /// Resource limits to apply.
    pub limits: ResourceLimits,
}
210
211#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
212pub enum ChangeKind {
213 Inserted,
214 Deleted,
215 Modified,
216 Moved,
217 LayoutChanged,
218 StyleChanged,
219 MetadataChanged,
220 AnnotationChanged,
221 FormFieldChanged,
222 ObjectChanged,
223 Unknown,
224}
225
/// Severity assigned to a change.
///
/// No ordering traits are derived, so severities can be compared for equality
/// only.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum ChangeSeverity {
    /// Critical severity.
    Critical,
    /// Major severity.
    Major,
    /// Minor severity.
    Minor,
    /// Informational severity.
    Info,
}
233
/// Per-category change counters.
///
/// `Default` is derived, so every counter starts at zero.
// NOTE(review): only five of the `ChangeKind` variants have a counter here;
// how the remaining kinds are tallied is not visible in this file.
#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct DiffSummary {
    /// Count of inserted items.
    pub inserted: usize,
    /// Count of deleted items.
    pub deleted: usize,
    /// Count of modified items.
    pub modified: usize,
    /// Count of moved items.
    pub moved: usize,
    /// Count of layout changes.
    pub layout_changed: usize,
}
242
/// Evidence describing one node on one side of a change.
///
/// `Eq` is not derived because `bbox` contains floating-point values.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SemanticNodeEvidence {
    /// Identifier of the node.
    pub node_id: String,
    /// Semantic role of the node, if any. Omitted from serialized output when
    /// `None`; defaults to `None` when absent on input.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub semantic_role: Option<String>,
    /// Page of the node.
    // NOTE(review): named `page` rather than `page_index` as elsewhere;
    // zero- vs one-based is not established here — confirm.
    pub page: usize,
    /// Bounding box of the node, if known.
    pub bbox: Option<Rect>,
    /// Text of the node, if any.
    pub text: Option<String>,
    /// Provenance records for the node.
    pub source: Vec<Provenance>,
}
253
/// Layout differences between the old and new versions of a node.
///
/// The four `delta_*` fields are omitted from serialized output when `None`
/// and default to `None` when absent on input.
// NOTE(review): presumably each delta is new minus old — the computation is
// not in this file; confirm with the producer.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct LayoutDiff {
    /// Bounding box on the old side, if known.
    pub old_bbox: Option<Rect>,
    /// Bounding box on the new side, if known.
    pub new_bbox: Option<Rect>,
    /// Change in horizontal position, if computed.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub delta_x: Option<f32>,
    /// Change in vertical position, if computed.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub delta_y: Option<f32>,
    /// Change in width, if computed.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub delta_width: Option<f32>,
    /// Change in height, if computed.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub delta_height: Option<f32>,
    /// Whether the page changed.
    pub page_changed: bool,
    /// Whether the reading order changed.
    pub reading_order_changed: bool,
}
269
270#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
271pub enum TextHunkKind {
272 Equal,
273 Inserted,
274 Deleted,
275 Replaced,
276}
277
/// A pair of text offsets.
// NOTE(review): the unit (bytes, characters, or tokens) and whether `end` is
// exclusive are not established here; `TextRange::new` does not check
// `start <= end` — confirm with producers.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct TextRange {
    /// Offset at which the range starts.
    pub start: usize,
    /// Offset at which the range ends.
    pub end: usize,
}
283
284impl TextRange {
285 #[must_use]
286 pub const fn new(start: usize, end: usize) -> Self {
287 Self { start, end }
288 }
289}
290
291#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
292pub enum TextHunkGranularity {
293 Token,
294 Character,
295}
296
/// One piece of a text diff.
///
/// `granularity`, `old_range`, and `new_range` are omitted from serialized
/// output when `None` and default to `None` when absent on input.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct TextHunk {
    /// Kind of the hunk.
    pub kind: TextHunkKind,
    /// Granularity of the hunk, if recorded.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub granularity: Option<TextHunkGranularity>,
    /// Range on the old side, if recorded.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub old_range: Option<TextRange>,
    /// Range on the new side, if recorded.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub new_range: Option<TextRange>,
    /// Text on the old side, if any.
    pub old_text: Option<String>,
    /// Text on the new side, if any.
    pub new_text: Option<String>,
}
309
/// A single change between the old and new documents.
///
/// `Eq` is not derived because the type contains floating-point values.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SemanticChange {
    /// Identifier of the change.
    pub id: String,
    /// Category of the change.
    pub kind: ChangeKind,
    /// Severity of the change.
    pub severity: ChangeSeverity,
    /// Evidence for the node on the old side, if any.
    pub old_node: Option<SemanticNodeEvidence>,
    /// Evidence for the node on the new side, if any.
    pub new_node: Option<SemanticNodeEvidence>,
    /// Text hunks for the change. Omitted from serialized output when empty;
    /// defaults to empty when absent on input.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub text_hunks: Vec<TextHunk>,
    /// Layout differences, if any. Omitted from serialized output when
    /// `None`; defaults to `None` when absent on input.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub layout_diff: Option<LayoutDiff>,
    /// Confidence score for the change.
    // NOTE(review): the valid range is not established here — presumably 0.0..=1.0; confirm.
    pub confidence: f32,
    /// Reason text for the change.
    pub reason: String,
}
324
/// Top-level diff result for a pair of documents.
///
/// [`DiffDocument::empty`] builds a value with no changes or diagnostics and
/// `schema_version` set to [`DIFF_SCHEMA_VERSION`].
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct DiffDocument {
    /// Schema version of this document.
    pub schema_version: String,
    /// Fingerprint of the old document.
    // NOTE(review): how fingerprints are computed is not visible in this file.
    pub old_fingerprint: String,
    /// Fingerprint of the new document.
    pub new_fingerprint: String,
    /// Per-category change counters.
    pub summary: DiffSummary,
    /// The individual changes.
    pub changes: Vec<SemanticChange>,
    /// Diagnostics attached to the diff.
    pub diagnostics: Vec<Diagnostic>,
}
334
/// Review-oriented report over a diff.
// NOTE(review): presumably derived from a `DiffDocument` (it repeats the
// fingerprints and carries a `source_schema_version`); the builder is not in
// this file — confirm.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct AiReviewReport {
    /// Schema version of this report.
    pub schema_version: String,
    /// Schema version of the source the report was built from.
    pub source_schema_version: String,
    /// Fingerprint of the old document.
    pub old_fingerprint: String,
    /// Fingerprint of the new document.
    pub new_fingerprint: String,
    /// Aggregate counters for the report.
    pub summary: AiReviewSummary,
    /// Question hints with answers and supporting change ids.
    pub question_hints: Vec<AiReviewQuestionHint>,
    /// Per-change review items.
    pub review_items: Vec<AiReviewItem>,
    /// Diagnostic counts grouped by code.
    pub diagnostic_summary: Vec<AiDiagnosticCount>,
}
346
/// Aggregate counters for an [`AiReviewReport`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AiReviewSummary {
    /// Total number of changes.
    pub total_changes: usize,
    /// Count of inserted items.
    pub inserted: usize,
    /// Count of deleted items.
    pub deleted: usize,
    /// Count of modified items.
    pub modified: usize,
    /// Count of moved items.
    pub moved: usize,
    /// Count of layout changes.
    pub layout_changed: usize,
    /// Number of diagnostics.
    pub diagnostic_count: usize,
    /// Number of low-confidence changes.
    // NOTE(review): the threshold for "low confidence" is not defined in this file.
    pub low_confidence_change_count: usize,
    /// Number of unsupported surfaces.
    pub unsupported_surface_count: usize,
}
359
/// A question with its answer and the changes that support that answer.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AiReviewQuestionHint {
    /// The question text.
    pub question: String,
    /// The answer to the question.
    pub answer: AiReviewAnswer,
    /// Identifiers of the changes that support the answer.
    // NOTE(review): presumably these match `SemanticChange::id` values — confirm.
    pub supporting_change_ids: Vec<String>,
    /// Rationale text for the answer.
    pub rationale: String,
}
367
/// Three-valued answer used by [`AiReviewQuestionHint`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum AiReviewAnswer {
    /// Affirmative answer.
    Yes,
    /// Negative answer.
    No,
    /// The answer is unknown.
    Unknown,
}
374
/// Review entry for a single change.
///
/// `Eq` is not derived because the type contains floating-point values.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct AiReviewItem {
    /// Identifier of the change this item describes.
    pub change_id: String,
    /// Category of the change.
    pub kind: ChangeKind,
    /// Severity of the change.
    pub severity: ChangeSeverity,
    /// Confidence score for the change.
    // NOTE(review): the valid range is not established here — presumably 0.0..=1.0; confirm.
    pub confidence: f32,
    /// Bucketed form of the confidence.
    // NOTE(review): the bucket thresholds are not defined in this file.
    pub confidence_bucket: AiConfidenceBucket,
    /// Tags attached to the item.
    pub tags: Vec<AiReviewTag>,
    /// Explanation text.
    pub explanation: String,
    /// Evidence bundle for the change.
    pub evidence: AiEvidenceBundle,
}
386
/// Coarse confidence level used by [`AiReviewItem`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum AiConfidenceBucket {
    /// High confidence.
    High,
    /// Medium confidence.
    Medium,
    /// Low confidence.
    Low,
}
393
/// Tag attached to an [`AiReviewItem`].
///
/// `PartialOrd` and `Ord` are derived, so tags sort in declaration order;
/// reordering the variants changes that ordering.
// NOTE(review): the rules that assign each tag are not in this file; the
// variant docs below restate the names only.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum AiReviewTag {
    /// Text changed.
    TextChanged,
    /// Content was inserted.
    ContentInserted,
    /// Content was deleted.
    ContentDeleted,
    /// Content was moved.
    ContentMoved,
    /// Only layout changed.
    LayoutOnly,
    /// Repeated page region.
    RepeatedPageRegion,
    /// Candidate payment-terms change.
    PaymentTermsCandidate,
    /// Candidate date or duration change.
    DateOrDurationCandidate,
    /// Candidate party-name change.
    PartyNameCandidate,
    /// A numeric value changed.
    NumericValueChanged,
    /// An annotation or link changed.
    AnnotationOrLinkChanged,
    /// A form field changed.
    FormFieldChanged,
    /// Metadata changed.
    MetadataChanged,
    /// A visual surface changed.
    VisualSurfaceChanged,
    /// The change involves an unsupported surface.
    UnsupportedSurface,
    /// The change has low confidence.
    LowConfidence,
}
413
/// Evidence for one change, gathering both sides in a single record.
///
/// `old_semantic_role`, `new_semantic_role`, and `layout_diff` are omitted
/// from serialized output when `None` and default to `None` when absent on
/// input.
// NOTE(review): unlike `SemanticChange::text_hunks`, `text_hunks` here has no
// `serde(default, skip_serializing_if = ...)` attribute, so it is always
// serialized and is required on input — confirm this asymmetry is intended.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct AiEvidenceBundle {
    /// Node identifier on the old side, if any.
    pub old_node_id: Option<String>,
    /// Node identifier on the new side, if any.
    pub new_node_id: Option<String>,
    /// Semantic role on the old side, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub old_semantic_role: Option<String>,
    /// Semantic role on the new side, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub new_semantic_role: Option<String>,
    /// Section hint, if any.
    pub section_hint: Option<String>,
    /// Page on the old side, if any.
    pub old_page: Option<usize>,
    /// Page on the new side, if any.
    pub new_page: Option<usize>,
    /// Bounding box on the old side, if known.
    pub old_bbox: Option<Rect>,
    /// Bounding box on the new side, if known.
    pub new_bbox: Option<Rect>,
    /// Text on the old side, if any.
    pub old_text: Option<String>,
    /// Text on the new side, if any.
    pub new_text: Option<String>,
    /// Text hunks for the change.
    pub text_hunks: Vec<TextHunk>,
    /// Layout differences, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub layout_diff: Option<LayoutDiff>,
    /// Provenance records for the evidence.
    pub provenance: Vec<Provenance>,
}
434
/// Number of diagnostics recorded for one diagnostic code.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AiDiagnosticCount {
    /// The diagnostic code.
    pub code: String,
    /// How many diagnostics carry that code.
    pub count: usize,
}
440
441impl DiffDocument {
442 #[must_use]
443 pub fn empty(old_fingerprint: impl Into<String>, new_fingerprint: impl Into<String>) -> Self {
444 Self {
445 schema_version: DIFF_SCHEMA_VERSION.to_owned(),
446 old_fingerprint: old_fingerprint.into(),
447 new_fingerprint: new_fingerprint.into(),
448 summary: DiffSummary::default(),
449 changes: Vec::new(),
450 diagnostics: Vec::new(),
451 }
452 }
453}