1use std::collections::{BTreeMap, BTreeSet};
2
3use spdfdiff_types::{
4 AiConfidenceBucket, AiDiagnosticCount, AiEvidenceBundle, AiReviewAnswer, AiReviewItem,
5 AiReviewQuestionHint, AiReviewReport, AiReviewSummary, AiReviewTag, ChangeKind, DiffDocument,
6 LayoutDiff, PdfDiffError, Rect, SemanticChange,
7};
8
9pub fn to_json(document: &DiffDocument) -> Result<String, PdfDiffError> {
10 serde_json::to_string_pretty(document)
11 .map_err(|error| PdfDiffError::InternalInvariant(error.to_string()))
12}
13
14pub fn to_ai_review_json(document: &DiffDocument) -> Result<String, PdfDiffError> {
15 serde_json::to_string_pretty(&build_ai_review_report(document))
16 .map_err(|error| PdfDiffError::InternalInvariant(error.to_string()))
17}
18
19#[must_use]
20pub fn build_ai_review_report(document: &DiffDocument) -> AiReviewReport {
21 let review_items = document
22 .changes
23 .iter()
24 .map(build_ai_review_item)
25 .collect::<Vec<_>>();
26 let unsupported_surface_count = document
27 .diagnostics
28 .iter()
29 .filter(|diagnostic| diagnostic.code.starts_with("UNSUPPORTED_"))
30 .count();
31 let low_confidence_change_count = review_items
32 .iter()
33 .filter(|item| item.confidence_bucket == AiConfidenceBucket::Low)
34 .count();
35
36 AiReviewReport {
37 schema_version: "0.1.0".into(),
38 source_schema_version: document.schema_version.clone(),
39 old_fingerprint: document.old_fingerprint.clone(),
40 new_fingerprint: document.new_fingerprint.clone(),
41 summary: AiReviewSummary {
42 total_changes: document.changes.len(),
43 inserted: document.summary.inserted,
44 deleted: document.summary.deleted,
45 modified: document.summary.modified,
46 moved: document.summary.moved,
47 layout_changed: document.summary.layout_changed,
48 diagnostic_count: document.diagnostics.len(),
49 low_confidence_change_count,
50 unsupported_surface_count,
51 },
52 question_hints: build_question_hints(&review_items, unsupported_surface_count),
53 review_items,
54 diagnostic_summary: diagnostic_summary(document),
55 }
56}
57
58#[must_use]
59pub fn to_html(document: &DiffDocument) -> String {
60 let mut output = String::from(
61 "<!doctype html><html><head><meta charset=\"utf-8\"><style>\
62body{font-family:system-ui,-apple-system,Segoe UI,sans-serif;margin:24px;color:#1f2933;background:#fff}\
63table{border-collapse:collapse;width:100%;margin:12px 0}th,td{border:1px solid #d9e2ec;padding:8px;vertical-align:top;text-align:left}\
64th{background:#f0f4f8}.change{margin:16px 0;border:1px solid #d9e2ec}.change h3{margin:0;padding:10px;background:#f8fafc}\
65.meta{color:#52606d;font-size:0.9rem}.hunks code{display:inline-block;margin:2px 4px 2px 0;padding:2px 4px;background:#f0f4f8}\
66.overlay-grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(280px,1fr));gap:12px;margin:12px 0}.overlay{border:1px solid #d9e2ec;padding:8px;background:#fbfdff}.overlay svg{width:100%;height:auto;max-height:240px;background:#fff}.overlay rect{fill:rgba(37,99,235,.12);stroke:#2563eb;stroke-width:1.5}.overlay text{font-size:10px;fill:#102a43}\
67.diagnostic{margin:4px 0}</style><title>Semantic PDF Diff</title></head><body>",
68 );
69 output.push_str("<h1>Semantic PDF Diff</h1>");
70 output.push_str("<table><thead><tr><th>Metric</th><th>Count</th></tr></thead><tbody>");
71 for (label, count) in [
72 ("Inserted", document.summary.inserted),
73 ("Deleted", document.summary.deleted),
74 ("Modified", document.summary.modified),
75 ("Moved", document.summary.moved),
76 ("Layout changed", document.summary.layout_changed),
77 ] {
78 output.push_str(&format!("<tr><td>{label}</td><td>{count}</td></tr>",));
79 }
80 output.push_str("</tbody></table>");
81 push_html_overlays(&mut output, document);
82
83 output.push_str("<h2>Changes</h2>");
84 if document.changes.is_empty() {
85 output.push_str("<p>No semantic changes detected.</p>");
86 } else {
87 for change in &document.changes {
88 output.push_str(&format!(
89 "<section class=\"change\"><h3>{} {:?} {:?}</h3><p class=\"meta\">confidence {:.3}: {}</p>",
90 escape_html(&change.id),
91 change.kind,
92 change.severity,
93 change.confidence,
94 escape_html(&change.reason)
95 ));
96 output.push_str("<table><thead><tr><th>Old</th><th>New</th></tr></thead><tbody><tr>");
97 output.push_str("<td>");
98 push_html_evidence(&mut output, change.old_node.as_ref());
99 output.push_str("</td><td>");
100 push_html_evidence(&mut output, change.new_node.as_ref());
101 output.push_str("</td></tr></tbody></table>");
102 if !change.text_hunks.is_empty() {
103 output.push_str("<div class=\"hunks\"><strong>Text hunks</strong><br>");
104 for hunk in &change.text_hunks {
105 output.push_str(&format!(
106 "<code>{}: {} -> {}</code>",
107 escape_html(&hunk_label(hunk)),
108 escape_html(hunk.old_text.as_deref().unwrap_or("")),
109 escape_html(hunk.new_text.as_deref().unwrap_or(""))
110 ));
111 }
112 output.push_str("</div>");
113 }
114 if let Some(layout_diff) = &change.layout_diff {
115 output.push_str(&format!(
116 "<div class=\"meta\"><strong>Layout diff</strong>: {}</div>",
117 escape_html(&layout_diff_summary(layout_diff))
118 ));
119 }
120 output.push_str("</section>");
121 }
122 }
123
124 output.push_str("<h2>Diagnostics</h2>");
125 if document.diagnostics.is_empty() {
126 output.push_str("<p>No diagnostics.</p>");
127 } else {
128 for diagnostic in &document.diagnostics {
129 output.push_str(&format!(
130 "<div class=\"diagnostic\"><code>{:?}</code> <code>{}</code> {}</div>",
131 diagnostic.severity,
132 escape_html(&diagnostic.code),
133 escape_html(&diagnostic.message)
134 ));
135 }
136 }
137 output.push_str("</body></html>");
138 output
139}
140
/// One evidence rectangle queued for rendering in the HTML overlay section.
#[derive(Debug, Clone)]
struct OverlayRect {
    /// Identifier of the change the rectangle belongs to; it is the primary
    /// sort key within a page group and is emitted as the SVG `data-change`
    /// attribute and label.
    change_id: String,
    /// Identifier of the evidence node that supplied the bounding box; it is
    /// the secondary sort key and is emitted as `data-node`.
    node_id: String,
    /// Bounding box copied from the evidence node.
    bbox: Rect,
}
147
148fn push_html_overlays(output: &mut String, document: &DiffDocument) {
149 let mut overlays: BTreeMap<(&'static str, usize), Vec<OverlayRect>> = BTreeMap::new();
150 for change in &document.changes {
151 push_overlay_rect(&mut overlays, "Old", change, change.old_node.as_ref());
152 push_overlay_rect(&mut overlays, "New", change, change.new_node.as_ref());
153 }
154 if overlays.is_empty() {
155 return;
156 }
157
158 output.push_str("<h2>Page Evidence Overlays</h2>");
159 output.push_str(
160 "<p class=\"meta\">Inline SVG rectangles use PDF user-space coordinates from extracted node bounding boxes.</p>",
161 );
162 output.push_str("<div class=\"overlay-grid\">");
163 for ((role, page), mut rects) in overlays {
164 rects.sort_by(|left, right| {
165 left.change_id
166 .cmp(&right.change_id)
167 .then_with(|| left.node_id.cmp(&right.node_id))
168 });
169 output.push_str(&format!(
170 "<section class=\"overlay\"><h3>{} page {}</h3>",
171 role,
172 page + 1
173 ));
174 push_svg_overlay(output, &rects);
175 output.push_str("</section>");
176 }
177 output.push_str("</div>");
178}
179
180fn push_overlay_rect(
181 overlays: &mut BTreeMap<(&'static str, usize), Vec<OverlayRect>>,
182 role: &'static str,
183 change: &SemanticChange,
184 evidence: Option<&spdfdiff_types::SemanticNodeEvidence>,
185) {
186 let Some(evidence) = evidence else {
187 return;
188 };
189 let Some(bbox) = evidence.bbox else {
190 return;
191 };
192 if !is_reportable_rect(bbox) {
193 return;
194 }
195 overlays
196 .entry((role, evidence.page))
197 .or_default()
198 .push(OverlayRect {
199 change_id: change.id.clone(),
200 node_id: evidence.node_id.clone(),
201 bbox,
202 });
203}
204
205fn push_svg_overlay(output: &mut String, rects: &[OverlayRect]) {
206 let Some((x0, y0, x1, y1)) = overlay_bounds(rects) else {
207 return;
208 };
209 let margin = 8.0;
210 let view_x = x0 - margin;
211 let view_y = y0 - margin;
212 let view_width = (x1 - x0 + margin * 2.0).max(1.0);
213 let view_height = (y1 - y0 + margin * 2.0).max(1.0);
214 output.push_str(&format!(
215 "<svg xmlns=\"http://www.w3.org/2000/svg\" viewBox=\"{view_x:.2} {view_y:.2} {view_width:.2} {view_height:.2}\" role=\"img\" aria-label=\"PDF user-space evidence overlay\">"
216 ));
217 for rect in rects {
218 let (x, y, width, height) = normalized_rect(rect.bbox);
219 output.push_str(&format!(
220 "<rect x=\"{x:.2}\" y=\"{y:.2}\" width=\"{width:.2}\" height=\"{height:.2}\" data-change=\"{}\" data-node=\"{}\"><title>{} {}</title></rect>",
221 escape_html(&rect.change_id),
222 escape_html(&rect.node_id),
223 escape_html(&rect.change_id),
224 escape_html(&rect.node_id)
225 ));
226 output.push_str(&format!(
227 "<text x=\"{:.2}\" y=\"{:.2}\">{}</text>",
228 x,
229 y - 2.0,
230 escape_html(&rect.change_id)
231 ));
232 }
233 output.push_str("</svg>");
234}
235
236fn overlay_bounds(rects: &[OverlayRect]) -> Option<(f32, f32, f32, f32)> {
237 let mut iter = rects.iter().map(|rect| normalized_rect(rect.bbox));
238 let (mut x0, mut y0, width, height) = iter.next()?;
239 let mut x1 = x0 + width;
240 let mut y1 = y0 + height;
241 for (x, y, width, height) in iter {
242 x0 = x0.min(x);
243 y0 = y0.min(y);
244 x1 = x1.max(x + width);
245 y1 = y1.max(y + height);
246 }
247 Some((x0, y0, x1, y1))
248}
249
250fn normalized_rect(rect: Rect) -> (f32, f32, f32, f32) {
251 let x0 = rect.x0.min(rect.x1);
252 let y0 = rect.y0.min(rect.y1);
253 let x1 = rect.x0.max(rect.x1);
254 let y1 = rect.y0.max(rect.y1);
255 (x0, y0, (x1 - x0).max(0.1), (y1 - y0).max(0.1))
256}
257
258fn is_reportable_rect(rect: Rect) -> bool {
259 rect.x0.is_finite()
260 && rect.y0.is_finite()
261 && rect.x1.is_finite()
262 && rect.y1.is_finite()
263 && (rect.x1 - rect.x0).abs() > 0.0
264 && (rect.y1 - rect.y0).abs() > 0.0
265}
266
267fn build_ai_review_item(change: &SemanticChange) -> AiReviewItem {
268 let tags = review_tags(change);
269 AiReviewItem {
270 change_id: change.id.clone(),
271 kind: change.kind.clone(),
272 severity: change.severity,
273 confidence: change.confidence,
274 confidence_bucket: confidence_bucket(change.confidence),
275 explanation: review_explanation(change, &tags),
276 evidence: evidence_bundle(change),
277 tags,
278 }
279}
280
281fn confidence_bucket(confidence: f32) -> AiConfidenceBucket {
282 if confidence >= 0.9 {
283 AiConfidenceBucket::High
284 } else if confidence >= 0.75 {
285 AiConfidenceBucket::Medium
286 } else {
287 AiConfidenceBucket::Low
288 }
289}
290
291fn review_tags(change: &SemanticChange) -> Vec<AiReviewTag> {
292 let mut tags = BTreeSet::new();
293 match change.kind {
294 ChangeKind::Inserted => {
295 tags.insert(AiReviewTag::ContentInserted);
296 }
297 ChangeKind::Deleted => {
298 tags.insert(AiReviewTag::ContentDeleted);
299 }
300 ChangeKind::Modified => {
301 tags.insert(AiReviewTag::TextChanged);
302 }
303 ChangeKind::Moved => {
304 tags.insert(AiReviewTag::ContentMoved);
305 }
306 ChangeKind::LayoutChanged => {
307 tags.insert(AiReviewTag::LayoutOnly);
308 }
309 ChangeKind::AnnotationChanged => {
310 tags.insert(AiReviewTag::AnnotationOrLinkChanged);
311 }
312 ChangeKind::FormFieldChanged => {
313 tags.insert(AiReviewTag::FormFieldChanged);
314 }
315 ChangeKind::MetadataChanged => {
316 tags.insert(AiReviewTag::MetadataChanged);
317 }
318 ChangeKind::ObjectChanged | ChangeKind::StyleChanged => {
319 tags.insert(AiReviewTag::VisualSurfaceChanged);
320 }
321 ChangeKind::Unknown => {}
322 }
323
324 let text = change_text(change);
325 let lower_text = text.to_lowercase();
326 if has_any(
327 &lower_text,
328 &[
329 "payment",
330 "invoice",
331 "amount",
332 "fee",
333 "price",
334 "revenue",
335 "total",
336 "usd",
337 "$",
338 "maintenance",
339 "schedule",
340 ],
341 ) {
342 tags.insert(AiReviewTag::PaymentTermsCandidate);
343 }
344 if has_any(
345 &lower_text,
346 &[
347 "day", "days", "date", "term", "notice", "year", "annual", "month", "weekly",
348 ],
349 ) {
350 tags.insert(AiReviewTag::DateOrDurationCandidate);
351 }
352 if has_any(
353 &lower_text,
354 &[
355 "corp",
356 "llc",
357 "inc",
358 "client",
359 "vendor",
360 "party",
361 "contractor",
362 ],
363 ) {
364 tags.insert(AiReviewTag::PartyNameCandidate);
365 }
366 if change.text_hunks.iter().any(hunk_has_digit_change) {
367 tags.insert(AiReviewTag::NumericValueChanged);
368 }
369 if is_repeated_page_region_change(change) {
370 tags.insert(AiReviewTag::RepeatedPageRegion);
371 }
372 if change.confidence < 0.75 {
373 tags.insert(AiReviewTag::LowConfidence);
374 }
375 if change.reason.contains("UNSUPPORTED_") {
376 tags.insert(AiReviewTag::UnsupportedSurface);
377 }
378
379 tags.into_iter().collect()
380}
381
382fn hunk_has_digit_change(hunk: &spdfdiff_types::TextHunk) -> bool {
383 hunk.old_text
384 .as_deref()
385 .is_some_and(|text| text.chars().any(|character| character.is_ascii_digit()))
386 || hunk
387 .new_text
388 .as_deref()
389 .is_some_and(|text| text.chars().any(|character| character.is_ascii_digit()))
390}
391
392fn is_repeated_page_region_change(change: &SemanticChange) -> bool {
393 [change.old_node.as_ref(), change.new_node.as_ref()]
394 .into_iter()
395 .flatten()
396 .filter_map(|node| node.semantic_role.as_deref())
397 .any(|role| {
398 matches!(
399 role,
400 "HeaderCandidate" | "FooterCandidate" | "PageTemplateCandidate"
401 )
402 })
403}
404
/// Returns `true` when `value` contains at least one of `needles` as a
/// substring. The comparison is case-sensitive; an empty `needles` slice
/// yields `false`.
fn has_any(value: &str, needles: &[&str]) -> bool {
    for needle in needles {
        if value.contains(*needle) {
            return true;
        }
    }
    false
}
408
409fn change_text(change: &SemanticChange) -> String {
410 [
411 change
412 .old_node
413 .as_ref()
414 .and_then(|node| node.text.as_deref())
415 .unwrap_or_default(),
416 change
417 .new_node
418 .as_ref()
419 .and_then(|node| node.text.as_deref())
420 .unwrap_or_default(),
421 ]
422 .join(" ")
423}
424
425fn review_explanation(change: &SemanticChange, tags: &[AiReviewTag]) -> String {
426 let mut parts = vec![match change.kind {
427 ChangeKind::Inserted => "Content was inserted.".to_owned(),
428 ChangeKind::Deleted => "Content was deleted.".to_owned(),
429 ChangeKind::Modified => "Text changed between matched semantic nodes.".to_owned(),
430 ChangeKind::Moved => {
431 "Content appears to have moved without a primary text change.".to_owned()
432 }
433 ChangeKind::LayoutChanged => {
434 "Layout changed while text evidence stayed comparable.".to_owned()
435 }
436 ChangeKind::StyleChanged => "A style-facing surface changed.".to_owned(),
437 ChangeKind::MetadataChanged => "A metadata-facing surface changed.".to_owned(),
438 ChangeKind::AnnotationChanged => "An annotation or link surface changed.".to_owned(),
439 ChangeKind::FormFieldChanged => "A form-field surface changed.".to_owned(),
440 ChangeKind::ObjectChanged => "A report-facing PDF object surface changed.".to_owned(),
441 ChangeKind::Unknown => "A change was detected but not classified further.".to_owned(),
442 }];
443
444 if tags.contains(&AiReviewTag::PaymentTermsCandidate) {
445 parts.push("Payment or amount terms are mentioned; treat this as a review candidate, not a legal conclusion.".into());
446 }
447 if tags.contains(&AiReviewTag::DateOrDurationCandidate) {
448 parts.push("Date, duration, or notice language is mentioned.".into());
449 }
450 if tags.contains(&AiReviewTag::RepeatedPageRegion) {
451 parts.push("The changed evidence is classified as repeated page-region content such as a header, footer, or page template candidate.".into());
452 }
453 if tags.contains(&AiReviewTag::LowConfidence) {
454 parts.push("Confidence is low; inspect extraction diagnostics and source evidence.".into());
455 }
456 parts.push(change.reason.clone());
457 parts.join(" ")
458}
459
460fn evidence_bundle(change: &SemanticChange) -> AiEvidenceBundle {
461 let mut provenance = Vec::new();
462 if let Some(old_node) = &change.old_node {
463 provenance.extend(old_node.source.clone());
464 }
465 if let Some(new_node) = &change.new_node {
466 provenance.extend(new_node.source.clone());
467 }
468
469 AiEvidenceBundle {
470 old_node_id: change.old_node.as_ref().map(|node| node.node_id.clone()),
471 new_node_id: change.new_node.as_ref().map(|node| node.node_id.clone()),
472 old_semantic_role: change
473 .old_node
474 .as_ref()
475 .and_then(|node| node.semantic_role.clone()),
476 new_semantic_role: change
477 .new_node
478 .as_ref()
479 .and_then(|node| node.semantic_role.clone()),
480 section_hint: section_hint(change),
481 old_page: change.old_node.as_ref().map(|node| node.page),
482 new_page: change.new_node.as_ref().map(|node| node.page),
483 old_bbox: change.old_node.as_ref().and_then(|node| node.bbox),
484 new_bbox: change.new_node.as_ref().and_then(|node| node.bbox),
485 old_text: change.old_node.as_ref().and_then(|node| node.text.clone()),
486 new_text: change.new_node.as_ref().and_then(|node| node.text.clone()),
487 text_hunks: change.text_hunks.clone(),
488 layout_diff: change.layout_diff.clone(),
489 provenance,
490 }
491}
492
493fn section_hint(change: &SemanticChange) -> Option<String> {
494 change
495 .new_node
496 .as_ref()
497 .and_then(|node| node.text.as_deref())
498 .and_then(section_hint_from_text)
499 .or_else(|| {
500 change
501 .old_node
502 .as_ref()
503 .and_then(|node| node.text.as_deref())
504 .and_then(section_hint_from_text)
505 })
506}
507
508fn section_hint_from_text(text: &str) -> Option<String> {
509 let trimmed = text.trim();
510 if trimmed.is_empty() {
511 return None;
512 }
513
514 let lower = trimmed.to_lowercase();
515 if lower.starts_with("section ") || lower.starts_with("clause ") {
516 return Some(first_words(trimmed, 10));
517 }
518
519 let first_token = trimmed.split_whitespace().next().unwrap_or_default();
520 let looks_numbered = first_token
521 .chars()
522 .any(|character| character.is_ascii_digit())
523 && (first_token.ends_with('.') || first_token.ends_with(')') || first_token.contains('.'));
524 if looks_numbered {
525 Some(first_words(trimmed, 10))
526 } else {
527 None
528 }
529}
530
/// Returns the first `limit` whitespace-separated words of `text`, joined by
/// single spaces and capped at 96 bytes.
///
/// When the joined text exceeds the cap it is cut at the last UTF-8
/// character boundary at or before byte 96, and trailing whitespace left by
/// the cut is removed. Cutting on a boundary matters because
/// `String::truncate` panics when asked to split a multi-byte character.
fn first_words(text: &str, limit: usize) -> String {
    const MAX_BYTES: usize = 96;

    let mut value = text
        .split_whitespace()
        .take(limit)
        .collect::<Vec<_>>()
        .join(" ");
    if value.len() > MAX_BYTES {
        // Index 0 is always a boundary, so this loop terminates.
        let mut cut = MAX_BYTES;
        while !value.is_char_boundary(cut) {
            cut -= 1;
        }
        value.truncate(cut);
        let trimmed_len = value.trim_end().len();
        value.truncate(trimmed_len);
    }
    value
}
543
544fn build_question_hints(
545 review_items: &[AiReviewItem],
546 unsupported_surface_count: usize,
547) -> Vec<AiReviewQuestionHint> {
548 vec![
549 question_hint(
550 "Which contractual obligations changed?",
551 review_items,
552 |item| {
553 item.tags.iter().any(|tag| {
554 matches!(
555 tag,
556 AiReviewTag::TextChanged
557 | AiReviewTag::ContentInserted
558 | AiReviewTag::ContentDeleted
559 | AiReviewTag::ContentMoved
560 )
561 }) && change_text_mentions_obligation(&item.evidence)
562 },
563 "Candidate obligation changes are based on obligation-like keywords and semantic change evidence.",
564 ),
565 question_hint(
566 "Were payment terms modified?",
567 review_items,
568 |item| item.tags.contains(&AiReviewTag::PaymentTermsCandidate),
569 "Payment-term candidates are based on payment, invoice, amount, or currency language in changed evidence.",
570 ),
571 question_hint(
572 "Did layout change without text changing?",
573 review_items,
574 |item| item.tags.contains(&AiReviewTag::LayoutOnly),
575 "Layout-only answers use changes classified separately from text modifications.",
576 ),
577 question_hint(
578 "Which changes are low-confidence because extraction was incomplete?",
579 review_items,
580 |item| item.tags.contains(&AiReviewTag::LowConfidence),
581 "Low-confidence answers use the engine confidence bucket and should be cross-checked with diagnostics.",
582 ),
583 question_hint(
584 "Did repeated page regions change?",
585 review_items,
586 |item| item.tags.contains(&AiReviewTag::RepeatedPageRegion),
587 "Repeated page-region answers use semantic header, footer, and page-template candidate evidence.",
588 ),
589 AiReviewQuestionHint {
590 question: "Were unsupported PDF surfaces encountered?".into(),
591 answer: if unsupported_surface_count > 0 {
592 AiReviewAnswer::Yes
593 } else {
594 AiReviewAnswer::No
595 },
596 supporting_change_ids: Vec::new(),
597 rationale: "Unsupported surfaces are counted from stable diagnostic codes that start with UNSUPPORTED_.".into(),
598 },
599 ]
600}
601
602fn question_hint(
603 question: &str,
604 review_items: &[AiReviewItem],
605 predicate: impl Fn(&AiReviewItem) -> bool,
606 rationale: &str,
607) -> AiReviewQuestionHint {
608 let supporting_change_ids = review_items
609 .iter()
610 .filter(|item| predicate(item))
611 .map(|item| item.change_id.clone())
612 .collect::<Vec<_>>();
613 AiReviewQuestionHint {
614 question: question.into(),
615 answer: if supporting_change_ids.is_empty() {
616 AiReviewAnswer::No
617 } else {
618 AiReviewAnswer::Yes
619 },
620 supporting_change_ids,
621 rationale: rationale.into(),
622 }
623}
624
625fn change_text_mentions_obligation(evidence: &AiEvidenceBundle) -> bool {
626 let text = [
627 evidence.old_text.as_deref().unwrap_or_default(),
628 evidence.new_text.as_deref().unwrap_or_default(),
629 ]
630 .join(" ")
631 .to_lowercase();
632 has_any(
633 &text,
634 &[
635 "shall",
636 "must",
637 "required",
638 "obligation",
639 "liable",
640 "liability",
641 "indemnification",
642 "termination",
643 "notice",
644 "payment",
645 ],
646 )
647}
648
649fn diagnostic_summary(document: &DiffDocument) -> Vec<AiDiagnosticCount> {
650 let mut counts = BTreeMap::new();
651 for diagnostic in &document.diagnostics {
652 *counts.entry(diagnostic.code.clone()).or_insert(0) += 1;
653 }
654 counts
655 .into_iter()
656 .map(|(code, count)| AiDiagnosticCount { code, count })
657 .collect()
658}
659
660#[must_use]
661pub fn to_markdown(document: &DiffDocument) -> String {
662 let mut output = format!(
663 "# Semantic PDF Diff\n\n| Metric | Count |\n| --- | ---: |\n| Inserted | {} |\n| Deleted | {} |\n| Modified | {} |\n| Moved | {} |\n| Layout changed | {} |\n\n",
664 document.summary.inserted,
665 document.summary.deleted,
666 document.summary.modified,
667 document.summary.moved,
668 document.summary.layout_changed
669 );
670
671 output.push_str("## Changes\n\n");
672 if document.changes.is_empty() {
673 output.push_str("No semantic changes detected.\n\n");
674 } else {
675 for change in &document.changes {
676 output.push_str(&format!(
677 "- `{}` {:?} {:?}: {}\n",
678 change.id, change.kind, change.severity, change.reason
679 ));
680 push_evidence_line(&mut output, "Old", change.old_node.as_ref());
681 push_evidence_line(&mut output, "New", change.new_node.as_ref());
682 if !change.text_hunks.is_empty() {
683 output.push_str(" - Text hunks:");
684 for hunk in &change.text_hunks {
685 output.push_str(&format!(
686 " `{}` \"{}\" -> \"{}\"",
687 hunk_label(hunk),
688 hunk.old_text.as_deref().unwrap_or_default(),
689 hunk.new_text.as_deref().unwrap_or_default()
690 ));
691 }
692 output.push('\n');
693 }
694 if let Some(layout_diff) = &change.layout_diff {
695 output.push_str(&format!(
696 " - Layout diff: {}\n",
697 layout_diff_summary(layout_diff)
698 ));
699 }
700 }
701 output.push('\n');
702 }
703
704 output.push_str("## Diagnostics\n\n");
705 if document.diagnostics.is_empty() {
706 output.push_str("No diagnostics.\n");
707 } else {
708 for diagnostic in &document.diagnostics {
709 output.push_str(&format!(
710 "- `{:?}` `{}` {}\n",
711 diagnostic.severity, diagnostic.code, diagnostic.message
712 ));
713 }
714 }
715
716 output
717}
718
719fn push_html_evidence(
720 output: &mut String,
721 evidence: Option<&spdfdiff_types::SemanticNodeEvidence>,
722) {
723 let Some(evidence) = evidence else {
724 output.push_str("<em>None</em>");
725 return;
726 };
727 output.push_str(&format!(
728 "<div class=\"meta\">page {} <code>{}</code></div>",
729 evidence.page + 1,
730 escape_html(&evidence.node_id)
731 ));
732 if let Some(role) = &evidence.semantic_role {
733 output.push_str(&format!(
734 "<div class=\"meta\">semantic role <code>{}</code></div>",
735 escape_html(role)
736 ));
737 }
738 if let Some(bbox) = evidence.bbox {
739 output.push_str(&format!(
740 "<div class=\"meta\">bbox [{:.2}, {:.2}, {:.2}, {:.2}] in PDF user space</div>",
741 bbox.x0, bbox.y0, bbox.x1, bbox.y1
742 ));
743 }
744 if let Some(text) = &evidence.text {
745 output.push_str(&format!("<div>{}</div>", escape_html(text)));
746 }
747}
748
/// Escapes `value` for safe embedding in HTML text and double-quoted
/// attribute values.
///
/// `&`, `<`, `>` and `"` are replaced by `&amp;`, `&lt;`, `&gt;` and
/// `&quot;`; every other character is copied unchanged. The input is
/// processed in a single pass, so an ampersand produced by one replacement is
/// never escaped a second time.
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for character in value.chars() {
        match character {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
756
757fn push_evidence_line(
758 output: &mut String,
759 label: &str,
760 evidence: Option<&spdfdiff_types::SemanticNodeEvidence>,
761) {
762 let Some(evidence) = evidence else {
763 return;
764 };
765 output.push_str(&format!(
766 " - {label} page {} `{}`",
767 evidence.page + 1,
768 evidence.node_id
769 ));
770 if let Some(role) = &evidence.semantic_role {
771 output.push_str(&format!(" ({role})"));
772 }
773 if let Some(text) = &evidence.text {
774 output.push_str(&format!(": {text}"));
775 }
776 output.push('\n');
777}
778
779fn layout_diff_summary(layout_diff: &LayoutDiff) -> String {
780 let mut parts = Vec::new();
781 if let Some(delta_x) = layout_diff.delta_x {
782 parts.push(format!("dx={delta_x:.2}"));
783 }
784 if let Some(delta_y) = layout_diff.delta_y {
785 parts.push(format!("dy={delta_y:.2}"));
786 }
787 if let Some(delta_width) = layout_diff.delta_width {
788 parts.push(format!("dw={delta_width:.2}"));
789 }
790 if let Some(delta_height) = layout_diff.delta_height {
791 parts.push(format!("dh={delta_height:.2}"));
792 }
793 if layout_diff.page_changed {
794 parts.push("page_changed=true".to_owned());
795 }
796 if layout_diff.reading_order_changed {
797 parts.push("reading_order_changed=true".to_owned());
798 }
799 if parts.is_empty() {
800 "bbox changed without numeric delta".to_owned()
801 } else {
802 parts.join(", ")
803 }
804}
805
806fn hunk_label(hunk: &spdfdiff_types::TextHunk) -> String {
807 match &hunk.granularity {
808 Some(granularity) => format!("{:?}/{:?}", hunk.kind, granularity),
809 None => format!("{:?}", hunk.kind),
810 }
811}
812
// Unit tests for the Markdown, HTML and AI review renderers. Each test builds
// a document with a single `Modified` change and checks the rendered output.
#[cfg(test)]
mod tests {
    use super::*;
    use spdfdiff_types::{
        ChangeKind, ChangeSeverity, Provenance, Rect, SemanticChange, SemanticNodeEvidence,
        TextHunk, TextHunkKind,
    };

    // The Markdown report must contain the summary row, the change headline,
    // both evidence lines, the text hunk and the layout diff summary.
    #[test]
    fn markdown_includes_summary_and_change_list() {
        let mut document = DiffDocument::empty("old", "new");
        document.summary.modified = 1;
        document.changes.push(SemanticChange {
            id: "change-0000".into(),
            kind: ChangeKind::Modified,
            severity: ChangeSeverity::Major,
            old_node: Some(SemanticNodeEvidence {
                node_id: "old-node".into(),
                semantic_role: None,
                page: 0,
                bbox: Some(Rect {
                    x0: 72.0,
                    y0: 700.0,
                    x1: 240.0,
                    y1: 716.0,
                }),
                text: Some("Annual revenue was 10 million.".into()),
                source: vec![Provenance::unknown()],
            }),
            new_node: Some(SemanticNodeEvidence {
                node_id: "new-node".into(),
                semantic_role: None,
                page: 0,
                bbox: Some(Rect {
                    x0: 72.0,
                    y0: 682.0,
                    x1: 246.0,
                    y1: 698.0,
                }),
                text: Some("Annual revenue was 12 million.".into()),
                source: vec![Provenance::unknown()],
            }),
            text_hunks: vec![TextHunk {
                kind: TextHunkKind::Replaced,
                granularity: None,
                old_range: None,
                new_range: None,
                old_text: Some("10".into()),
                new_text: Some("12".into()),
            }],
            layout_diff: Some(LayoutDiff {
                old_bbox: Some(Rect {
                    x0: 72.0,
                    y0: 700.0,
                    x1: 240.0,
                    y1: 716.0,
                }),
                new_bbox: Some(Rect {
                    x0: 72.0,
                    y0: 682.0,
                    x1: 246.0,
                    y1: 698.0,
                }),
                delta_x: Some(0.0),
                delta_y: Some(-18.0),
                delta_width: Some(6.0),
                delta_height: Some(0.0),
                page_changed: false,
                reading_order_changed: false,
            }),
            confidence: 0.9,
            reason: "paragraph text differs".into(),
        });

        let markdown = to_markdown(&document);

        assert!(markdown.contains("| Modified | 1 |"));
        assert!(markdown.contains("`change-0000` Modified Major"));
        // Page index 0 is rendered as "page 1".
        assert!(markdown.contains("Old page 1 `old-node`: Annual revenue was 10 million."));
        assert!(markdown.contains("New page 1 `new-node`: Annual revenue was 12 million."));
        assert!(markdown.contains("`Replaced` \"10\" -> \"12\""));
        assert!(markdown.contains("Layout diff: dx=0.00, dy=-18.00, dw=6.00, dh=0.00"));
    }

    // The HTML report must contain the side-by-side table, the SVG overlay,
    // the layout diff and both texts, and must not reference any `http` URL
    // through `src` or `href`.
    #[test]
    fn html_is_self_contained_side_by_side_report() {
        let mut document = DiffDocument::empty("old", "new");
        document.summary.modified = 1;
        document.changes.push(SemanticChange {
            id: "change-0000".into(),
            kind: ChangeKind::Modified,
            severity: ChangeSeverity::Major,
            old_node: Some(SemanticNodeEvidence {
                node_id: "old-node".into(),
                semantic_role: None,
                page: 0,
                bbox: Some(Rect {
                    x0: 72.0,
                    y0: 700.0,
                    x1: 240.0,
                    y1: 716.0,
                }),
                text: Some("Annual revenue was 10 million.".into()),
                source: vec![Provenance::unknown()],
            }),
            new_node: Some(SemanticNodeEvidence {
                node_id: "new-node".into(),
                semantic_role: None,
                page: 0,
                bbox: Some(Rect {
                    x0: 72.0,
                    y0: 682.0,
                    x1: 246.0,
                    y1: 698.0,
                }),
                text: Some("Annual revenue was 12 million.".into()),
                source: vec![Provenance::unknown()],
            }),
            text_hunks: Vec::new(),
            layout_diff: Some(LayoutDiff {
                old_bbox: Some(Rect {
                    x0: 72.0,
                    y0: 700.0,
                    x1: 240.0,
                    y1: 716.0,
                }),
                new_bbox: Some(Rect {
                    x0: 72.0,
                    y0: 682.0,
                    x1: 246.0,
                    y1: 698.0,
                }),
                delta_x: Some(0.0),
                delta_y: Some(-18.0),
                delta_width: Some(6.0),
                delta_height: Some(0.0),
                page_changed: false,
                reading_order_changed: false,
            }),
            confidence: 0.9,
            reason: "paragraph text differs".into(),
        });

        let html = to_html(&document);

        assert!(html.contains("<!doctype html>"));
        assert!(html.contains("<th>Old</th><th>New</th>"));
        assert!(html.contains("<h2>Page Evidence Overlays</h2>"));
        assert!(html.contains("<svg xmlns=\"http://www.w3.org/2000/svg\""));
        assert!(html.contains("data-change=\"change-0000\""));
        assert!(html.contains("bbox [72.00, 700.00, 240.00, 716.00] in PDF user space"));
        assert!(html.contains("Layout diff"));
        assert!(html.contains("dx=0.00, dy=-18.00, dw=6.00, dh=0.00"));
        assert!(html.contains("Annual revenue was 10 million."));
        assert!(html.contains("Annual revenue was 12 million."));
        // No externally loaded resources or links.
        assert!(!html.contains("src=\"http"));
        assert!(!html.contains("href=\"http"));
    }

    // A payment-related text change with a replaced number must be bucketed
    // as high confidence (0.91), tagged as payment and numeric, carry its
    // evidence, and answer the payment question with the change id.
    #[test]
    fn ai_review_report_summarizes_questions_tags_and_evidence() {
        let mut document = DiffDocument::empty("old.pdf", "new.pdf");
        document.summary.modified = 1;
        document.changes.push(SemanticChange {
            id: "change-0000".into(),
            kind: ChangeKind::Modified,
            severity: ChangeSeverity::Major,
            old_node: Some(SemanticNodeEvidence {
                node_id: "old-node".into(),
                semantic_role: None,
                page: 0,
                bbox: None,
                text: Some("Payment is due within 30 days.".into()),
                source: vec![Provenance::unknown()],
            }),
            new_node: Some(SemanticNodeEvidence {
                node_id: "new-node".into(),
                semantic_role: None,
                page: 0,
                bbox: None,
                text: Some("Payment is due within 15 days.".into()),
                source: vec![Provenance::unknown()],
            }),
            text_hunks: vec![TextHunk {
                kind: TextHunkKind::Replaced,
                granularity: None,
                old_range: None,
                new_range: None,
                old_text: Some("30".into()),
                new_text: Some("15".into()),
            }],
            layout_diff: None,
            confidence: 0.91,
            reason: "paragraph text differs".into(),
        });

        let report = build_ai_review_report(&document);

        assert_eq!(report.summary.total_changes, 1);
        assert_eq!(
            report.review_items[0].confidence_bucket,
            AiConfidenceBucket::High
        );
        assert!(
            report.review_items[0]
                .tags
                .contains(&AiReviewTag::PaymentTermsCandidate)
        );
        assert!(
            report.review_items[0]
                .tags
                .contains(&AiReviewTag::NumericValueChanged)
        );
        assert_eq!(
            report.review_items[0].evidence.old_node_id.as_deref(),
            Some("old-node")
        );
        assert_eq!(
            report.review_items[0].evidence.new_node_id.as_deref(),
            Some("new-node")
        );
        assert_eq!(
            report.review_items[0].evidence.old_text.as_deref(),
            Some("Payment is due within 30 days.")
        );
        let payment_hint = report
            .question_hints
            .iter()
            .find(|hint| hint.question == "Were payment terms modified?")
            .expect("payment question hint should be present");
        assert_eq!(payment_hint.answer, AiReviewAnswer::Yes);
        assert_eq!(payment_hint.supporting_change_ids, vec!["change-0000"]);
    }

    // A change whose nodes carry the `HeaderCandidate` role must be tagged as
    // a repeated page region, keep both roles in the evidence, mention the
    // region in the explanation, and answer the repeated-region question.
    #[test]
    fn ai_review_report_tags_repeated_page_region_changes() {
        let mut document = DiffDocument::empty("old.pdf", "new.pdf");
        document.summary.modified = 1;
        document.changes.push(SemanticChange {
            id: "change-0000".into(),
            kind: ChangeKind::Modified,
            severity: ChangeSeverity::Minor,
            old_node: Some(SemanticNodeEvidence {
                node_id: "old-header".into(),
                semantic_role: Some("HeaderCandidate".into()),
                page: 0,
                bbox: None,
                text: Some("DocID: 994-A".into()),
                source: vec![Provenance::unknown()],
            }),
            new_node: Some(SemanticNodeEvidence {
                node_id: "new-header".into(),
                semantic_role: Some("HeaderCandidate".into()),
                page: 0,
                bbox: None,
                text: Some("DocID: 994-B".into()),
                source: vec![Provenance::unknown()],
            }),
            text_hunks: Vec::new(),
            layout_diff: None,
            confidence: 0.82,
            reason: "repeated header text differs".into(),
        });

        let report = build_ai_review_report(&document);

        assert!(
            report.review_items[0]
                .tags
                .contains(&AiReviewTag::RepeatedPageRegion)
        );
        assert_eq!(
            report.review_items[0].evidence.old_semantic_role.as_deref(),
            Some("HeaderCandidate")
        );
        assert_eq!(
            report.review_items[0].evidence.new_semantic_role.as_deref(),
            Some("HeaderCandidate")
        );
        assert!(
            report.review_items[0]
                .explanation
                .contains("repeated page-region content")
        );
        let hint = report
            .question_hints
            .iter()
            .find(|hint| hint.question == "Did repeated page regions change?")
            .expect("repeated page-region question hint should be present");
        assert_eq!(hint.answer, AiReviewAnswer::Yes);
        assert_eq!(hint.supporting_change_ids, vec!["change-0000"]);
    }
}