Skip to main content

diff_report/
lib.rs

1use std::collections::{BTreeMap, BTreeSet};
2
3use spdfdiff_types::{
4    AiConfidenceBucket, AiDiagnosticCount, AiEvidenceBundle, AiReviewAnswer, AiReviewItem,
5    AiReviewQuestionHint, AiReviewReport, AiReviewSummary, AiReviewTag, ChangeKind, DiffDocument,
6    LayoutDiff, PdfDiffError, Rect, SemanticChange,
7};
8
9pub fn to_json(document: &DiffDocument) -> Result<String, PdfDiffError> {
10    serde_json::to_string_pretty(document)
11        .map_err(|error| PdfDiffError::InternalInvariant(error.to_string()))
12}
13
14pub fn to_ai_review_json(document: &DiffDocument) -> Result<String, PdfDiffError> {
15    serde_json::to_string_pretty(&build_ai_review_report(document))
16        .map_err(|error| PdfDiffError::InternalInvariant(error.to_string()))
17}
18
19#[must_use]
20pub fn build_ai_review_report(document: &DiffDocument) -> AiReviewReport {
21    let review_items = document
22        .changes
23        .iter()
24        .map(build_ai_review_item)
25        .collect::<Vec<_>>();
26    let unsupported_surface_count = document
27        .diagnostics
28        .iter()
29        .filter(|diagnostic| diagnostic.code.starts_with("UNSUPPORTED_"))
30        .count();
31    let low_confidence_change_count = review_items
32        .iter()
33        .filter(|item| item.confidence_bucket == AiConfidenceBucket::Low)
34        .count();
35
36    AiReviewReport {
37        schema_version: "0.1.0".into(),
38        source_schema_version: document.schema_version.clone(),
39        old_fingerprint: document.old_fingerprint.clone(),
40        new_fingerprint: document.new_fingerprint.clone(),
41        summary: AiReviewSummary {
42            total_changes: document.changes.len(),
43            inserted: document.summary.inserted,
44            deleted: document.summary.deleted,
45            modified: document.summary.modified,
46            moved: document.summary.moved,
47            layout_changed: document.summary.layout_changed,
48            diagnostic_count: document.diagnostics.len(),
49            low_confidence_change_count,
50            unsupported_surface_count,
51        },
52        question_hints: build_question_hints(&review_items, unsupported_surface_count),
53        review_items,
54        diagnostic_summary: diagnostic_summary(document),
55    }
56}
57
58#[must_use]
59pub fn to_html(document: &DiffDocument) -> String {
60    let mut output = String::from(
61        "<!doctype html><html><head><meta charset=\"utf-8\"><style>\
62body{font-family:system-ui,-apple-system,Segoe UI,sans-serif;margin:24px;color:#1f2933;background:#fff}\
63table{border-collapse:collapse;width:100%;margin:12px 0}th,td{border:1px solid #d9e2ec;padding:8px;vertical-align:top;text-align:left}\
64th{background:#f0f4f8}.change{margin:16px 0;border:1px solid #d9e2ec}.change h3{margin:0;padding:10px;background:#f8fafc}\
65.meta{color:#52606d;font-size:0.9rem}.hunks code{display:inline-block;margin:2px 4px 2px 0;padding:2px 4px;background:#f0f4f8}\
66.overlay-grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(280px,1fr));gap:12px;margin:12px 0}.overlay{border:1px solid #d9e2ec;padding:8px;background:#fbfdff}.overlay svg{width:100%;height:auto;max-height:240px;background:#fff}.overlay rect{fill:rgba(37,99,235,.12);stroke:#2563eb;stroke-width:1.5}.overlay text{font-size:10px;fill:#102a43}\
67.diagnostic{margin:4px 0}</style><title>Semantic PDF Diff</title></head><body>",
68    );
69    output.push_str("<h1>Semantic PDF Diff</h1>");
70    output.push_str("<table><thead><tr><th>Metric</th><th>Count</th></tr></thead><tbody>");
71    for (label, count) in [
72        ("Inserted", document.summary.inserted),
73        ("Deleted", document.summary.deleted),
74        ("Modified", document.summary.modified),
75        ("Moved", document.summary.moved),
76        ("Layout changed", document.summary.layout_changed),
77    ] {
78        output.push_str(&format!("<tr><td>{label}</td><td>{count}</td></tr>",));
79    }
80    output.push_str("</tbody></table>");
81    push_html_overlays(&mut output, document);
82
83    output.push_str("<h2>Changes</h2>");
84    if document.changes.is_empty() {
85        output.push_str("<p>No semantic changes detected.</p>");
86    } else {
87        for change in &document.changes {
88            output.push_str(&format!(
89                "<section class=\"change\"><h3>{} {:?} {:?}</h3><p class=\"meta\">confidence {:.3}: {}</p>",
90                escape_html(&change.id),
91                change.kind,
92                change.severity,
93                change.confidence,
94                escape_html(&change.reason)
95            ));
96            output.push_str("<table><thead><tr><th>Old</th><th>New</th></tr></thead><tbody><tr>");
97            output.push_str("<td>");
98            push_html_evidence(&mut output, change.old_node.as_ref());
99            output.push_str("</td><td>");
100            push_html_evidence(&mut output, change.new_node.as_ref());
101            output.push_str("</td></tr></tbody></table>");
102            if !change.text_hunks.is_empty() {
103                output.push_str("<div class=\"hunks\"><strong>Text hunks</strong><br>");
104                for hunk in &change.text_hunks {
105                    output.push_str(&format!(
106                        "<code>{}: {} -> {}</code>",
107                        escape_html(&hunk_label(hunk)),
108                        escape_html(hunk.old_text.as_deref().unwrap_or("")),
109                        escape_html(hunk.new_text.as_deref().unwrap_or(""))
110                    ));
111                }
112                output.push_str("</div>");
113            }
114            if let Some(layout_diff) = &change.layout_diff {
115                output.push_str(&format!(
116                    "<div class=\"meta\"><strong>Layout diff</strong>: {}</div>",
117                    escape_html(&layout_diff_summary(layout_diff))
118                ));
119            }
120            output.push_str("</section>");
121        }
122    }
123
124    output.push_str("<h2>Diagnostics</h2>");
125    if document.diagnostics.is_empty() {
126        output.push_str("<p>No diagnostics.</p>");
127    } else {
128        for diagnostic in &document.diagnostics {
129            output.push_str(&format!(
130                "<div class=\"diagnostic\"><code>{:?}</code> <code>{}</code> {}</div>",
131                diagnostic.severity,
132                escape_html(&diagnostic.code),
133                escape_html(&diagnostic.message)
134            ));
135        }
136    }
137    output.push_str("</body></html>");
138    output
139}
140
141#[derive(Debug, Clone)]
142struct OverlayRect {
143    change_id: String,
144    node_id: String,
145    bbox: Rect,
146}
147
148fn push_html_overlays(output: &mut String, document: &DiffDocument) {
149    let mut overlays: BTreeMap<(&'static str, usize), Vec<OverlayRect>> = BTreeMap::new();
150    for change in &document.changes {
151        push_overlay_rect(&mut overlays, "Old", change, change.old_node.as_ref());
152        push_overlay_rect(&mut overlays, "New", change, change.new_node.as_ref());
153    }
154    if overlays.is_empty() {
155        return;
156    }
157
158    output.push_str("<h2>Page Evidence Overlays</h2>");
159    output.push_str(
160        "<p class=\"meta\">Inline SVG rectangles use PDF user-space coordinates from extracted node bounding boxes.</p>",
161    );
162    output.push_str("<div class=\"overlay-grid\">");
163    for ((role, page), mut rects) in overlays {
164        rects.sort_by(|left, right| {
165            left.change_id
166                .cmp(&right.change_id)
167                .then_with(|| left.node_id.cmp(&right.node_id))
168        });
169        output.push_str(&format!(
170            "<section class=\"overlay\"><h3>{} page {}</h3>",
171            role,
172            page + 1
173        ));
174        push_svg_overlay(output, &rects);
175        output.push_str("</section>");
176    }
177    output.push_str("</div>");
178}
179
180fn push_overlay_rect(
181    overlays: &mut BTreeMap<(&'static str, usize), Vec<OverlayRect>>,
182    role: &'static str,
183    change: &SemanticChange,
184    evidence: Option<&spdfdiff_types::SemanticNodeEvidence>,
185) {
186    let Some(evidence) = evidence else {
187        return;
188    };
189    let Some(bbox) = evidence.bbox else {
190        return;
191    };
192    if !is_reportable_rect(bbox) {
193        return;
194    }
195    overlays
196        .entry((role, evidence.page))
197        .or_default()
198        .push(OverlayRect {
199            change_id: change.id.clone(),
200            node_id: evidence.node_id.clone(),
201            bbox,
202        });
203}
204
205fn push_svg_overlay(output: &mut String, rects: &[OverlayRect]) {
206    let Some((x0, y0, x1, y1)) = overlay_bounds(rects) else {
207        return;
208    };
209    let margin = 8.0;
210    let view_x = x0 - margin;
211    let view_y = y0 - margin;
212    let view_width = (x1 - x0 + margin * 2.0).max(1.0);
213    let view_height = (y1 - y0 + margin * 2.0).max(1.0);
214    output.push_str(&format!(
215        "<svg xmlns=\"http://www.w3.org/2000/svg\" viewBox=\"{view_x:.2} {view_y:.2} {view_width:.2} {view_height:.2}\" role=\"img\" aria-label=\"PDF user-space evidence overlay\">"
216    ));
217    for rect in rects {
218        let (x, y, width, height) = normalized_rect(rect.bbox);
219        output.push_str(&format!(
220            "<rect x=\"{x:.2}\" y=\"{y:.2}\" width=\"{width:.2}\" height=\"{height:.2}\" data-change=\"{}\" data-node=\"{}\"><title>{} {}</title></rect>",
221            escape_html(&rect.change_id),
222            escape_html(&rect.node_id),
223            escape_html(&rect.change_id),
224            escape_html(&rect.node_id)
225        ));
226        output.push_str(&format!(
227            "<text x=\"{:.2}\" y=\"{:.2}\">{}</text>",
228            x,
229            y - 2.0,
230            escape_html(&rect.change_id)
231        ));
232    }
233    output.push_str("</svg>");
234}
235
236fn overlay_bounds(rects: &[OverlayRect]) -> Option<(f32, f32, f32, f32)> {
237    let mut iter = rects.iter().map(|rect| normalized_rect(rect.bbox));
238    let (mut x0, mut y0, width, height) = iter.next()?;
239    let mut x1 = x0 + width;
240    let mut y1 = y0 + height;
241    for (x, y, width, height) in iter {
242        x0 = x0.min(x);
243        y0 = y0.min(y);
244        x1 = x1.max(x + width);
245        y1 = y1.max(y + height);
246    }
247    Some((x0, y0, x1, y1))
248}
249
250fn normalized_rect(rect: Rect) -> (f32, f32, f32, f32) {
251    let x0 = rect.x0.min(rect.x1);
252    let y0 = rect.y0.min(rect.y1);
253    let x1 = rect.x0.max(rect.x1);
254    let y1 = rect.y0.max(rect.y1);
255    (x0, y0, (x1 - x0).max(0.1), (y1 - y0).max(0.1))
256}
257
258fn is_reportable_rect(rect: Rect) -> bool {
259    rect.x0.is_finite()
260        && rect.y0.is_finite()
261        && rect.x1.is_finite()
262        && rect.y1.is_finite()
263        && (rect.x1 - rect.x0).abs() > 0.0
264        && (rect.y1 - rect.y0).abs() > 0.0
265}
266
267fn build_ai_review_item(change: &SemanticChange) -> AiReviewItem {
268    let tags = review_tags(change);
269    AiReviewItem {
270        change_id: change.id.clone(),
271        kind: change.kind.clone(),
272        severity: change.severity,
273        confidence: change.confidence,
274        confidence_bucket: confidence_bucket(change.confidence),
275        explanation: review_explanation(change, &tags),
276        evidence: evidence_bundle(change),
277        tags,
278    }
279}
280
281fn confidence_bucket(confidence: f32) -> AiConfidenceBucket {
282    if confidence >= 0.9 {
283        AiConfidenceBucket::High
284    } else if confidence >= 0.75 {
285        AiConfidenceBucket::Medium
286    } else {
287        AiConfidenceBucket::Low
288    }
289}
290
291fn review_tags(change: &SemanticChange) -> Vec<AiReviewTag> {
292    let mut tags = BTreeSet::new();
293    match change.kind {
294        ChangeKind::Inserted => {
295            tags.insert(AiReviewTag::ContentInserted);
296        }
297        ChangeKind::Deleted => {
298            tags.insert(AiReviewTag::ContentDeleted);
299        }
300        ChangeKind::Modified => {
301            tags.insert(AiReviewTag::TextChanged);
302        }
303        ChangeKind::Moved => {
304            tags.insert(AiReviewTag::ContentMoved);
305        }
306        ChangeKind::LayoutChanged => {
307            tags.insert(AiReviewTag::LayoutOnly);
308        }
309        ChangeKind::AnnotationChanged => {
310            tags.insert(AiReviewTag::AnnotationOrLinkChanged);
311        }
312        ChangeKind::FormFieldChanged => {
313            tags.insert(AiReviewTag::FormFieldChanged);
314        }
315        ChangeKind::MetadataChanged => {
316            tags.insert(AiReviewTag::MetadataChanged);
317        }
318        ChangeKind::ObjectChanged | ChangeKind::StyleChanged => {
319            tags.insert(AiReviewTag::VisualSurfaceChanged);
320        }
321        ChangeKind::Unknown => {}
322    }
323
324    let text = change_text(change);
325    let lower_text = text.to_lowercase();
326    if has_any(
327        &lower_text,
328        &[
329            "payment",
330            "invoice",
331            "amount",
332            "fee",
333            "price",
334            "revenue",
335            "total",
336            "usd",
337            "$",
338            "maintenance",
339            "schedule",
340        ],
341    ) {
342        tags.insert(AiReviewTag::PaymentTermsCandidate);
343    }
344    if has_any(
345        &lower_text,
346        &[
347            "day", "days", "date", "term", "notice", "year", "annual", "month", "weekly",
348        ],
349    ) {
350        tags.insert(AiReviewTag::DateOrDurationCandidate);
351    }
352    if has_any(
353        &lower_text,
354        &[
355            "corp",
356            "llc",
357            "inc",
358            "client",
359            "vendor",
360            "party",
361            "contractor",
362        ],
363    ) {
364        tags.insert(AiReviewTag::PartyNameCandidate);
365    }
366    if change.text_hunks.iter().any(hunk_has_digit_change) {
367        tags.insert(AiReviewTag::NumericValueChanged);
368    }
369    if is_repeated_page_region_change(change) {
370        tags.insert(AiReviewTag::RepeatedPageRegion);
371    }
372    if change.confidence < 0.75 {
373        tags.insert(AiReviewTag::LowConfidence);
374    }
375    if change.reason.contains("UNSUPPORTED_") {
376        tags.insert(AiReviewTag::UnsupportedSurface);
377    }
378
379    tags.into_iter().collect()
380}
381
382fn hunk_has_digit_change(hunk: &spdfdiff_types::TextHunk) -> bool {
383    hunk.old_text
384        .as_deref()
385        .is_some_and(|text| text.chars().any(|character| character.is_ascii_digit()))
386        || hunk
387            .new_text
388            .as_deref()
389            .is_some_and(|text| text.chars().any(|character| character.is_ascii_digit()))
390}
391
392fn is_repeated_page_region_change(change: &SemanticChange) -> bool {
393    [change.old_node.as_ref(), change.new_node.as_ref()]
394        .into_iter()
395        .flatten()
396        .filter_map(|node| node.semantic_role.as_deref())
397        .any(|role| {
398            matches!(
399                role,
400                "HeaderCandidate" | "FooterCandidate" | "PageTemplateCandidate"
401            )
402        })
403}
404
/// Returns `true` when `value` contains at least one of `needles` as a substring.
fn has_any(value: &str, needles: &[&str]) -> bool {
    for needle in needles {
        if value.contains(*needle) {
            return true;
        }
    }
    false
}
408
409fn change_text(change: &SemanticChange) -> String {
410    [
411        change
412            .old_node
413            .as_ref()
414            .and_then(|node| node.text.as_deref())
415            .unwrap_or_default(),
416        change
417            .new_node
418            .as_ref()
419            .and_then(|node| node.text.as_deref())
420            .unwrap_or_default(),
421    ]
422    .join(" ")
423}
424
425fn review_explanation(change: &SemanticChange, tags: &[AiReviewTag]) -> String {
426    let mut parts = vec![match change.kind {
427        ChangeKind::Inserted => "Content was inserted.".to_owned(),
428        ChangeKind::Deleted => "Content was deleted.".to_owned(),
429        ChangeKind::Modified => "Text changed between matched semantic nodes.".to_owned(),
430        ChangeKind::Moved => {
431            "Content appears to have moved without a primary text change.".to_owned()
432        }
433        ChangeKind::LayoutChanged => {
434            "Layout changed while text evidence stayed comparable.".to_owned()
435        }
436        ChangeKind::StyleChanged => "A style-facing surface changed.".to_owned(),
437        ChangeKind::MetadataChanged => "A metadata-facing surface changed.".to_owned(),
438        ChangeKind::AnnotationChanged => "An annotation or link surface changed.".to_owned(),
439        ChangeKind::FormFieldChanged => "A form-field surface changed.".to_owned(),
440        ChangeKind::ObjectChanged => "A report-facing PDF object surface changed.".to_owned(),
441        ChangeKind::Unknown => "A change was detected but not classified further.".to_owned(),
442    }];
443
444    if tags.contains(&AiReviewTag::PaymentTermsCandidate) {
445        parts.push("Payment or amount terms are mentioned; treat this as a review candidate, not a legal conclusion.".into());
446    }
447    if tags.contains(&AiReviewTag::DateOrDurationCandidate) {
448        parts.push("Date, duration, or notice language is mentioned.".into());
449    }
450    if tags.contains(&AiReviewTag::RepeatedPageRegion) {
451        parts.push("The changed evidence is classified as repeated page-region content such as a header, footer, or page template candidate.".into());
452    }
453    if tags.contains(&AiReviewTag::LowConfidence) {
454        parts.push("Confidence is low; inspect extraction diagnostics and source evidence.".into());
455    }
456    parts.push(change.reason.clone());
457    parts.join(" ")
458}
459
460fn evidence_bundle(change: &SemanticChange) -> AiEvidenceBundle {
461    let mut provenance = Vec::new();
462    if let Some(old_node) = &change.old_node {
463        provenance.extend(old_node.source.clone());
464    }
465    if let Some(new_node) = &change.new_node {
466        provenance.extend(new_node.source.clone());
467    }
468
469    AiEvidenceBundle {
470        old_node_id: change.old_node.as_ref().map(|node| node.node_id.clone()),
471        new_node_id: change.new_node.as_ref().map(|node| node.node_id.clone()),
472        old_semantic_role: change
473            .old_node
474            .as_ref()
475            .and_then(|node| node.semantic_role.clone()),
476        new_semantic_role: change
477            .new_node
478            .as_ref()
479            .and_then(|node| node.semantic_role.clone()),
480        section_hint: section_hint(change),
481        old_page: change.old_node.as_ref().map(|node| node.page),
482        new_page: change.new_node.as_ref().map(|node| node.page),
483        old_bbox: change.old_node.as_ref().and_then(|node| node.bbox),
484        new_bbox: change.new_node.as_ref().and_then(|node| node.bbox),
485        old_text: change.old_node.as_ref().and_then(|node| node.text.clone()),
486        new_text: change.new_node.as_ref().and_then(|node| node.text.clone()),
487        text_hunks: change.text_hunks.clone(),
488        layout_diff: change.layout_diff.clone(),
489        provenance,
490    }
491}
492
493fn section_hint(change: &SemanticChange) -> Option<String> {
494    change
495        .new_node
496        .as_ref()
497        .and_then(|node| node.text.as_deref())
498        .and_then(section_hint_from_text)
499        .or_else(|| {
500            change
501                .old_node
502                .as_ref()
503                .and_then(|node| node.text.as_deref())
504                .and_then(section_hint_from_text)
505        })
506}
507
508fn section_hint_from_text(text: &str) -> Option<String> {
509    let trimmed = text.trim();
510    if trimmed.is_empty() {
511        return None;
512    }
513
514    let lower = trimmed.to_lowercase();
515    if lower.starts_with("section ") || lower.starts_with("clause ") {
516        return Some(first_words(trimmed, 10));
517    }
518
519    let first_token = trimmed.split_whitespace().next().unwrap_or_default();
520    let looks_numbered = first_token
521        .chars()
522        .any(|character| character.is_ascii_digit())
523        && (first_token.ends_with('.') || first_token.ends_with(')') || first_token.contains('.'));
524    if looks_numbered {
525        Some(first_words(trimmed, 10))
526    } else {
527        None
528    }
529}
530
/// Returns the first `limit` whitespace-separated words of `text`, joined by single spaces
/// and capped at 96 bytes.
///
/// When the joined text is longer than the cap it is cut at the last UTF-8 character
/// boundary at or below 96 bytes and trailing whitespace left by the cut is removed.
/// Cutting on a boundary matters because `String::truncate` panics when asked to split a
/// multi-byte character, which non-ASCII document text would otherwise trigger.
fn first_words(text: &str, limit: usize) -> String {
    const MAX_BYTES: usize = 96;

    let mut value = text
        .split_whitespace()
        .take(limit)
        .collect::<Vec<_>>()
        .join(" ");
    if value.len() > MAX_BYTES {
        // Back up to a character boundary; index 0 is always one, so this terminates.
        let mut cut = MAX_BYTES;
        while !value.is_char_boundary(cut) {
            cut -= 1;
        }
        value.truncate(cut);
        let kept = value.trim_end().len();
        value.truncate(kept);
    }
    value
}
543
544fn build_question_hints(
545    review_items: &[AiReviewItem],
546    unsupported_surface_count: usize,
547) -> Vec<AiReviewQuestionHint> {
548    vec![
549        question_hint(
550            "Which contractual obligations changed?",
551            review_items,
552            |item| {
553                item.tags.iter().any(|tag| {
554                    matches!(
555                        tag,
556                        AiReviewTag::TextChanged
557                            | AiReviewTag::ContentInserted
558                            | AiReviewTag::ContentDeleted
559                            | AiReviewTag::ContentMoved
560                    )
561                }) && change_text_mentions_obligation(&item.evidence)
562            },
563            "Candidate obligation changes are based on obligation-like keywords and semantic change evidence.",
564        ),
565        question_hint(
566            "Were payment terms modified?",
567            review_items,
568            |item| item.tags.contains(&AiReviewTag::PaymentTermsCandidate),
569            "Payment-term candidates are based on payment, invoice, amount, or currency language in changed evidence.",
570        ),
571        question_hint(
572            "Did layout change without text changing?",
573            review_items,
574            |item| item.tags.contains(&AiReviewTag::LayoutOnly),
575            "Layout-only answers use changes classified separately from text modifications.",
576        ),
577        question_hint(
578            "Which changes are low-confidence because extraction was incomplete?",
579            review_items,
580            |item| item.tags.contains(&AiReviewTag::LowConfidence),
581            "Low-confidence answers use the engine confidence bucket and should be cross-checked with diagnostics.",
582        ),
583        question_hint(
584            "Did repeated page regions change?",
585            review_items,
586            |item| item.tags.contains(&AiReviewTag::RepeatedPageRegion),
587            "Repeated page-region answers use semantic header, footer, and page-template candidate evidence.",
588        ),
589        AiReviewQuestionHint {
590            question: "Were unsupported PDF surfaces encountered?".into(),
591            answer: if unsupported_surface_count > 0 {
592                AiReviewAnswer::Yes
593            } else {
594                AiReviewAnswer::No
595            },
596            supporting_change_ids: Vec::new(),
597            rationale: "Unsupported surfaces are counted from stable diagnostic codes that start with UNSUPPORTED_.".into(),
598        },
599    ]
600}
601
602fn question_hint(
603    question: &str,
604    review_items: &[AiReviewItem],
605    predicate: impl Fn(&AiReviewItem) -> bool,
606    rationale: &str,
607) -> AiReviewQuestionHint {
608    let supporting_change_ids = review_items
609        .iter()
610        .filter(|item| predicate(item))
611        .map(|item| item.change_id.clone())
612        .collect::<Vec<_>>();
613    AiReviewQuestionHint {
614        question: question.into(),
615        answer: if supporting_change_ids.is_empty() {
616            AiReviewAnswer::No
617        } else {
618            AiReviewAnswer::Yes
619        },
620        supporting_change_ids,
621        rationale: rationale.into(),
622    }
623}
624
625fn change_text_mentions_obligation(evidence: &AiEvidenceBundle) -> bool {
626    let text = [
627        evidence.old_text.as_deref().unwrap_or_default(),
628        evidence.new_text.as_deref().unwrap_or_default(),
629    ]
630    .join(" ")
631    .to_lowercase();
632    has_any(
633        &text,
634        &[
635            "shall",
636            "must",
637            "required",
638            "obligation",
639            "liable",
640            "liability",
641            "indemnification",
642            "termination",
643            "notice",
644            "payment",
645        ],
646    )
647}
648
649fn diagnostic_summary(document: &DiffDocument) -> Vec<AiDiagnosticCount> {
650    let mut counts = BTreeMap::new();
651    for diagnostic in &document.diagnostics {
652        *counts.entry(diagnostic.code.clone()).or_insert(0) += 1;
653    }
654    counts
655        .into_iter()
656        .map(|(code, count)| AiDiagnosticCount { code, count })
657        .collect()
658}
659
660#[must_use]
661pub fn to_markdown(document: &DiffDocument) -> String {
662    let mut output = format!(
663        "# Semantic PDF Diff\n\n| Metric | Count |\n| --- | ---: |\n| Inserted | {} |\n| Deleted | {} |\n| Modified | {} |\n| Moved | {} |\n| Layout changed | {} |\n\n",
664        document.summary.inserted,
665        document.summary.deleted,
666        document.summary.modified,
667        document.summary.moved,
668        document.summary.layout_changed
669    );
670
671    output.push_str("## Changes\n\n");
672    if document.changes.is_empty() {
673        output.push_str("No semantic changes detected.\n\n");
674    } else {
675        for change in &document.changes {
676            output.push_str(&format!(
677                "- `{}` {:?} {:?}: {}\n",
678                change.id, change.kind, change.severity, change.reason
679            ));
680            push_evidence_line(&mut output, "Old", change.old_node.as_ref());
681            push_evidence_line(&mut output, "New", change.new_node.as_ref());
682            if !change.text_hunks.is_empty() {
683                output.push_str("  - Text hunks:");
684                for hunk in &change.text_hunks {
685                    output.push_str(&format!(
686                        " `{}` \"{}\" -> \"{}\"",
687                        hunk_label(hunk),
688                        hunk.old_text.as_deref().unwrap_or_default(),
689                        hunk.new_text.as_deref().unwrap_or_default()
690                    ));
691                }
692                output.push('\n');
693            }
694            if let Some(layout_diff) = &change.layout_diff {
695                output.push_str(&format!(
696                    "  - Layout diff: {}\n",
697                    layout_diff_summary(layout_diff)
698                ));
699            }
700        }
701        output.push('\n');
702    }
703
704    output.push_str("## Diagnostics\n\n");
705    if document.diagnostics.is_empty() {
706        output.push_str("No diagnostics.\n");
707    } else {
708        for diagnostic in &document.diagnostics {
709            output.push_str(&format!(
710                "- `{:?}` `{}` {}\n",
711                diagnostic.severity, diagnostic.code, diagnostic.message
712            ));
713        }
714    }
715
716    output
717}
718
719fn push_html_evidence(
720    output: &mut String,
721    evidence: Option<&spdfdiff_types::SemanticNodeEvidence>,
722) {
723    let Some(evidence) = evidence else {
724        output.push_str("<em>None</em>");
725        return;
726    };
727    output.push_str(&format!(
728        "<div class=\"meta\">page {} <code>{}</code></div>",
729        evidence.page + 1,
730        escape_html(&evidence.node_id)
731    ));
732    if let Some(role) = &evidence.semantic_role {
733        output.push_str(&format!(
734            "<div class=\"meta\">semantic role <code>{}</code></div>",
735            escape_html(role)
736        ));
737    }
738    if let Some(bbox) = evidence.bbox {
739        output.push_str(&format!(
740            "<div class=\"meta\">bbox [{:.2}, {:.2}, {:.2}, {:.2}] in PDF user space</div>",
741            bbox.x0, bbox.y0, bbox.x1, bbox.y1
742        ));
743    }
744    if let Some(text) = &evidence.text {
745        output.push_str(&format!("<div>{}</div>", escape_html(text)));
746    }
747}
748
/// Escapes `&`, `<`, `>`, and `"` as HTML entities; every other character is copied
/// unchanged.
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for character in value.chars() {
        match character {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
756
757fn push_evidence_line(
758    output: &mut String,
759    label: &str,
760    evidence: Option<&spdfdiff_types::SemanticNodeEvidence>,
761) {
762    let Some(evidence) = evidence else {
763        return;
764    };
765    output.push_str(&format!(
766        "  - {label} page {} `{}`",
767        evidence.page + 1,
768        evidence.node_id
769    ));
770    if let Some(role) = &evidence.semantic_role {
771        output.push_str(&format!(" ({role})"));
772    }
773    if let Some(text) = &evidence.text {
774        output.push_str(&format!(": {text}"));
775    }
776    output.push('\n');
777}
778
779fn layout_diff_summary(layout_diff: &LayoutDiff) -> String {
780    let mut parts = Vec::new();
781    if let Some(delta_x) = layout_diff.delta_x {
782        parts.push(format!("dx={delta_x:.2}"));
783    }
784    if let Some(delta_y) = layout_diff.delta_y {
785        parts.push(format!("dy={delta_y:.2}"));
786    }
787    if let Some(delta_width) = layout_diff.delta_width {
788        parts.push(format!("dw={delta_width:.2}"));
789    }
790    if let Some(delta_height) = layout_diff.delta_height {
791        parts.push(format!("dh={delta_height:.2}"));
792    }
793    if layout_diff.page_changed {
794        parts.push("page_changed=true".to_owned());
795    }
796    if layout_diff.reading_order_changed {
797        parts.push("reading_order_changed=true".to_owned());
798    }
799    if parts.is_empty() {
800        "bbox changed without numeric delta".to_owned()
801    } else {
802        parts.join(", ")
803    }
804}
805
806fn hunk_label(hunk: &spdfdiff_types::TextHunk) -> String {
807    match &hunk.granularity {
808        Some(granularity) => format!("{:?}/{:?}", hunk.kind, granularity),
809        None => format!("{:?}", hunk.kind),
810    }
811}
812
#[cfg(test)]
mod tests {
    use super::*;
    use spdfdiff_types::{
        ChangeKind, ChangeSeverity, Provenance, Rect, SemanticChange, SemanticNodeEvidence,
        TextHunk, TextHunkKind,
    };

    /// Builds node evidence located on page 0 with a single unknown provenance entry.
    fn node_evidence(
        node_id: &'static str,
        semantic_role: Option<&'static str>,
        bbox: Option<Rect>,
        text: &'static str,
    ) -> SemanticNodeEvidence {
        SemanticNodeEvidence {
            node_id: node_id.into(),
            semantic_role: semantic_role.map(Into::into),
            page: 0,
            bbox,
            text: Some(text.into()),
            source: vec![Provenance::unknown()],
        }
    }

    /// Builds a `Replaced` hunk without granularity or ranges.
    fn replaced_hunk(old_text: &'static str, new_text: &'static str) -> TextHunk {
        TextHunk {
            kind: TextHunkKind::Replaced,
            granularity: None,
            old_range: None,
            new_range: None,
            old_text: Some(old_text.into()),
            new_text: Some(new_text.into()),
        }
    }

    /// Bounding box of the revenue paragraph in the old document.
    fn revenue_bbox_before() -> Rect {
        Rect {
            x0: 72.0,
            y0: 700.0,
            x1: 240.0,
            y1: 716.0,
        }
    }

    /// Bounding box of the revenue paragraph in the new document.
    fn revenue_bbox_after() -> Rect {
        Rect {
            x0: 72.0,
            y0: 682.0,
            x1: 246.0,
            y1: 698.0,
        }
    }

    /// Builds the modified "annual revenue" change shared by the Markdown and
    /// HTML report tests; only the text hunks differ between the two.
    fn revenue_change(text_hunks: Vec<TextHunk>) -> SemanticChange {
        SemanticChange {
            id: "change-0000".into(),
            kind: ChangeKind::Modified,
            severity: ChangeSeverity::Major,
            old_node: Some(node_evidence(
                "old-node",
                None,
                Some(revenue_bbox_before()),
                "Annual revenue was 10 million.",
            )),
            new_node: Some(node_evidence(
                "new-node",
                None,
                Some(revenue_bbox_after()),
                "Annual revenue was 12 million.",
            )),
            text_hunks,
            layout_diff: Some(LayoutDiff {
                old_bbox: Some(revenue_bbox_before()),
                new_bbox: Some(revenue_bbox_after()),
                delta_x: Some(0.0),
                delta_y: Some(-18.0),
                delta_width: Some(6.0),
                delta_height: Some(0.0),
                page_changed: false,
                reading_order_changed: false,
            }),
            confidence: 0.9,
            reason: "paragraph text differs".into(),
        }
    }

    #[test]
    fn markdown_includes_summary_and_change_list() {
        let mut document = DiffDocument::empty("old", "new");
        document.summary.modified = 1;
        document
            .changes
            .push(revenue_change(vec![replaced_hunk("10", "12")]));

        let markdown = to_markdown(&document);

        assert!(markdown.contains("| Modified | 1 |"));
        assert!(markdown.contains("`change-0000` Modified Major"));
        assert!(markdown.contains("Old page 1 `old-node`: Annual revenue was 10 million."));
        assert!(markdown.contains("New page 1 `new-node`: Annual revenue was 12 million."));
        assert!(markdown.contains("`Replaced` \"10\" -> \"12\""));
        assert!(markdown.contains("Layout diff: dx=0.00, dy=-18.00, dw=6.00, dh=0.00"));
    }

    #[test]
    fn html_is_self_contained_side_by_side_report() {
        let mut document = DiffDocument::empty("old", "new");
        document.summary.modified = 1;
        document.changes.push(revenue_change(Vec::new()));

        let html = to_html(&document);

        // Structure of the rendered report.
        assert!(html.contains("<!doctype html>"));
        assert!(html.contains("<th>Old</th><th>New</th>"));
        assert!(html.contains("<h2>Page Evidence Overlays</h2>"));
        assert!(html.contains("<svg xmlns=\"http://www.w3.org/2000/svg\""));
        assert!(html.contains("data-change=\"change-0000\""));
        // Evidence carried over from the change.
        assert!(html.contains("bbox [72.00, 700.00, 240.00, 716.00] in PDF user space"));
        assert!(html.contains("Layout diff"));
        assert!(html.contains("dx=0.00, dy=-18.00, dw=6.00, dh=0.00"));
        assert!(html.contains("Annual revenue was 10 million."));
        assert!(html.contains("Annual revenue was 12 million."));
        // The report must not pull in remote resources.
        assert!(!html.contains("src=\"http"));
        assert!(!html.contains("href=\"http"));
    }

    #[test]
    fn ai_review_report_summarizes_questions_tags_and_evidence() {
        let mut document = DiffDocument::empty("old.pdf", "new.pdf");
        document.summary.modified = 1;
        document.changes.push(SemanticChange {
            id: "change-0000".into(),
            kind: ChangeKind::Modified,
            severity: ChangeSeverity::Major,
            old_node: Some(node_evidence(
                "old-node",
                None,
                None,
                "Payment is due within 30 days.",
            )),
            new_node: Some(node_evidence(
                "new-node",
                None,
                None,
                "Payment is due within 15 days.",
            )),
            text_hunks: vec![replaced_hunk("30", "15")],
            layout_diff: None,
            confidence: 0.91,
            reason: "paragraph text differs".into(),
        });

        let report = build_ai_review_report(&document);
        let item = &report.review_items[0];

        assert_eq!(report.summary.total_changes, 1);
        assert_eq!(item.confidence_bucket, AiConfidenceBucket::High);
        assert!(item.tags.contains(&AiReviewTag::PaymentTermsCandidate));
        assert!(item.tags.contains(&AiReviewTag::NumericValueChanged));
        assert_eq!(item.evidence.old_node_id.as_deref(), Some("old-node"));
        assert_eq!(item.evidence.new_node_id.as_deref(), Some("new-node"));
        assert_eq!(
            item.evidence.old_text.as_deref(),
            Some("Payment is due within 30 days.")
        );
        let payment_hint = report
            .question_hints
            .iter()
            .find(|hint| hint.question == "Were payment terms modified?")
            .expect("payment question hint should be present");
        assert_eq!(payment_hint.answer, AiReviewAnswer::Yes);
        assert_eq!(payment_hint.supporting_change_ids, vec!["change-0000"]);
    }

    #[test]
    fn ai_review_report_tags_repeated_page_region_changes() {
        let mut document = DiffDocument::empty("old.pdf", "new.pdf");
        document.summary.modified = 1;
        document.changes.push(SemanticChange {
            id: "change-0000".into(),
            kind: ChangeKind::Modified,
            severity: ChangeSeverity::Minor,
            old_node: Some(node_evidence(
                "old-header",
                Some("HeaderCandidate"),
                None,
                "DocID: 994-A",
            )),
            new_node: Some(node_evidence(
                "new-header",
                Some("HeaderCandidate"),
                None,
                "DocID: 994-B",
            )),
            text_hunks: Vec::new(),
            layout_diff: None,
            confidence: 0.82,
            reason: "repeated header text differs".into(),
        });

        let report = build_ai_review_report(&document);
        let item = &report.review_items[0];

        assert!(item.tags.contains(&AiReviewTag::RepeatedPageRegion));
        assert_eq!(
            item.evidence.old_semantic_role.as_deref(),
            Some("HeaderCandidate")
        );
        assert_eq!(
            item.evidence.new_semantic_role.as_deref(),
            Some("HeaderCandidate")
        );
        assert!(item.explanation.contains("repeated page-region content"));
        let hint = report
            .question_hints
            .iter()
            .find(|hint| hint.question == "Did repeated page regions change?")
            .expect("repeated page-region question hint should be present");
        assert_eq!(hint.answer, AiReviewAnswer::Yes);
        assert_eq!(hint.supporting_change_ids, vec!["change-0000"]);
    }
}