use std::collections::{BTreeMap, BTreeSet};
use pdf_semantic::{SemanticDocument, SemanticNode};
use spdfdiff_types::{
ChangeKind, ChangeSeverity, Diagnostic, DiffDocument, SemanticChange, SemanticNodeEvidence,
TextHunk, TextHunkKind,
};
const DEFAULT_MAX_MATCH_MATRIX_CELLS: usize = 1_000_000;
const DEFAULT_MAX_GREEDY_MATCH_CANDIDATES: usize = 256;
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct DiffConfig {
pub ignore_whitespace: bool,
pub ignore_case: bool,
pub detect_moves: bool,
pub layout_tolerance_pt: f32,
pub min_match_score: f32,
pub max_match_matrix_cells: usize,
pub max_greedy_match_candidates: usize,
}
impl Default for DiffConfig {
fn default() -> Self {
Self {
ignore_whitespace: true,
ignore_case: false,
detect_moves: true,
layout_tolerance_pt: 2.0,
min_match_score: 0.8,
max_match_matrix_cells: DEFAULT_MAX_MATCH_MATRIX_CELLS,
max_greedy_match_candidates: DEFAULT_MAX_GREEDY_MATCH_CANDIDATES,
}
}
}
pub trait SeverityClassifier {
fn classify(&self, change: &SemanticChange) -> ChangeSeverity;
}
#[derive(Debug, Clone, Copy, Default)]
pub struct DefaultSeverityClassifier;
impl SeverityClassifier for DefaultSeverityClassifier {
fn classify(&self, change: &SemanticChange) -> ChangeSeverity {
match change.kind {
ChangeKind::Inserted | ChangeKind::Deleted | ChangeKind::Modified => {
ChangeSeverity::Major
}
ChangeKind::Moved | ChangeKind::LayoutChanged | ChangeKind::StyleChanged => {
ChangeSeverity::Minor
}
ChangeKind::MetadataChanged | ChangeKind::ObjectChanged => ChangeSeverity::Info,
ChangeKind::AnnotationChanged | ChangeKind::FormFieldChanged | ChangeKind::Unknown => {
ChangeSeverity::Major
}
}
}
}
#[must_use]
pub fn diff_semantic_documents(
old: &SemanticDocument,
new: &SemanticDocument,
config: DiffConfig,
) -> DiffDocument {
let classifier = DefaultSeverityClassifier;
diff_semantic_documents_with_classifier(old, new, config, &classifier)
}
#[must_use]
pub fn diff_semantic_documents_with_classifier(
old: &SemanticDocument,
new: &SemanticDocument,
config: DiffConfig,
classifier: &impl SeverityClassifier,
) -> DiffDocument {
let mut document = DiffDocument::empty(&old.fingerprint, &new.fingerprint);
document.diagnostics.extend(old.diagnostics.clone());
document.diagnostics.extend(new.diagnostics.clone());
let old_texts = old
.nodes
.iter()
.map(|node| comparable_text(node, config))
.collect::<Vec<_>>();
let new_texts = new
.nodes
.iter()
.map(|node| comparable_text(node, config))
.collect::<Vec<_>>();
let exact_matches = exact_text_matches(&old_texts, &new_texts, config);
if let Some(diagnostic) = exact_matches.diagnostic {
document.diagnostics.push(diagnostic);
}
let mut matches = exact_matches.matches;
matches.push((old.nodes.len(), new.nodes.len()));
let mut old_start = 0;
let mut new_start = 0;
for (old_end, new_end) in matches {
emit_unmatched_range(
old,
new,
old_start..old_end,
new_start..new_end,
config,
&mut document,
classifier,
);
if old_end < old.nodes.len() && new_end < new.nodes.len() {
emit_layout_change_if_needed(
&old.nodes[old_end],
&new.nodes[new_end],
config,
&mut document,
classifier,
);
}
old_start = old_end + 1;
new_start = new_end + 1;
}
if config.detect_moves {
relabel_insert_delete_pairs_as_moves(&mut document, config, classifier);
}
document
}
#[derive(Debug, Clone, PartialEq)]
struct ExactMatchResult {
matches: Vec<(usize, usize)>,
diagnostic: Option<Diagnostic>,
}
fn exact_text_matches(
old_texts: &[String],
new_texts: &[String],
config: DiffConfig,
) -> ExactMatchResult {
let old_len = old_texts.len();
let new_len = new_texts.len();
if matrix_cell_count_exceeds_limit(old_len, new_len, config.max_match_matrix_cells) {
return ExactMatchResult {
matches: greedy_exact_text_matches(old_texts, new_texts),
diagnostic: Some(match_limit_diagnostic(
"EXACT_MATCH_LIMIT_EXCEEDED",
old_len,
new_len,
config.max_match_matrix_cells,
"exact text anchor matrix",
"greedy exact-anchor fallback",
)),
};
}
let mut lengths = vec![vec![0usize; new_texts.len() + 1]; old_texts.len() + 1];
for old_index in (0..old_texts.len()).rev() {
for new_index in (0..new_texts.len()).rev() {
lengths[old_index][new_index] = if old_texts[old_index] == new_texts[new_index] {
lengths[old_index + 1][new_index + 1] + 1
} else {
lengths[old_index + 1][new_index].max(lengths[old_index][new_index + 1])
};
}
}
let mut matches = Vec::new();
let mut old_index = 0;
let mut new_index = 0;
while old_index < old_texts.len() && new_index < new_texts.len() {
if old_texts[old_index] == new_texts[new_index] {
matches.push((old_index, new_index));
old_index += 1;
new_index += 1;
} else if lengths[old_index + 1][new_index] >= lengths[old_index][new_index + 1] {
old_index += 1;
} else {
new_index += 1;
}
}
ExactMatchResult {
matches,
diagnostic: None,
}
}
fn greedy_exact_text_matches(old_texts: &[String], new_texts: &[String]) -> Vec<(usize, usize)> {
let mut new_indices_by_text = BTreeMap::<&str, Vec<usize>>::new();
for (new_index, text) in new_texts.iter().enumerate() {
if !text.is_empty() {
new_indices_by_text
.entry(text.as_str())
.or_default()
.push(new_index);
}
}
let mut matches = Vec::new();
let mut last_new_index = None;
for (old_index, text) in old_texts.iter().enumerate() {
let Some(new_indices) = new_indices_by_text.get(text.as_str()) else {
continue;
};
let lower_bound = last_new_index.map_or(0, |index| index + 1);
let position = new_indices.partition_point(|index| *index < lower_bound);
let Some(new_index) = new_indices.get(position).copied() else {
continue;
};
matches.push((old_index, new_index));
last_new_index = Some(new_index);
}
matches
}
fn emit_unmatched_range(
old: &SemanticDocument,
new: &SemanticDocument,
old_range: std::ops::Range<usize>,
new_range: std::ops::Range<usize>,
config: DiffConfig,
document: &mut DiffDocument,
classifier: &impl SeverityClassifier,
) {
let matches = fuzzy_node_matches(old, new, old_range.clone(), new_range.clone(), config);
if let Some(diagnostic) = matches.diagnostic {
document.diagnostics.push(diagnostic);
}
let mut old_cursor = old_range.start;
let mut new_cursor = new_range.start;
for fuzzy_match in matches.matches {
emit_unpaired_changes(
old,
new,
old_cursor..fuzzy_match.old_index,
new_cursor..fuzzy_match.new_index,
document,
classifier,
);
let old_node = &old.nodes[fuzzy_match.old_index];
let new_node = &new.nodes[fuzzy_match.new_index];
let text_hunks = text_hunks_for_nodes(old_node, new_node, config);
let reason = modified_reason_with_hunks(old_node, new_node, config, &text_hunks);
push_change_with_confidence(
document,
ChangeKind::Modified,
Some(old_node),
Some(new_node),
ChangeDetails {
reason: format!("{reason}; fuzzy_match_score={:.3}", fuzzy_match.score),
confidence: fuzzy_match.score,
text_hunks,
},
classifier,
);
old_cursor = fuzzy_match.old_index + 1;
new_cursor = fuzzy_match.new_index + 1;
}
emit_unpaired_changes(
old,
new,
old_cursor..old_range.end,
new_cursor..new_range.end,
document,
classifier,
);
}
#[derive(Debug, Clone, Copy, PartialEq)]
struct FuzzyMatch {
old_index: usize,
new_index: usize,
score: f32,
}
#[derive(Debug, Clone, PartialEq)]
struct FuzzyMatchResult {
matches: Vec<FuzzyMatch>,
diagnostic: Option<Diagnostic>,
}
fn fuzzy_node_matches(
old: &SemanticDocument,
new: &SemanticDocument,
old_range: std::ops::Range<usize>,
new_range: std::ops::Range<usize>,
config: DiffConfig,
) -> FuzzyMatchResult {
let old_len = old_range.len();
let new_len = new_range.len();
if matrix_cell_count_exceeds_limit(old_len, new_len, config.max_match_matrix_cells) {
return FuzzyMatchResult {
matches: greedy_fuzzy_node_matches(old, new, old_range, new_range, config),
diagnostic: Some(match_limit_diagnostic(
"FUZZY_MATCH_LIMIT_EXCEEDED",
old_len,
new_len,
config.max_match_matrix_cells,
"fuzzy node match matrix",
"bounded greedy fuzzy-match fallback",
)),
};
}
let mut scores = vec![vec![0.0f32; new_len]; old_len];
for (old_offset, row) in scores.iter_mut().enumerate() {
for (new_offset, score) in row.iter_mut().enumerate() {
*score = fuzzy_match_score(
&old.nodes[old_range.start + old_offset],
&new.nodes[new_range.start + new_offset],
config,
);
}
}
let mut best = vec![vec![0.0f32; new_len + 1]; old_len + 1];
for old_offset in (0..old_len).rev() {
for new_offset in (0..new_len).rev() {
let match_score = if scores[old_offset][new_offset] >= config.min_match_score {
scores[old_offset][new_offset] + best[old_offset + 1][new_offset + 1]
} else {
-1.0
};
best[old_offset][new_offset] = match_score
.max(best[old_offset + 1][new_offset])
.max(best[old_offset][new_offset + 1]);
}
}
let mut matches = Vec::new();
let mut old_offset = 0;
let mut new_offset = 0;
while old_offset < old_len && new_offset < new_len {
let score = scores[old_offset][new_offset];
let match_score = score + best[old_offset + 1][new_offset + 1];
if score >= config.min_match_score
&& approximately_equal(best[old_offset][new_offset], match_score)
{
matches.push(FuzzyMatch {
old_index: old_range.start + old_offset,
new_index: new_range.start + new_offset,
score,
});
old_offset += 1;
new_offset += 1;
} else if best[old_offset + 1][new_offset] >= best[old_offset][new_offset + 1] {
old_offset += 1;
} else {
new_offset += 1;
}
}
FuzzyMatchResult {
matches,
diagnostic: None,
}
}
fn greedy_fuzzy_node_matches(
old: &SemanticDocument,
new: &SemanticDocument,
old_range: std::ops::Range<usize>,
new_range: std::ops::Range<usize>,
config: DiffConfig,
) -> Vec<FuzzyMatch> {
let mut matches = Vec::new();
let mut new_cursor = new_range.start;
let candidate_window = config.max_greedy_match_candidates.max(1);
for old_index in old_range {
if new_cursor >= new_range.end {
break;
}
let scan_end = new_cursor
.saturating_add(candidate_window)
.min(new_range.end);
let mut best_match = None;
for new_index in new_cursor..scan_end {
let score = fuzzy_match_score(&old.nodes[old_index], &new.nodes[new_index], config);
if score < config.min_match_score {
continue;
}
let should_replace = best_match
.as_ref()
.is_none_or(|current: &FuzzyMatch| score > current.score);
if should_replace {
best_match = Some(FuzzyMatch {
old_index,
new_index,
score,
});
if approximately_equal(score, 1.0) {
break;
}
}
}
if let Some(fuzzy_match) = best_match {
new_cursor = fuzzy_match.new_index + 1;
matches.push(fuzzy_match);
}
}
matches
}
fn matrix_cell_count_exceeds_limit(old_len: usize, new_len: usize, limit: usize) -> bool {
old_len
.checked_mul(new_len)
.is_none_or(|cell_count| cell_count > limit)
}
fn match_limit_diagnostic(
code: &'static str,
old_len: usize,
new_len: usize,
limit: usize,
matrix_name: &'static str,
fallback_name: &'static str,
) -> Diagnostic {
let cell_count = old_len
.checked_mul(new_len)
.map_or("overflow".to_owned(), |count| count.to_string());
Diagnostic::warning(
code,
format!(
"{matrix_name} requires {cell_count} cells for {old_len} old nodes and {new_len} new nodes, exceeding limit {limit}; using {fallback_name}"
),
)
}
fn approximately_equal(left: f32, right: f32) -> bool {
(left - right).abs() <= f32::EPSILON * 16.0
}
fn fuzzy_match_score(old_node: &SemanticNode, new_node: &SemanticNode, config: DiffConfig) -> f32 {
if old_node.page_index != new_node.page_index {
return 0.0;
}
let old_text = comparable_text(old_node, config);
let new_text = comparable_text(new_node, config);
let old_tokens = normalized_tokens(
old_node.normalized_text.as_deref().unwrap_or_default(),
config,
);
let new_tokens = normalized_tokens(
new_node.normalized_text.as_deref().unwrap_or_default(),
config,
);
let score = token_similarity(&old_tokens, &new_tokens);
if is_text_extension(&old_text, &new_text) {
score.max(0.85)
} else {
score
}
}
fn is_text_extension(left: &str, right: &str) -> bool {
if left.is_empty() || right.is_empty() || left == right {
return false;
}
left.starts_with(right) || right.starts_with(left)
}
fn token_similarity(old_tokens: &[String], new_tokens: &[String]) -> f32 {
if old_tokens.is_empty() && new_tokens.is_empty() {
return 1.0;
}
if old_tokens.is_empty() || new_tokens.is_empty() {
return 0.0;
}
let common = token_lcs_len(old_tokens, new_tokens);
let score = (2.0 * common as f32) / (old_tokens.len() + new_tokens.len()) as f32;
if old_tokens.len() == new_tokens.len() && common + 1 == old_tokens.len() {
score.max(0.8)
} else {
score
}
}
fn token_lcs_len(left: &[String], right: &[String]) -> usize {
let mut lengths = vec![vec![0usize; right.len() + 1]; left.len() + 1];
for left_index in (0..left.len()).rev() {
for right_index in (0..right.len()).rev() {
lengths[left_index][right_index] = if left[left_index] == right[right_index] {
lengths[left_index + 1][right_index + 1] + 1
} else {
lengths[left_index + 1][right_index].max(lengths[left_index][right_index + 1])
};
}
}
lengths[0][0]
}
fn emit_unpaired_changes(
old: &SemanticDocument,
new: &SemanticDocument,
old_range: std::ops::Range<usize>,
new_range: std::ops::Range<usize>,
document: &mut DiffDocument,
classifier: &impl SeverityClassifier,
) {
for old_index in old_range {
push_change(
document,
ChangeKind::Deleted,
Some(&old.nodes[old_index]),
None,
"paragraph exists only in old document",
classifier,
);
}
for new_index in new_range {
push_change(
document,
ChangeKind::Inserted,
None,
Some(&new.nodes[new_index]),
"paragraph exists only in new document",
classifier,
);
}
}
fn push_change(
document: &mut DiffDocument,
kind: ChangeKind,
old_node: Option<&SemanticNode>,
new_node: Option<&SemanticNode>,
reason: &str,
classifier: &impl SeverityClassifier,
) {
push_change_with_confidence(
document,
kind,
old_node,
new_node,
ChangeDetails {
reason: reason.to_owned(),
confidence: 0.9,
text_hunks: Vec::new(),
},
classifier,
);
}
struct ChangeDetails {
reason: String,
confidence: f32,
text_hunks: Vec<TextHunk>,
}
fn push_change_with_confidence(
document: &mut DiffDocument,
kind: ChangeKind,
old_node: Option<&SemanticNode>,
new_node: Option<&SemanticNode>,
details: ChangeDetails,
classifier: &impl SeverityClassifier,
) {
let kind_for_summary = kind.clone();
let mut change = SemanticChange {
id: format!("change-{:04}", document.changes.len()),
kind,
severity: ChangeSeverity::Info,
old_node: old_node.map(to_evidence),
new_node: new_node.map(to_evidence),
text_hunks: details.text_hunks,
confidence: details.confidence,
reason: details.reason,
};
change.severity = classifier.classify(&change);
match kind_for_summary {
ChangeKind::Inserted => document.summary.inserted += 1,
ChangeKind::Deleted => document.summary.deleted += 1,
ChangeKind::Modified => document.summary.modified += 1,
ChangeKind::Moved => document.summary.moved += 1,
ChangeKind::LayoutChanged => document.summary.layout_changed += 1,
ChangeKind::StyleChanged
| ChangeKind::MetadataChanged
| ChangeKind::AnnotationChanged
| ChangeKind::FormFieldChanged
| ChangeKind::ObjectChanged
| ChangeKind::Unknown => {}
}
document.changes.push(change);
}
fn emit_layout_change_if_needed(
old_node: &SemanticNode,
new_node: &SemanticNode,
config: DiffConfig,
document: &mut DiffDocument,
classifier: &impl SeverityClassifier,
) {
if !layout_changed(old_node, new_node, config.layout_tolerance_pt) {
return;
}
push_change(
document,
ChangeKind::LayoutChanged,
Some(old_node),
Some(new_node),
"paragraph text is unchanged but page or bounding box moved beyond tolerance",
classifier,
);
}
fn layout_changed(old_node: &SemanticNode, new_node: &SemanticNode, tolerance: f32) -> bool {
if old_node.page_index != new_node.page_index {
return true;
}
match (old_node.bbox, new_node.bbox) {
(Some(old_bbox), Some(new_bbox)) => {
(old_bbox.x0 - new_bbox.x0).abs() > tolerance
|| (old_bbox.y0 - new_bbox.y0).abs() > tolerance
|| (old_bbox.x1 - new_bbox.x1).abs() > tolerance
|| (old_bbox.y1 - new_bbox.y1).abs() > tolerance
}
(Some(_), None) | (None, Some(_)) => true,
(None, None) => false,
}
}
fn comparable_text(node: &SemanticNode, config: DiffConfig) -> String {
let text = node.normalized_text.clone().unwrap_or_default();
let text = if config.ignore_whitespace {
text.split_whitespace().collect::<Vec<_>>().join(" ")
} else {
text
};
if config.ignore_case {
text.to_lowercase()
} else {
text
}
}
fn modified_reason_with_hunks(
old_node: &SemanticNode,
new_node: &SemanticNode,
config: DiffConfig,
text_hunks: &[TextHunk],
) -> String {
let old_text = old_node.normalized_text.as_deref().unwrap_or_default();
let new_text = new_node.normalized_text.as_deref().unwrap_or_default();
let old_tokens = normalized_tokens(old_text, config);
let new_tokens = normalized_tokens(new_text, config);
if old_tokens == new_tokens {
return "paragraph text differs between exact-match anchors".into();
}
let first_change = text_hunks
.iter()
.find(|hunk| hunk.kind != TextHunkKind::Equal);
if let Some(hunk) = first_change {
return format!(
"paragraph text differs between exact-match anchors (old: \"{}\" -> new: \"{}\")",
hunk.old_text.as_deref().unwrap_or_default(),
hunk.new_text.as_deref().unwrap_or_default()
);
}
"paragraph text differs between exact-match anchors".into()
}
fn text_hunks_for_nodes(
old_node: &SemanticNode,
new_node: &SemanticNode,
config: DiffConfig,
) -> Vec<TextHunk> {
let old_tokens = normalized_tokens(
old_node.normalized_text.as_deref().unwrap_or_default(),
config,
);
let new_tokens = normalized_tokens(
new_node.normalized_text.as_deref().unwrap_or_default(),
config,
);
text_hunks_from_tokens(&old_tokens, &new_tokens)
}
fn text_hunks_from_tokens(old_tokens: &[String], new_tokens: &[String]) -> Vec<TextHunk> {
let mut lengths = vec![vec![0usize; new_tokens.len() + 1]; old_tokens.len() + 1];
for old_index in (0..old_tokens.len()).rev() {
for new_index in (0..new_tokens.len()).rev() {
lengths[old_index][new_index] = if old_tokens[old_index] == new_tokens[new_index] {
lengths[old_index + 1][new_index + 1] + 1
} else {
lengths[old_index + 1][new_index].max(lengths[old_index][new_index + 1])
};
}
}
let mut hunks = Vec::new();
let mut deleted = Vec::new();
let mut inserted = Vec::new();
let mut old_index = 0;
let mut new_index = 0;
while old_index < old_tokens.len() || new_index < new_tokens.len() {
if old_index < old_tokens.len()
&& new_index < new_tokens.len()
&& old_tokens[old_index] == new_tokens[new_index]
{
flush_change_hunk(&mut hunks, &mut deleted, &mut inserted);
push_or_merge_hunk(
&mut hunks,
TextHunkKind::Equal,
Some(old_tokens[old_index].clone()),
Some(new_tokens[new_index].clone()),
);
old_index += 1;
new_index += 1;
} else if new_index >= new_tokens.len()
|| (old_index < old_tokens.len()
&& lengths[old_index + 1][new_index] >= lengths[old_index][new_index + 1])
{
deleted.push(old_tokens[old_index].clone());
old_index += 1;
} else {
inserted.push(new_tokens[new_index].clone());
new_index += 1;
}
}
flush_change_hunk(&mut hunks, &mut deleted, &mut inserted);
hunks
}
fn flush_change_hunk(
hunks: &mut Vec<TextHunk>,
deleted: &mut Vec<String>,
inserted: &mut Vec<String>,
) {
if deleted.is_empty() && inserted.is_empty() {
return;
}
let kind = match (deleted.is_empty(), inserted.is_empty()) {
(false, false) => TextHunkKind::Replaced,
(false, true) => TextHunkKind::Deleted,
(true, false) => TextHunkKind::Inserted,
(true, true) => unreachable!(),
};
push_or_merge_hunk(
hunks,
kind,
(!deleted.is_empty()).then(|| deleted.join(" ")),
(!inserted.is_empty()).then(|| inserted.join(" ")),
);
deleted.clear();
inserted.clear();
}
fn push_or_merge_hunk(
hunks: &mut Vec<TextHunk>,
kind: TextHunkKind,
old_text: Option<String>,
new_text: Option<String>,
) {
if let Some(last) = hunks.last_mut() {
if last.kind == kind {
append_optional_text(&mut last.old_text, old_text);
append_optional_text(&mut last.new_text, new_text);
return;
}
}
hunks.push(TextHunk {
kind,
old_text,
new_text,
});
}
fn append_optional_text(target: &mut Option<String>, addition: Option<String>) {
let Some(addition) = addition else {
return;
};
match target {
Some(target) if !target.is_empty() => {
target.push(' ');
target.push_str(&addition);
}
Some(target) => target.push_str(&addition),
None => *target = Some(addition),
}
}
fn normalized_tokens(text: &str, config: DiffConfig) -> Vec<String> {
let base = if config.ignore_whitespace {
text.split_whitespace().collect::<Vec<_>>().join(" ")
} else {
text.to_string()
};
let normalized = if config.ignore_case {
base.to_lowercase()
} else {
base
};
normalized
.split_whitespace()
.map(ToOwned::to_owned)
.collect()
}
fn relabel_insert_delete_pairs_as_moves(
document: &mut DiffDocument,
config: DiffConfig,
classifier: &impl SeverityClassifier,
) {
let mut deleted_candidates = document
.changes
.iter()
.enumerate()
.filter_map(|(index, change)| (change.kind == ChangeKind::Deleted).then_some(index))
.collect::<Vec<_>>();
let mut consumed_deleted = BTreeSet::new();
for insert_index in 0..document.changes.len() {
if document.changes[insert_index].kind != ChangeKind::Inserted {
continue;
}
let Some(insert_text) = document.changes[insert_index]
.new_node
.as_ref()
.and_then(|node| node.text.as_ref())
else {
continue;
};
let insert_text = normalize_text_for_match(insert_text, config);
let Some((position, deleted_index)) =
deleted_candidates
.iter()
.enumerate()
.find_map(|(position, deleted_index)| {
let deleted_text = document.changes[*deleted_index]
.old_node
.as_ref()
.and_then(|node| node.text.as_ref())?;
(normalize_text_for_match(deleted_text, config) == insert_text)
.then_some((position, *deleted_index))
})
else {
continue;
};
let old_evidence = document.changes[deleted_index].old_node.clone();
let change = &mut document.changes[insert_index];
change.kind = ChangeKind::Moved;
change.old_node = old_evidence;
change.reason = "paragraph text moved to a different reading-order position".into();
change.severity = classifier.classify(change);
consumed_deleted.insert(deleted_index);
deleted_candidates.remove(position);
}
if consumed_deleted.is_empty() {
return;
}
let retained = document
.changes
.iter()
.enumerate()
.filter_map(|(index, change)| {
(!consumed_deleted.contains(&index)).then_some(change.clone())
})
.collect::<Vec<_>>();
document.changes = retained;
renumber_and_recompute_summary(document);
}
fn normalize_text_for_match(text: &str, config: DiffConfig) -> String {
let text = if config.ignore_whitespace {
text.split_whitespace().collect::<Vec<_>>().join(" ")
} else {
text.to_string()
};
if config.ignore_case {
text.to_lowercase()
} else {
text
}
}
fn renumber_and_recompute_summary(document: &mut DiffDocument) {
document.summary.inserted = 0;
document.summary.deleted = 0;
document.summary.modified = 0;
document.summary.moved = 0;
document.summary.layout_changed = 0;
for (index, change) in document.changes.iter_mut().enumerate() {
change.id = format!("change-{index:04}");
match change.kind {
ChangeKind::Inserted => document.summary.inserted += 1,
ChangeKind::Deleted => document.summary.deleted += 1,
ChangeKind::Modified => document.summary.modified += 1,
ChangeKind::Moved => document.summary.moved += 1,
ChangeKind::LayoutChanged => document.summary.layout_changed += 1,
_ => {}
}
}
}
fn to_evidence(node: &SemanticNode) -> SemanticNodeEvidence {
SemanticNodeEvidence {
node_id: node.id.clone(),
page: node.page_index,
bbox: node.bbox,
text: node.normalized_text.clone(),
source: node.source.clone(),
}
}
#[cfg(test)]
mod tests {
use super::*;
use pdf_semantic::{SemanticAnchor, SemanticNode, SemanticNodeKind};
use spdfdiff_types::Provenance;
#[test]
fn detects_modified_paragraph() {
let old = document_with_text("old", "Hello");
let new = document_with_text("new", "Hello world");
let diff = diff_semantic_documents(&old, &new, DiffConfig::default());
assert_eq!(diff.summary.modified, 1);
assert_eq!(diff.changes[0].id, "change-0000");
assert_eq!(diff.changes[0].kind, ChangeKind::Modified);
assert!(
diff.changes[0]
.reason
.contains("old: \"\" -> new: \"world\"")
);
assert_eq!(diff.changes[0].text_hunks.len(), 2);
assert_eq!(diff.changes[0].text_hunks[1].kind, TextHunkKind::Inserted);
assert_eq!(
diff.changes[0].text_hunks[1].new_text.as_deref(),
Some("world")
);
assert!(diff.changes[0].reason.contains("fuzzy_match_score=0.850"));
assert_ne!(diff.changes[0].severity, ChangeSeverity::Critical);
}
#[test]
fn anchors_unchanged_text_around_inserted_paragraph() {
let old = document_with_texts("old", &["Alpha", "Beta"]);
let new = document_with_texts("new", &["Alpha", "Inserted", "Beta"]);
let diff = diff_semantic_documents(&old, &new, DiffConfig::default());
assert_eq!(diff.summary.inserted, 1);
assert_eq!(diff.summary.modified, 0);
assert_eq!(diff.changes.len(), 1);
assert_eq!(diff.changes[0].id, "change-0000");
assert_eq!(diff.changes[0].kind, ChangeKind::Inserted);
assert_eq!(
diff.changes[0].new_node.as_ref().unwrap().text.as_deref(),
Some("Inserted")
);
}
#[test]
fn anchors_unchanged_text_around_deleted_paragraph() {
let old = document_with_texts("old", &["Alpha", "Deleted", "Beta"]);
let new = document_with_texts("new", &["Alpha", "Beta"]);
let diff = diff_semantic_documents(&old, &new, DiffConfig::default());
assert_eq!(diff.summary.deleted, 1);
assert_eq!(diff.summary.modified, 0);
assert_eq!(diff.changes.len(), 1);
assert_eq!(diff.changes[0].kind, ChangeKind::Deleted);
assert_eq!(
diff.changes[0].old_node.as_ref().unwrap().text.as_deref(),
Some("Deleted")
);
}
#[test]
fn fuzzy_matches_edited_paragraph_between_exact_anchors() {
let old = document_with_texts("old", &["Alpha", "Payment is due in 30 days", "Omega"]);
let new = document_with_texts("new", &["Alpha", "Payment is due in 15 days", "Omega"]);
let diff = diff_semantic_documents(&old, &new, DiffConfig::default());
assert_eq!(diff.summary.modified, 1);
assert_eq!(diff.summary.inserted, 0);
assert_eq!(diff.summary.deleted, 0);
assert_eq!(diff.changes[0].kind, ChangeKind::Modified);
assert!((diff.changes[0].confidence - 0.833).abs() < 0.001);
assert!(diff.changes[0].reason.contains("fuzzy_match_score=0.833"));
assert!(
diff.changes[0]
.text_hunks
.iter()
.any(|hunk| hunk.kind == TextHunkKind::Replaced
&& hunk.old_text.as_deref() == Some("30")
&& hunk.new_text.as_deref() == Some("15"))
);
assert_eq!(
diff.changes[0].old_node.as_ref().unwrap().text.as_deref(),
Some("Payment is due in 30 days")
);
assert_eq!(
diff.changes[0].new_node.as_ref().unwrap().text.as_deref(),
Some("Payment is due in 15 days")
);
}
#[test]
fn leaves_low_confidence_unmatched_blocks_as_delete_insert() {
let old = document_with_texts("old", &["Alpha", "Cat dog", "Omega"]);
let new = document_with_texts("new", &["Alpha", "Invoice total", "Omega"]);
let diff = diff_semantic_documents(&old, &new, DiffConfig::default());
assert_eq!(diff.summary.modified, 0);
assert_eq!(diff.summary.deleted, 1);
assert_eq!(diff.summary.inserted, 1);
assert_eq!(diff.changes[0].kind, ChangeKind::Deleted);
assert_eq!(diff.changes[1].kind, ChangeKind::Inserted);
}
#[test]
fn fuzzy_matching_respects_minimum_score() {
let old = document_with_texts("old", &["Alpha", "Payment is due in 30 days", "Omega"]);
let new = document_with_texts("new", &["Alpha", "Payment is due in 15 days", "Omega"]);
let config = DiffConfig {
min_match_score: 0.95,
..DiffConfig::default()
};
let diff = diff_semantic_documents(&old, &new, config);
assert_eq!(diff.summary.modified, 0);
assert_eq!(diff.summary.deleted, 1);
assert_eq!(diff.summary.inserted, 1);
}
#[test]
fn exact_matching_falls_back_when_matrix_limit_is_exceeded() {
let old = document_with_texts("old", &["Alpha", "Beta", "Gamma"]);
let new = document_with_texts("new", &["Inserted before", "Alpha", "Beta", "Gamma"]);
let config = DiffConfig {
max_match_matrix_cells: 1,
..DiffConfig::default()
};
let diff = diff_semantic_documents(&old, &new, config);
assert_eq!(diff.summary.inserted, 1);
assert_eq!(diff.summary.deleted, 0);
assert_eq!(diff.summary.modified, 0);
assert!(
diff.diagnostics
.iter()
.any(|diagnostic| diagnostic.code == "EXACT_MATCH_LIMIT_EXCEEDED")
);
}
#[test]
fn fuzzy_matching_falls_back_when_matrix_limit_is_exceeded() {
let old = document_with_texts("old", &["Alpha", "Payment is due in 30 days", "Omega"]);
let new = document_with_texts("new", &["Alpha", "Payment is due in 15 days", "Omega"]);
let config = DiffConfig {
max_match_matrix_cells: 0,
..DiffConfig::default()
};
let diff = diff_semantic_documents(&old, &new, config);
assert_eq!(diff.summary.modified, 1);
assert_eq!(diff.summary.inserted, 0);
assert_eq!(diff.summary.deleted, 0);
assert!(
diff.diagnostics
.iter()
.any(|diagnostic| diagnostic.code == "FUZZY_MATCH_LIMIT_EXCEEDED")
);
}
#[test]
fn bounded_matching_handles_many_unmatched_nodes_without_large_matrices() {
let old_texts = (0..600)
.map(|index| format!("Old paragraph {index} alpha beta"))
.collect::<Vec<_>>();
let new_texts = (0..600)
.map(|index| format!("New paragraph {index} gamma delta"))
.collect::<Vec<_>>();
let old = document_with_owned_texts("old", old_texts);
let new = document_with_owned_texts("new", new_texts);
let config = DiffConfig {
max_match_matrix_cells: 64,
max_greedy_match_candidates: 8,
..DiffConfig::default()
};
let diff = diff_semantic_documents(&old, &new, config);
assert_eq!(diff.summary.modified, 0);
assert_eq!(diff.summary.deleted, 600);
assert_eq!(diff.summary.inserted, 600);
assert!(
diff.diagnostics
.iter()
.any(|diagnostic| diagnostic.code == "EXACT_MATCH_LIMIT_EXCEEDED")
);
assert!(
diff.diagnostics
.iter()
.any(|diagnostic| diagnostic.code == "FUZZY_MATCH_LIMIT_EXCEEDED")
);
}
#[test]
fn detects_moved_paragraph_from_insert_delete_pair() {
let old = document_with_texts("old", &["Alpha", "Beta", "Gamma"]);
let new = document_with_texts("new", &["Beta", "Alpha", "Gamma"]);
let diff = diff_semantic_documents(&old, &new, DiffConfig::default());
assert_eq!(diff.summary.moved, 1);
assert_eq!(diff.summary.inserted, 0);
assert_eq!(diff.summary.deleted, 0);
assert_eq!(diff.changes.len(), 1);
assert_eq!(diff.changes[0].kind, ChangeKind::Moved);
}
struct MajorClassifier;
impl SeverityClassifier for MajorClassifier {
fn classify(&self, _change: &SemanticChange) -> ChangeSeverity {
ChangeSeverity::Major
}
}
#[test]
fn move_relabeling_respects_custom_classifier() {
let old = document_with_texts("old", &["Alpha", "Beta", "Gamma"]);
let new = document_with_texts("new", &["Beta", "Alpha", "Gamma"]);
let diff = diff_semantic_documents_with_classifier(
&old,
&new,
DiffConfig::default(),
&MajorClassifier,
);
assert_eq!(diff.changes[0].kind, ChangeKind::Moved);
assert_eq!(diff.changes[0].severity, ChangeSeverity::Major);
}
#[test]
fn detects_layout_change_for_exact_text_match() {
let old = document_with_positioned_text("old", "Alpha", 0.0, 0.0);
let new = document_with_positioned_text("new", "Alpha", 12.0, 0.0);
let diff = diff_semantic_documents(&old, &new, DiffConfig::default());
assert_eq!(diff.summary.layout_changed, 1);
assert_eq!(diff.summary.modified, 0);
assert_eq!(diff.changes[0].kind, ChangeKind::LayoutChanged);
assert_eq!(diff.changes[0].severity, ChangeSeverity::Minor);
}
#[test]
fn ignores_layout_change_inside_tolerance() {
let old = document_with_positioned_text("old", "Alpha", 0.0, 0.0);
let new = document_with_positioned_text("new", "Alpha", 1.0, 0.0);
let diff = diff_semantic_documents(&old, &new, DiffConfig::default());
assert_eq!(diff.summary.layout_changed, 0);
assert_eq!(diff.changes, Vec::new());
}
fn document_with_text(fingerprint: &str, text: &str) -> SemanticDocument {
document_with_texts(fingerprint, &[text])
}
fn document_with_texts(fingerprint: &str, texts: &[&str]) -> SemanticDocument {
document_with_owned_texts(fingerprint, texts.iter().map(|text| (*text).to_owned()))
}
fn document_with_owned_texts(
fingerprint: &str,
texts: impl IntoIterator<Item = String>,
) -> SemanticDocument {
SemanticDocument {
fingerprint: fingerprint.into(),
nodes: texts
.into_iter()
.enumerate()
.map(|(index, text)| SemanticNode {
id: format!("n{index:04}"),
kind: SemanticNodeKind::Paragraph,
page_index: 0,
bbox: None,
normalized_text: Some(text),
anchor: SemanticAnchor::unknown(),
source: vec![Provenance::unknown()],
confidence: 1.0,
})
.collect(),
diagnostics: Vec::new(),
tagged_structure: None,
}
}
fn document_with_positioned_text(
fingerprint: &str,
text: &str,
x: f32,
y: f32,
) -> SemanticDocument {
let mut document = document_with_text(fingerprint, text);
document.nodes[0].bbox = Some(spdfdiff_types::Rect {
x0: x,
y0: y,
x1: x + 10.0,
y1: y + 10.0,
});
document
}
}