words_to_data/diff/mod.rs
1use std::collections::{HashMap, HashSet};
2use std::str::FromStr;
3
4use rayon::prelude::*;
5use regex::Regex;
6use serde::{Deserialize, Serialize};
7use similar::{ChangeTag, TextDiff};
8use time::Date;
9
10use crate::constants::STOP_WORDS;
11use crate::uslm::{BillAmendment, ElementData, TextContentField, USLMElement, bill_parser::Bill};
12
13/// A change detected in a single text content field between two document versions
14///
15/// This struct captures the complete details of a change to one of the five text
16/// content fields (Heading, Chapeau, Proviso, Content, or Continuation) in a
17/// legislative element.
18///
19/// The changes are computed at word-level granularity using a diff algorithm,
20/// allowing precise identification of which words were inserted, deleted, or
21/// remained unchanged.
22#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
23#[serde(rename_all = "snake_case")]
24pub struct FieldChangeEvent {
25 /// Which text content field changed
26 pub field_name: TextContentField,
27
28 /// The publication date of the original version
29 pub from_date: Date,
30
31 /// The publication date of the new version
32 pub to_date: Date,
33
34 /// The complete original text of the field
35 pub old_value: String,
36
37 /// The complete new text of the field
38 pub new_value: String,
39
40 /// Word-level changes showing insertions, deletions, and unchanged portions
41 pub changes: Vec<TextChange>,
42}
43
44/// A single word-level change within a text field
45///
46/// Represents one unit of change in a diff, typically a word or whitespace token.
47/// Each change has a type (Insert, Delete, or Equal) and position indices in
48/// the old and new text.
49#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
50#[serde(rename_all = "snake_case")]
51pub struct TextChange {
52 /// The text value of this change (a word or whitespace)
53 pub value: String,
54
55 /// Position in the original text (None for insertions)
56 pub old_index: Option<i32>,
57
58 /// Position in the new text (None for deletions)
59 pub new_index: Option<i32>,
60
61 /// The type of change
62 pub tag: TextChangeType,
63}
64
65/// The type of change for a text fragment
66#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
67#[serde(rename_all = "snake_case")]
68pub enum TextChangeType {
69 /// This text was added in the new version
70 Insert,
71 /// This text was removed from the old version
72 Delete,
73 /// This text is unchanged between versions
74 Equal,
75}
76
77/// A hierarchical diff between two versions of a USLM document tree
78///
79/// This struct captures all changes between two versions of the same legislative
80/// element and its children. It mirrors the tree structure of `USLMElement`,
81/// with diffs computed recursively for all matching children.
82///
83/// # Structure
84///
85/// The diff includes:
86/// - **Field changes**: Text modifications to the element's own content fields
87/// - **Added elements**: New child elements in the new version
88/// - **Removed elements**: Child elements that existed in the old version but not the new
89/// - **Child diffs**: Recursive diffs for child elements that exist in both versions
90///
91/// # Examples
92///
93/// ```
94/// use words_to_data::{diff::TreeDiff, uslm::parser::parse};
95///
96/// // Parse two versions of a document
97/// let old_doc = parse("tests/test_data/usc/2025-07-18/usc07.xml", "2025-07-18").unwrap();
98/// let new_doc = parse("tests/test_data/usc/2025-07-30/usc07.xml", "2025-07-30").unwrap();
99///
100/// // Compute the diff
101/// let diff = TreeDiff::from_elements(&old_doc, &new_doc);
102///
103/// // Examine changes
104/// println!("Field changes: {}", diff.changes.len());
105/// println!("Elements added: {}", diff.added.len());
106/// println!("Elements removed: {}", diff.removed.len());
107/// ```
108#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
109#[serde(rename_all = "snake_case")]
110pub struct TreeDiff {
111 /// The structural path of the element being compared
112 pub root_path: String,
113
114 /// Text content field changes for this element
115 pub changes: Vec<FieldChangeEvent>,
116
117 /// Metadata from the original version of this element
118 pub from_element: ElementData,
119
120 /// Metadata from the new version of this element
121 pub to_element: ElementData,
122
123 /// Child elements that were added in the new version
124 pub added: Vec<ElementData>,
125
126 /// Child elements that were removed from the old version
127 pub removed: Vec<ElementData>,
128
129 /// Recursive diffs for child elements present in both versions
130 pub child_diffs: Vec<TreeDiff>,
131}
132
133impl TreeDiff {
134 /// Generate a regex for searching for mentions of an element
135 ///
136 /// Amendments and other legal texts tend to exclude chapters and titles when discussing legal references
137 /// instead, they have a tendency to directly state references, and pieces of them.
138 /// e.g.
139 ///
140 /// "According to Section 174 (a)(2)(A)"
141 ///
142 /// This function will generate compatible regexes for relevant strucutural elements to match those.
143 pub fn mention_regex(&self) -> Option<Regex> {
144 if self.root_path.contains("section") {
145 let mut mreg = String::from(self.section_regex().unwrap().as_str());
146 // Remove \D matcher from section regex
147 mreg.truncate(mreg.len() - 2);
148 let split: Vec<_> = self.root_path.split("/").collect();
149 let mut started = false;
150 for part in split {
151 // Skip parts without underscore (like "uscode")
152 let Some((part_name, part_num)) = part.split_once("_") else {
153 continue;
154 };
155 if started {
156 mreg += r"\(";
157 mreg += part_num;
158 mreg += r"\)\s*"
159 }
160 if part_name == "section" {
161 started = true;
162 }
163 }
164 Some(Regex::from_str(mreg.as_str()).unwrap())
165 } else {
166 None
167 }
168 }
169
170 pub fn section_regex(&self) -> Option<Regex> {
171 if self.root_path.contains("section") {
172 let mut regex = String::from(r"[Ss]ection\s*");
173 let split: Vec<_> = self.root_path.split("/").collect();
174 for part in split {
175 // Skip parts without underscore (like "uscode")
176 let Some((part_name, part_num)) = part.split_once("_") else {
177 continue;
178 };
179 if part_name == "section" {
180 regex += part_num;
181 regex += r"\D";
182 return Some(Regex::from_str(regex.as_str()).unwrap());
183 }
184 }
185 }
186 None
187 }
188
189 /// Generates a list of all candidate regexes for a TreeDiff
190 pub fn all_regexes(&self) -> Vec<Regex> {
191 let mut res = Vec::new();
192 if let Some(sreg) = self.section_regex() {
193 res.push(sreg.clone());
194 if let Some(mreg) = self.mention_regex()
195 && mreg.as_str() != sreg.as_str()
196 {
197 res.push(mreg);
198 }
199 }
200
201 res
202 }
203
204 // fn all_regexes_rec(&self, visited: &mut HashSet<String>) -> Vec<Regex> {
205 // let mut regs: Vec<Regex> = Vec::new();
206 // if let Some(reg) = self.mention_regex() {
207 // let reg_str = reg.to_string();
208 // if !visited.contains(®_str) {
209 // visited.insert(reg_str);
210 // regs.push(reg);
211 // }
212 // }
213 // if let Some(reg) = self.section_regex() {
214 // let reg_str = reg.to_string();
215 // if !visited.contains(®_str) {
216 // visited.insert(reg_str);
217 // regs.push(reg);
218 // }
219 // }
220 // for child in self.child_diffs.iter() {
221 // let mut child_regs = child.all_regexes_rec(visited);
222 // regs.append(&mut child_regs);
223 // }
224 // regs
225 // }
226
227 /// Compute the diff between two USLM element trees
228 ///
229 /// Compares two versions of the same legislative element and computes all
230 /// changes at both the current level and recursively through all children.
231 ///
232 /// # Arguments
233 ///
234 /// * `from_element` - The original (older) version of the element
235 /// * `to_element` - The new (newer) version of the element
236 ///
237 /// # Panics
238 ///
239 /// Panics if the two elements don't have the same structural path, as they
240 /// must represent the same logical element in different versions.
241 ///
242 /// # Returns
243 ///
244 /// A `TreeDiff` containing all detected changes between the two versions.
245 ///
246 /// # Examples
247 ///
248 /// ```
249 /// # use words_to_data::{diff::TreeDiff, uslm::parser::parse};
250 /// let old = parse("tests/test_data/usc/2025-07-18/usc07.xml", "2025-07-18").unwrap();
251 /// let new = parse("tests/test_data/usc/2025-07-30/usc07.xml", "2025-07-30").unwrap();
252 ///
253 /// let diff = TreeDiff::from_elements(&old, &new);
254 /// assert_eq!(diff.root_path, old.data.path);
255 /// ```
256 pub fn from_elements(from_element: &USLMElement, to_element: &USLMElement) -> TreeDiff {
257 assert!(from_element.data.path == to_element.data.path);
258 let root_path = from_element.data.path.clone();
259 // 1. Diff the root element's fields
260 let changes = diff_elements(from_element, to_element);
261
262 // 2. Build HashMaps of children by path
263 let children_a: HashMap<String, &USLMElement> = from_element
264 .children
265 .iter()
266 .map(|child| (child.data.path.clone(), child))
267 .collect();
268 let children_b: HashMap<String, &USLMElement> = to_element
269 .children
270 .iter()
271 .map(|child| (child.data.path.clone(), child))
272 .collect();
273
274 // 3. Find added, removed, matched
275 let mut added = vec![];
276 let mut removed = vec![];
277 let mut child_diffs = vec![];
278 // Iterate once through A - handle matched and removed
279 for (path, child_a) in &children_a {
280 match children_b.get(path) {
281 Some(child_b) => {
282 // Matched - recurse
283 let child_diff = TreeDiff::from_elements(child_a, child_b);
284 if !child_diff.child_diffs.is_empty() || !child_diff.changes.is_empty() {
285 child_diffs.push(child_diff);
286 }
287 }
288 None => {
289 // Removed
290 removed.push(child_a.data.clone()); //ElementSnapshot::from(child_a));
291 }
292 }
293 }
294
295 // Iterate through B for added only
296 for (path, child_b) in &children_b {
297 if !children_a.contains_key(path) {
298 added.push(child_b.data.clone()); //ElementSnapshot::from(child_b));
299 }
300 }
301
302 TreeDiff {
303 changes,
304 root_path,
305 from_element: from_element.data.clone(),
306 to_element: to_element.data.clone(),
307 added,
308 removed,
309 child_diffs,
310 }
311 }
312
313 /// Search for a diff by its structural path
314 ///
315 /// Recursively searches this element and all descendants for an element
316 /// with the specified path. The path must be a fully qualified structural
317 /// path (e.g., "uscode/title_7/chapter_1/section_1").
318 ///
319 /// # Arguments
320 ///
321 /// * `path` - The full structural path of the element to find
322 ///
323 /// # Returns
324 ///
325 /// Returns `Some(&TreeDiff)` if an element with the matching path is found,
326 /// or `None` if no such element exists in this tree.
327 pub fn find(&self, path: &str) -> Option<&TreeDiff> {
328 if path == self.root_path.as_str() {
329 return Some(self);
330 }
331 let remaining_path = path.strip_prefix(self.root_path.as_str())?;
332 let next_step: Vec<&str> = remaining_path.split("/").collect();
333 assert!(next_step.len() > 1);
334
335 let child_id = next_step[1];
336 let child_vec: Vec<&TreeDiff> = self
337 .child_diffs
338 .iter()
339 .filter(|c| c.root_path.ends_with(child_id))
340 .collect();
341 if child_vec.is_empty() {
342 None
343 } else {
344 assert!(child_vec.len() == 1);
345 child_vec[0].find(path)
346 }
347 }
348
349 /// Calculate the similarity of diffs in the TreeDiff with the amendment data from a bill
350 ///
351 /// Returns a hashmap with the key being the root_path in the tree diff and the value
352 /// being the similarity data
353 pub fn calculate_amendment_similarities(
354 &self,
355 data: &Bill,
356 ) -> HashMap<String, AmendmentSimilarity> {
357 let mut result = HashMap::new();
358 self.calculate_similarities_recursive(&mut result, data);
359 result
360 }
361
362 fn calculate_similarities_recursive(
363 &self,
364 result: &mut HashMap<String, AmendmentSimilarity>,
365 data: &Bill,
366 ) {
367 // Check if this TreeDiff has any changes
368 if !self.changes.is_empty() {
369 // Find the best matching amendment
370 for (amendment_id, amendment) in &data.amendments {
371 if amendment.changes.is_empty() {
372 continue;
373 }
374
375 let similarity = self.calculate_match_with_amendment(amendment_id, amendment);
376
377 if similarity.score > 0.0 {
378 // Insert or update if this is a better match
379 let entry = result
380 .entry(self.root_path.clone())
381 .or_insert(similarity.clone());
382
383 if similarity.score > entry.score {
384 *entry = similarity;
385 }
386 }
387 }
388 }
389
390 // Recurse into children
391 for child_diff in &self.child_diffs {
392 child_diff.calculate_similarities_recursive(result, data);
393 }
394 }
395
396 fn calculate_match_with_amendment(
397 &self,
398 amendment_id: &str,
399 amendment: &BillAmendment,
400 ) -> AmendmentSimilarity {
401 // Collect all changed words from this TreeDiff (deletions + insertions)
402 let tree_diff_words: HashSet<String> = self.collect_tree_diff_words();
403 let tree_diff_count = tree_diff_words.len();
404
405 // Find the best-matching BillDiff within this amendment
406 let mut best_score = 0.0_f32;
407 let mut best_precision = 0.0_f32;
408 let mut best_recall = 0.0_f32;
409 let mut best_matched = 0_i32;
410
411 for bill_diff in &amendment.changes {
412 // Collect words from this specific BillDiff
413 let mut bill_diff_words: HashSet<String> = HashSet::new();
414 for word in &bill_diff.removed {
415 let trimmed = word.trim();
416 if !trimmed.is_empty() && !is_stop_word(trimmed) {
417 bill_diff_words.insert(trimmed.to_lowercase());
418 }
419 }
420 for word in &bill_diff.added {
421 let trimmed = word.trim();
422 if !trimmed.is_empty() && !is_stop_word(trimmed) {
423 bill_diff_words.insert(trimmed.to_lowercase());
424 }
425 }
426
427 if bill_diff_words.is_empty() {
428 continue;
429 }
430
431 // Calculate intersection with this BillDiff
432 let matched_words: i32 = tree_diff_words
433 .iter()
434 .filter(|w| bill_diff_words.contains(*w))
435 .count() as i32;
436
437 let bill_diff_count = bill_diff_words.len();
438
439 // Calculate precision: how well this BillDiff explains TreeDiff
440 let precision = if tree_diff_count > 0 {
441 matched_words as f32 / tree_diff_count as f32
442 } else {
443 0.0
444 };
445
446 // Calculate recall: how much of this BillDiff is in TreeDiff
447 let recall = if bill_diff_count > 0 {
448 matched_words as f32 / bill_diff_count as f32
449 } else {
450 0.0
451 };
452
453 // Calculate F1 score for this BillDiff
454 let score = if precision + recall > 0.0 {
455 2.0 * precision * recall / (precision + recall)
456 } else {
457 0.0
458 };
459
460 // Keep the best match
461 if score > best_score {
462 best_score = score;
463 best_precision = precision;
464 best_recall = recall;
465 best_matched = matched_words;
466 }
467 }
468
469 AmendmentSimilarity {
470 tree_diff_path: self.root_path.clone(),
471 amendment_id: amendment_id.to_string(),
472 score: best_score,
473 precision: best_precision,
474 recall: best_recall,
475 matched_words: best_matched,
476 tree_diff_words: tree_diff_count as i32,
477 }
478 }
479
480 /// Collect all significant changed words from this TreeDiff
481 fn collect_tree_diff_words(&self) -> HashSet<String> {
482 let mut words = HashSet::new();
483 for field_change in &self.changes {
484 for text_change in &field_change.changes {
485 let word = text_change.value.trim();
486 // Skip empty strings and stop words (case-insensitive)
487 if word.is_empty() || is_stop_word(word) {
488 continue;
489 }
490 match text_change.tag {
491 TextChangeType::Delete | TextChangeType::Insert => {
492 words.insert(word.to_lowercase());
493 }
494 TextChangeType::Equal => {}
495 }
496 }
497 }
498 words
499 }
500
501 /// Scan all amendment texts for mentions of changed sections.
502 ///
503 /// Uses the regexes from `all_regexes()` to find section mentions in each
504 /// amendment's `amending_text`. This helps identify which amendments might
505 /// be responsible for changes at specific structural paths.
506 ///
507 /// # Arguments
508 ///
509 /// * `data` - Bill data from a parsed bill
510 ///
511 /// # Returns
512 ///
513 /// A map from amendment_id to list of matches found in that amendment's text.
514 /// For each tree_diff_path, only the most specific (longest) match is kept.
515 pub fn scan_for_mentions(&self, data: &Bill) -> HashMap<String, Vec<MentionMatch>> {
516 // Collect all regexes with their source paths recursively
517 let regex_with_paths = self.collect_regexes_with_paths();
518
519 // Scan each amendment's text against all regexes
520 let mut results: HashMap<String, Vec<MentionMatch>> = HashMap::new();
521
522 for (amendment_id, amendment) in &data.amendments {
523 let text = &amendment.amending_text;
524 let all_matches: Vec<MentionMatch> = regex_with_paths
525 .par_iter()
526 .filter_map(|(path, reg)| {
527 reg.find(text).map(|mat| MentionMatch {
528 tree_diff_path: path.clone(),
529 matched_text: mat.as_str().to_string(),
530 })
531 })
532 .collect();
533
534 // Deduplicate: keep only the longest match per tree_diff_path
535 let mut best_by_path: HashMap<&str, &MentionMatch> = HashMap::new();
536 for m in &all_matches {
537 let dominated = best_by_path
538 .get(m.tree_diff_path.as_str())
539 .is_some_and(|existing| existing.matched_text.len() >= m.matched_text.len());
540 if !dominated {
541 best_by_path.insert(&m.tree_diff_path, m);
542 }
543 }
544 let matches: Vec<MentionMatch> = best_by_path.into_values().cloned().collect();
545
546 if !matches.is_empty() {
547 results.insert(amendment_id.clone(), matches);
548 }
549 }
550
551 results
552 }
553
554 /// Return a shallow copy of this TreeDiff without children.
555 ///
556 /// Useful when correlating a specific diff node with other data
557 /// without needing the full subtree.
558 pub fn shallow(&self) -> TreeDiff {
559 TreeDiff {
560 root_path: self.root_path.clone(),
561 changes: self.changes.clone(),
562 from_element: self.from_element.clone(),
563 to_element: self.to_element.clone(),
564 added: self.added.clone(),
565 removed: self.removed.clone(),
566 child_diffs: vec![],
567 }
568 }
569
570 /// Recursively collect regexes with their source paths from this TreeDiff.
571 fn collect_regexes_with_paths(&self) -> Vec<(String, Regex)> {
572 let mut result = Vec::new();
573
574 // Add regexes from this node
575 for reg in self.all_regexes() {
576 result.push((self.root_path.clone(), reg));
577 }
578
579 // Recurse into children
580 for child in &self.child_diffs {
581 result.extend(child.collect_regexes_with_paths());
582 }
583
584 result
585 }
586}
587
588/// Similarity between a TreeDiff and a bill amendment
589///
590/// Used to rank how likely a BillAmendment caused the changes at a TreeDiff location.
591#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
592pub struct AmendmentSimilarity {
593 /// The structural path of the TreeDiff node
594 pub tree_diff_path: String,
595 /// The ID of the matched BillAmendment
596 pub amendment_id: String,
597 /// Primary ranking metric (precision-weighted F1)
598 pub score: f32,
599 /// How well the amendment explains the TreeDiff's changes
600 /// |TreeDiff ∩ Amendment| / |TreeDiff|
601 pub precision: f32,
602 /// How much of the amendment is represented in this TreeDiff
603 /// |TreeDiff ∩ Amendment| / |Amendment|
604 pub recall: f32,
605 /// Number of words that matched between TreeDiff and Amendment
606 pub matched_words: i32,
607 /// Total significant words in the TreeDiff's changes
608 pub tree_diff_words: i32,
609}
610
611/// A match found when scanning amendment text for section mentions.
612///
613/// When scanning bill amendments against a TreeDiff's regexes, this struct
614/// captures each match, linking the structural path from the TreeDiff to
615/// the text that matched in the amendment.
616#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
617pub struct MentionMatch {
618 /// The structural path from the TreeDiff that generated this match
619 pub tree_diff_path: String,
620 /// The text that matched the regex pattern
621 pub matched_text: String,
622}
623
624/// Check if a word is a stop word (case-insensitive)
625fn is_stop_word(word: &str) -> bool {
626 let lower = word.to_lowercase();
627 STOP_WORDS.contains(&lower.as_str())
628}
629
630/// Compute field-level changes between two elements
631///
632/// Compares all five text content fields (Heading, Chapeau, Proviso, Content,
633/// Continuation) between two versions of the same element and returns change
634/// events for any fields that differ.
635///
636/// # Arguments
637///
638/// * `element_a` - The original version of the element
639/// * `element_b` - The new version of the element
640///
641/// # Returns
642///
643/// A vector of `FieldChangeEvent` for each field that has changes.
644/// Fields that are identical in both versions are omitted.
645///
646/// # Panics
647///
648/// Panics if the elements have different paths or types.
649pub fn diff_elements(element_a: &USLMElement, element_b: &USLMElement) -> Vec<FieldChangeEvent> {
650 assert!(element_a.data.path == element_b.data.path);
651 assert!(element_a.data.element_type == element_b.data.element_type);
652 let mut changes: Vec<FieldChangeEvent> = Vec::new();
653 for field_name in [
654 TextContentField::Heading,
655 TextContentField::Chapeau,
656 TextContentField::Proviso,
657 TextContentField::Content,
658 TextContentField::Continuation,
659 ]
660 .into_iter()
661 {
662 let field_changes = diff_field(element_a, element_b, field_name);
663 // Only include if there are actual changes (Insert or Delete), not just Equal
664 let has_real_changes = field_changes
665 .changes
666 .iter()
667 .any(|c| c.tag != TextChangeType::Equal);
668 if has_real_changes {
669 changes.push(field_changes);
670 }
671 }
672 changes
673}
674
675// databases don't like usizes, make it an i32
676// text content will never exceed the i32 range
677fn rewrap_usize(s: Option<usize>) -> Option<i32> {
678 s.map(|val| val as i32)
679}
680
681fn diff_field(
682 element_a: &USLMElement,
683 element_b: &USLMElement,
684 field_name: TextContentField,
685) -> FieldChangeEvent {
686 let a = element_a
687 .data
688 .get_text_content(field_name)
689 .unwrap_or_default();
690 let b = element_b
691 .data
692 .get_text_content(field_name)
693 .unwrap_or_default();
694
695 let diff = TextDiff::from_words(a.as_str(), b.as_str());
696 let changes: Vec<TextChange> = diff
697 .iter_all_changes()
698 // Keep all changes including Equal for word-level diff rendering
699 .map(|c| {
700 let tag = match c.tag() {
701 ChangeTag::Delete => TextChangeType::Delete,
702 ChangeTag::Insert => TextChangeType::Insert,
703 ChangeTag::Equal => TextChangeType::Equal,
704 };
705 TextChange {
706 value: String::from(c.value()),
707 old_index: rewrap_usize(c.old_index()),
708 new_index: rewrap_usize(c.new_index()),
709 tag,
710 }
711 })
712 .collect();
713 FieldChangeEvent {
714 field_name,
715 from_date: element_a.data.date,
716 to_date: element_b.data.date,
717 old_value: a,
718 new_value: b,
719 changes,
720 }
721}