Skip to main content

words_to_data/diff/
mod.rs

1use std::collections::{HashMap, HashSet};
2use std::str::FromStr;
3
4use rayon::prelude::*;
5use regex::Regex;
6use serde::{Deserialize, Serialize};
7use similar::{ChangeTag, TextDiff};
8use time::Date;
9
10use crate::constants::STOP_WORDS;
11use crate::uslm::{BillAmendment, ElementData, TextContentField, USLMElement, bill_parser::Bill};
12
13/// A change detected in a single text content field between two document versions
14///
15/// This struct captures the complete details of a change to one of the five text
16/// content fields (Heading, Chapeau, Proviso, Content, or Continuation) in a
17/// legislative element.
18///
19/// The changes are computed at word-level granularity using a diff algorithm,
20/// allowing precise identification of which words were inserted, deleted, or
21/// remained unchanged.
22#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
23#[serde(rename_all = "snake_case")]
24pub struct FieldChangeEvent {
25    /// Which text content field changed
26    pub field_name: TextContentField,
27
28    /// The publication date of the original version
29    pub from_date: Date,
30
31    /// The publication date of the new version
32    pub to_date: Date,
33
34    /// The complete original text of the field
35    pub old_value: String,
36
37    /// The complete new text of the field
38    pub new_value: String,
39
40    /// Word-level changes showing insertions, deletions, and unchanged portions
41    pub changes: Vec<TextChange>,
42}
43
44/// A single word-level change within a text field
45///
46/// Represents one unit of change in a diff, typically a word or whitespace token.
47/// Each change has a type (Insert, Delete, or Equal) and position indices in
48/// the old and new text.
49#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
50#[serde(rename_all = "snake_case")]
51pub struct TextChange {
52    /// The text value of this change (a word or whitespace)
53    pub value: String,
54
55    /// Position in the original text (None for insertions)
56    pub old_index: Option<i32>,
57
58    /// Position in the new text (None for deletions)
59    pub new_index: Option<i32>,
60
61    /// The type of change
62    pub tag: TextChangeType,
63}
64
65/// The type of change for a text fragment
66#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
67#[serde(rename_all = "snake_case")]
68pub enum TextChangeType {
69    /// This text was added in the new version
70    Insert,
71    /// This text was removed from the old version
72    Delete,
73    /// This text is unchanged between versions
74    Equal,
75}
76
77/// A hierarchical diff between two versions of a USLM document tree
78///
79/// This struct captures all changes between two versions of the same legislative
80/// element and its children. It mirrors the tree structure of `USLMElement`,
81/// with diffs computed recursively for all matching children.
82///
83/// # Structure
84///
85/// The diff includes:
86/// - **Field changes**: Text modifications to the element's own content fields
87/// - **Added elements**: New child elements in the new version
88/// - **Removed elements**: Child elements that existed in the old version but not the new
89/// - **Child diffs**: Recursive diffs for child elements that exist in both versions
90///
91/// # Examples
92///
93/// ```
94/// use words_to_data::{diff::TreeDiff, uslm::parser::parse};
95///
96/// // Parse two versions of a document
97/// let old_doc = parse("tests/test_data/usc/2025-07-18/usc07.xml", "2025-07-18").unwrap();
98/// let new_doc = parse("tests/test_data/usc/2025-07-30/usc07.xml", "2025-07-30").unwrap();
99///
100/// // Compute the diff
101/// let diff = TreeDiff::from_elements(&old_doc, &new_doc);
102///
103/// // Examine changes
104/// println!("Field changes: {}", diff.changes.len());
105/// println!("Elements added: {}", diff.added.len());
106/// println!("Elements removed: {}", diff.removed.len());
107/// ```
108#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
109#[serde(rename_all = "snake_case")]
110pub struct TreeDiff {
111    /// The structural path of the element being compared
112    pub root_path: String,
113
114    /// Text content field changes for this element
115    pub changes: Vec<FieldChangeEvent>,
116
117    /// Metadata from the original version of this element
118    pub from_element: ElementData,
119
120    /// Metadata from the new version of this element
121    pub to_element: ElementData,
122
123    /// Child elements that were added in the new version
124    pub added: Vec<ElementData>,
125
126    /// Child elements that were removed from the old version
127    pub removed: Vec<ElementData>,
128
129    /// Recursive diffs for child elements present in both versions
130    pub child_diffs: Vec<TreeDiff>,
131}
132
133impl TreeDiff {
134    /// Generate a regex for searching for mentions of an element
135    ///
136    /// Amendments and other legal texts tend to exclude chapters and titles when discussing legal references
137    /// instead, they have a tendency to directly state references, and pieces of them.
138    /// e.g.
139    ///
140    /// "According to Section 174 (a)(2)(A)"
141    ///
142    /// This function will generate compatible regexes for relevant strucutural elements to match those.
143    pub fn mention_regex(&self) -> Option<Regex> {
144        if self.root_path.contains("section") {
145            let mut mreg = String::from(self.section_regex().unwrap().as_str());
146            // Remove \D matcher from section regex
147            mreg.truncate(mreg.len() - 2);
148            let split: Vec<_> = self.root_path.split("/").collect();
149            let mut started = false;
150            for part in split {
151                // Skip parts without underscore (like "uscode")
152                let Some((part_name, part_num)) = part.split_once("_") else {
153                    continue;
154                };
155                if started {
156                    mreg += r"\(";
157                    mreg += part_num;
158                    mreg += r"\)\s*"
159                }
160                if part_name == "section" {
161                    started = true;
162                }
163            }
164            Some(Regex::from_str(mreg.as_str()).unwrap())
165        } else {
166            None
167        }
168    }
169
170    pub fn section_regex(&self) -> Option<Regex> {
171        if self.root_path.contains("section") {
172            let mut regex = String::from(r"[Ss]ection\s*");
173            let split: Vec<_> = self.root_path.split("/").collect();
174            for part in split {
175                // Skip parts without underscore (like "uscode")
176                let Some((part_name, part_num)) = part.split_once("_") else {
177                    continue;
178                };
179                if part_name == "section" {
180                    regex += part_num;
181                    regex += r"\D";
182                    return Some(Regex::from_str(regex.as_str()).unwrap());
183                }
184            }
185        }
186        None
187    }
188
189    /// Generates a list of all candidate regexes for a TreeDiff
190    pub fn all_regexes(&self) -> Vec<Regex> {
191        let mut res = Vec::new();
192        if let Some(sreg) = self.section_regex() {
193            res.push(sreg.clone());
194            if let Some(mreg) = self.mention_regex()
195                && mreg.as_str() != sreg.as_str()
196            {
197                res.push(mreg);
198            }
199        }
200
201        res
202    }
203
204    // fn all_regexes_rec(&self, visited: &mut HashSet<String>) -> Vec<Regex> {
205    //     let mut regs: Vec<Regex> = Vec::new();
206    //     if let Some(reg) = self.mention_regex() {
207    //         let reg_str = reg.to_string();
208    //         if !visited.contains(&reg_str) {
209    //             visited.insert(reg_str);
210    //             regs.push(reg);
211    //         }
212    //     }
213    //     if let Some(reg) = self.section_regex() {
214    //         let reg_str = reg.to_string();
215    //         if !visited.contains(&reg_str) {
216    //             visited.insert(reg_str);
217    //             regs.push(reg);
218    //         }
219    //     }
220    //     for child in self.child_diffs.iter() {
221    //         let mut child_regs = child.all_regexes_rec(visited);
222    //         regs.append(&mut child_regs);
223    //     }
224    //     regs
225    // }
226
227    /// Compute the diff between two USLM element trees
228    ///
229    /// Compares two versions of the same legislative element and computes all
230    /// changes at both the current level and recursively through all children.
231    ///
232    /// # Arguments
233    ///
234    /// * `from_element` - The original (older) version of the element
235    /// * `to_element` - The new (newer) version of the element
236    ///
237    /// # Panics
238    ///
239    /// Panics if the two elements don't have the same structural path, as they
240    /// must represent the same logical element in different versions.
241    ///
242    /// # Returns
243    ///
244    /// A `TreeDiff` containing all detected changes between the two versions.
245    ///
246    /// # Examples
247    ///
248    /// ```
249    /// # use words_to_data::{diff::TreeDiff, uslm::parser::parse};
250    /// let old = parse("tests/test_data/usc/2025-07-18/usc07.xml", "2025-07-18").unwrap();
251    /// let new = parse("tests/test_data/usc/2025-07-30/usc07.xml", "2025-07-30").unwrap();
252    ///
253    /// let diff = TreeDiff::from_elements(&old, &new);
254    /// assert_eq!(diff.root_path, old.data.path);
255    /// ```
256    pub fn from_elements(from_element: &USLMElement, to_element: &USLMElement) -> TreeDiff {
257        assert!(from_element.data.path == to_element.data.path);
258        let root_path = from_element.data.path.clone();
259        // 1. Diff the root element's fields
260        let changes = diff_elements(from_element, to_element);
261
262        // 2. Build HashMaps of children by path
263        let children_a: HashMap<String, &USLMElement> = from_element
264            .children
265            .iter()
266            .map(|child| (child.data.path.clone(), child))
267            .collect();
268        let children_b: HashMap<String, &USLMElement> = to_element
269            .children
270            .iter()
271            .map(|child| (child.data.path.clone(), child))
272            .collect();
273
274        // 3. Find added, removed, matched
275        let mut added = vec![];
276        let mut removed = vec![];
277        let mut child_diffs = vec![];
278        // Iterate once through A - handle matched and removed
279        for (path, child_a) in &children_a {
280            match children_b.get(path) {
281                Some(child_b) => {
282                    // Matched - recurse
283                    let child_diff = TreeDiff::from_elements(child_a, child_b);
284                    if !child_diff.child_diffs.is_empty() || !child_diff.changes.is_empty() {
285                        child_diffs.push(child_diff);
286                    }
287                }
288                None => {
289                    // Removed
290                    removed.push(child_a.data.clone()); //ElementSnapshot::from(child_a));
291                }
292            }
293        }
294
295        // Iterate through B for added only
296        for (path, child_b) in &children_b {
297            if !children_a.contains_key(path) {
298                added.push(child_b.data.clone()); //ElementSnapshot::from(child_b));
299            }
300        }
301
302        TreeDiff {
303            changes,
304            root_path,
305            from_element: from_element.data.clone(),
306            to_element: to_element.data.clone(),
307            added,
308            removed,
309            child_diffs,
310        }
311    }
312
313    /// Search for a diff by its structural path
314    ///
315    /// Recursively searches this element and all descendants for an element
316    /// with the specified path. The path must be a fully qualified structural
317    /// path (e.g., "uscode/title_7/chapter_1/section_1").
318    ///
319    /// # Arguments
320    ///
321    /// * `path` - The full structural path of the element to find
322    ///
323    /// # Returns
324    ///
325    /// Returns `Some(&TreeDiff)` if an element with the matching path is found,
326    /// or `None` if no such element exists in this tree.
327    pub fn find(&self, path: &str) -> Option<&TreeDiff> {
328        if path == self.root_path.as_str() {
329            return Some(self);
330        }
331        let remaining_path = path.strip_prefix(self.root_path.as_str())?;
332        let next_step: Vec<&str> = remaining_path.split("/").collect();
333        assert!(next_step.len() > 1);
334
335        let child_id = next_step[1];
336        let child_vec: Vec<&TreeDiff> = self
337            .child_diffs
338            .iter()
339            .filter(|c| c.root_path.ends_with(child_id))
340            .collect();
341        if child_vec.is_empty() {
342            None
343        } else {
344            assert!(child_vec.len() == 1);
345            child_vec[0].find(path)
346        }
347    }
348
349    /// Calculate the similarity of diffs in the TreeDiff with the amendment data from a bill
350    ///
351    /// Returns a hashmap with the key being the root_path in the tree diff and the value
352    /// being the similarity data
353    pub fn calculate_amendment_similarities(
354        &self,
355        data: &Bill,
356    ) -> HashMap<String, AmendmentSimilarity> {
357        let mut result = HashMap::new();
358        self.calculate_similarities_recursive(&mut result, data);
359        result
360    }
361
362    fn calculate_similarities_recursive(
363        &self,
364        result: &mut HashMap<String, AmendmentSimilarity>,
365        data: &Bill,
366    ) {
367        // Check if this TreeDiff has any changes
368        if !self.changes.is_empty() {
369            // Find the best matching amendment
370            for (amendment_id, amendment) in &data.amendments {
371                if amendment.changes.is_empty() {
372                    continue;
373                }
374
375                let similarity = self.calculate_match_with_amendment(amendment_id, amendment);
376
377                if similarity.score > 0.0 {
378                    // Insert or update if this is a better match
379                    let entry = result
380                        .entry(self.root_path.clone())
381                        .or_insert(similarity.clone());
382
383                    if similarity.score > entry.score {
384                        *entry = similarity;
385                    }
386                }
387            }
388        }
389
390        // Recurse into children
391        for child_diff in &self.child_diffs {
392            child_diff.calculate_similarities_recursive(result, data);
393        }
394    }
395
396    fn calculate_match_with_amendment(
397        &self,
398        amendment_id: &str,
399        amendment: &BillAmendment,
400    ) -> AmendmentSimilarity {
401        // Collect all changed words from this TreeDiff (deletions + insertions)
402        let tree_diff_words: HashSet<String> = self.collect_tree_diff_words();
403        let tree_diff_count = tree_diff_words.len();
404
405        // Find the best-matching BillDiff within this amendment
406        let mut best_score = 0.0_f32;
407        let mut best_precision = 0.0_f32;
408        let mut best_recall = 0.0_f32;
409        let mut best_matched = 0_i32;
410
411        for bill_diff in &amendment.changes {
412            // Collect words from this specific BillDiff
413            let mut bill_diff_words: HashSet<String> = HashSet::new();
414            for word in &bill_diff.removed {
415                let trimmed = word.trim();
416                if !trimmed.is_empty() && !is_stop_word(trimmed) {
417                    bill_diff_words.insert(trimmed.to_lowercase());
418                }
419            }
420            for word in &bill_diff.added {
421                let trimmed = word.trim();
422                if !trimmed.is_empty() && !is_stop_word(trimmed) {
423                    bill_diff_words.insert(trimmed.to_lowercase());
424                }
425            }
426
427            if bill_diff_words.is_empty() {
428                continue;
429            }
430
431            // Calculate intersection with this BillDiff
432            let matched_words: i32 = tree_diff_words
433                .iter()
434                .filter(|w| bill_diff_words.contains(*w))
435                .count() as i32;
436
437            let bill_diff_count = bill_diff_words.len();
438
439            // Calculate precision: how well this BillDiff explains TreeDiff
440            let precision = if tree_diff_count > 0 {
441                matched_words as f32 / tree_diff_count as f32
442            } else {
443                0.0
444            };
445
446            // Calculate recall: how much of this BillDiff is in TreeDiff
447            let recall = if bill_diff_count > 0 {
448                matched_words as f32 / bill_diff_count as f32
449            } else {
450                0.0
451            };
452
453            // Calculate F1 score for this BillDiff
454            let score = if precision + recall > 0.0 {
455                2.0 * precision * recall / (precision + recall)
456            } else {
457                0.0
458            };
459
460            // Keep the best match
461            if score > best_score {
462                best_score = score;
463                best_precision = precision;
464                best_recall = recall;
465                best_matched = matched_words;
466            }
467        }
468
469        AmendmentSimilarity {
470            tree_diff_path: self.root_path.clone(),
471            amendment_id: amendment_id.to_string(),
472            score: best_score,
473            precision: best_precision,
474            recall: best_recall,
475            matched_words: best_matched,
476            tree_diff_words: tree_diff_count as i32,
477        }
478    }
479
480    /// Collect all significant changed words from this TreeDiff
481    fn collect_tree_diff_words(&self) -> HashSet<String> {
482        let mut words = HashSet::new();
483        for field_change in &self.changes {
484            for text_change in &field_change.changes {
485                let word = text_change.value.trim();
486                // Skip empty strings and stop words (case-insensitive)
487                if word.is_empty() || is_stop_word(word) {
488                    continue;
489                }
490                match text_change.tag {
491                    TextChangeType::Delete | TextChangeType::Insert => {
492                        words.insert(word.to_lowercase());
493                    }
494                    TextChangeType::Equal => {}
495                }
496            }
497        }
498        words
499    }
500
501    /// Scan all amendment texts for mentions of changed sections.
502    ///
503    /// Uses the regexes from `all_regexes()` to find section mentions in each
504    /// amendment's `amending_text`. This helps identify which amendments might
505    /// be responsible for changes at specific structural paths.
506    ///
507    /// # Arguments
508    ///
509    /// * `data` - Bill data from a parsed bill
510    ///
511    /// # Returns
512    ///
513    /// A map from amendment_id to list of matches found in that amendment's text.
514    /// For each tree_diff_path, only the most specific (longest) match is kept.
515    pub fn scan_for_mentions(&self, data: &Bill) -> HashMap<String, Vec<MentionMatch>> {
516        // Collect all regexes with their source paths recursively
517        let regex_with_paths = self.collect_regexes_with_paths();
518
519        // Scan each amendment's text against all regexes
520        let mut results: HashMap<String, Vec<MentionMatch>> = HashMap::new();
521
522        for (amendment_id, amendment) in &data.amendments {
523            let text = &amendment.amending_text;
524            let all_matches: Vec<MentionMatch> = regex_with_paths
525                .par_iter()
526                .filter_map(|(path, reg)| {
527                    reg.find(text).map(|mat| MentionMatch {
528                        tree_diff_path: path.clone(),
529                        matched_text: mat.as_str().to_string(),
530                    })
531                })
532                .collect();
533
534            // Deduplicate: keep only the longest match per tree_diff_path
535            let mut best_by_path: HashMap<&str, &MentionMatch> = HashMap::new();
536            for m in &all_matches {
537                let dominated = best_by_path
538                    .get(m.tree_diff_path.as_str())
539                    .is_some_and(|existing| existing.matched_text.len() >= m.matched_text.len());
540                if !dominated {
541                    best_by_path.insert(&m.tree_diff_path, m);
542                }
543            }
544            let matches: Vec<MentionMatch> = best_by_path.into_values().cloned().collect();
545
546            if !matches.is_empty() {
547                results.insert(amendment_id.clone(), matches);
548            }
549        }
550
551        results
552    }
553
554    /// Return a shallow copy of this TreeDiff without children.
555    ///
556    /// Useful when correlating a specific diff node with other data
557    /// without needing the full subtree.
558    pub fn shallow(&self) -> TreeDiff {
559        TreeDiff {
560            root_path: self.root_path.clone(),
561            changes: self.changes.clone(),
562            from_element: self.from_element.clone(),
563            to_element: self.to_element.clone(),
564            added: self.added.clone(),
565            removed: self.removed.clone(),
566            child_diffs: vec![],
567        }
568    }
569
570    /// Recursively collect regexes with their source paths from this TreeDiff.
571    fn collect_regexes_with_paths(&self) -> Vec<(String, Regex)> {
572        let mut result = Vec::new();
573
574        // Add regexes from this node
575        for reg in self.all_regexes() {
576            result.push((self.root_path.clone(), reg));
577        }
578
579        // Recurse into children
580        for child in &self.child_diffs {
581            result.extend(child.collect_regexes_with_paths());
582        }
583
584        result
585    }
586}
587
588/// Similarity between a TreeDiff and a bill amendment
589///
590/// Used to rank how likely a BillAmendment caused the changes at a TreeDiff location.
591#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
592pub struct AmendmentSimilarity {
593    /// The structural path of the TreeDiff node
594    pub tree_diff_path: String,
595    /// The ID of the matched BillAmendment
596    pub amendment_id: String,
597    /// Primary ranking metric (precision-weighted F1)
598    pub score: f32,
599    /// How well the amendment explains the TreeDiff's changes
600    /// |TreeDiff ∩ Amendment| / |TreeDiff|
601    pub precision: f32,
602    /// How much of the amendment is represented in this TreeDiff
603    /// |TreeDiff ∩ Amendment| / |Amendment|
604    pub recall: f32,
605    /// Number of words that matched between TreeDiff and Amendment
606    pub matched_words: i32,
607    /// Total significant words in the TreeDiff's changes
608    pub tree_diff_words: i32,
609}
610
611/// A match found when scanning amendment text for section mentions.
612///
613/// When scanning bill amendments against a TreeDiff's regexes, this struct
614/// captures each match, linking the structural path from the TreeDiff to
615/// the text that matched in the amendment.
616#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
617pub struct MentionMatch {
618    /// The structural path from the TreeDiff that generated this match
619    pub tree_diff_path: String,
620    /// The text that matched the regex pattern
621    pub matched_text: String,
622}
623
624/// Check if a word is a stop word (case-insensitive)
625fn is_stop_word(word: &str) -> bool {
626    let lower = word.to_lowercase();
627    STOP_WORDS.contains(&lower.as_str())
628}
629
630/// Compute field-level changes between two elements
631///
632/// Compares all five text content fields (Heading, Chapeau, Proviso, Content,
633/// Continuation) between two versions of the same element and returns change
634/// events for any fields that differ.
635///
636/// # Arguments
637///
638/// * `element_a` - The original version of the element
639/// * `element_b` - The new version of the element
640///
641/// # Returns
642///
643/// A vector of `FieldChangeEvent` for each field that has changes.
644/// Fields that are identical in both versions are omitted.
645///
646/// # Panics
647///
648/// Panics if the elements have different paths or types.
649pub fn diff_elements(element_a: &USLMElement, element_b: &USLMElement) -> Vec<FieldChangeEvent> {
650    assert!(element_a.data.path == element_b.data.path);
651    assert!(element_a.data.element_type == element_b.data.element_type);
652    let mut changes: Vec<FieldChangeEvent> = Vec::new();
653    for field_name in [
654        TextContentField::Heading,
655        TextContentField::Chapeau,
656        TextContentField::Proviso,
657        TextContentField::Content,
658        TextContentField::Continuation,
659    ]
660    .into_iter()
661    {
662        let field_changes = diff_field(element_a, element_b, field_name);
663        // Only include if there are actual changes (Insert or Delete), not just Equal
664        let has_real_changes = field_changes
665            .changes
666            .iter()
667            .any(|c| c.tag != TextChangeType::Equal);
668        if has_real_changes {
669            changes.push(field_changes);
670        }
671    }
672    changes
673}
674
675// databases don't like usizes, make it an i32
676// text content will never exceed the i32 range
677fn rewrap_usize(s: Option<usize>) -> Option<i32> {
678    s.map(|val| val as i32)
679}
680
681fn diff_field(
682    element_a: &USLMElement,
683    element_b: &USLMElement,
684    field_name: TextContentField,
685) -> FieldChangeEvent {
686    let a = element_a
687        .data
688        .get_text_content(field_name)
689        .unwrap_or_default();
690    let b = element_b
691        .data
692        .get_text_content(field_name)
693        .unwrap_or_default();
694
695    let diff = TextDiff::from_words(a.as_str(), b.as_str());
696    let changes: Vec<TextChange> = diff
697        .iter_all_changes()
698        // Keep all changes including Equal for word-level diff rendering
699        .map(|c| {
700            let tag = match c.tag() {
701                ChangeTag::Delete => TextChangeType::Delete,
702                ChangeTag::Insert => TextChangeType::Insert,
703                ChangeTag::Equal => TextChangeType::Equal,
704            };
705            TextChange {
706                value: String::from(c.value()),
707                old_index: rewrap_usize(c.old_index()),
708                new_index: rewrap_usize(c.new_index()),
709                tag,
710            }
711        })
712        .collect();
713    FieldChangeEvent {
714        field_name,
715        from_date: element_a.data.date,
716        to_date: element_b.data.date,
717        old_value: a,
718        new_value: b,
719        changes,
720    }
721}