Skip to main content

words_to_data/uslm/
mod.rs

1use std::str::FromStr;
2
3use serde::{Deserialize, Serialize};
4use thiserror::Error;
5use time::Date;
6
7pub mod bill_parser;
8pub mod parser;
9pub mod path;
10
/// Errors that can occur when parsing or processing USLM documents.
///
/// Every variant carries a `String` payload that is interpolated into the
/// `Display` message generated by `thiserror`.
#[derive(Error, Debug)]
pub enum USLMError {
    /// An unknown or unsupported document type was encountered.
    ///
    /// The payload is either the offending type string or a short
    /// explanation of what was wrong with the supplied metadata.
    #[error("Unknown Document Type {0}")]
    UnknownDocumentType(String),

    /// An unknown or unsupported amending action was encountered.
    ///
    /// The payload is the action string that could not be recognized.
    #[error("Unknown Amending Action {0}")]
    UnknownAmendingAction(String),
}
22
/// The type of legislative document being parsed.
///
/// USLM documents can be either US Code titles or Bills (such as Public Laws).
/// Each variant carries the metadata needed to tell documents of that kind apart.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DocumentType {
    /// United States Code document (e.g., Title 7, Title 26).
    // Explicit rename: serde's `snake_case` rule would turn `USCode` into
    // `u_s_code`, so the serialized name is pinned to `us_code` by hand.
    #[serde(rename = "us_code")]
    USCode {
        /// The specific type of USC document (Title or TitleAppendix)
        usc_type: USCType,
    },

    /// Bill document (e.g., Public Law).
    Bill {
        /// The type of bill (currently only PublicLaw is supported)
        bill_type: BillType,
        /// The bill identifier (e.g., "119-21" for the 119th Congress, 21st law)
        bill_id: String,
    },
}
45
46impl DocumentType {
47    /// Parse a document type from string representation with optional metadata
48    ///
49    /// # Arguments
50    ///
51    /// * `s` - The document type string (case-insensitive). Accepted values:
52    ///   - For USC: "uscode", "us_code", "uscdoc"
53    ///   - For Bills: "publiclaw", "public_law", "plaw"
54    /// * `meta_str` - Additional metadata required for type disambiguation:
55    ///   - For USC: "usctitle" or "usctitleappendix"
56    ///   - For Bills: the bill ID (e.g., "119-21")
57    ///
58    /// # Returns
59    ///
60    /// Returns `Ok(DocumentType)` if parsing succeeds, or `Err(USLMError)` if:
61    /// - The document type string is not recognized
62    /// - Required metadata is missing
63    /// - The metadata value is invalid
64    ///
65    /// # Examples
66    ///
67    /// ```
68    /// use words_to_data::uslm::{DocumentType, USCType, BillType};
69    ///
70    /// // Parse a USC Title
71    /// let usc = DocumentType::from_str("uscode", Some("usctitle")).unwrap();
72    /// match usc {
73    ///     DocumentType::USCode { usc_type } => assert_eq!(usc_type, USCType::Title),
74    ///     _ => panic!("Expected USCode variant"),
75    /// }
76    ///
77    /// // Parse a Public Law
78    /// let bill = DocumentType::from_str("publiclaw", Some("119-21")).unwrap();
79    /// match bill {
80    ///     DocumentType::Bill { bill_type, bill_id } => {
81    ///         assert_eq!(bill_type, BillType::PublicLaw);
82    ///         assert_eq!(bill_id, "119-21");
83    ///     },
84    ///     _ => panic!("Expected Bill variant"),
85    /// }
86    /// ```
87    pub fn from_str(s: &str, meta_str: Option<&str>) -> Result<Self, USLMError> {
88        match s.to_lowercase().as_str() {
89            "publiclaw" | "public_law" | "plaw" => match meta_str {
90                Some(val) => Ok(Self::Bill {
91                    bill_type: BillType::PublicLaw,
92                    bill_id: val.to_string(),
93                }),
94                None => Err(USLMError::UnknownDocumentType(
95                    "Bill types must pass the bill_id as the meta_str parameter".to_string(),
96                )),
97            },
98            "uscode" | "us_code" | "uscdoc" => match meta_str {
99                Some(val) => match val.to_lowercase().as_str() {
100                    "usctitle" => Ok(DocumentType::USCode {
101                        usc_type: USCType::Title,
102                    }),
103                    "usctitleappendix" => Ok(DocumentType::USCode {
104                        usc_type: USCType::TitleAppendix,
105                    }),
106                    _ => Err(USLMError::UnknownDocumentType(format!(
107                        "Unhandled type for USCode document: {}",
108                        val.to_lowercase()
109                    ))),
110                },
111                None => Err(USLMError::UnknownDocumentType(
112                    "USCode types need to provide a type_str".to_string(),
113                )),
114            },
115            _ => Err(USLMError::UnknownDocumentType(s.to_string())),
116        }
117    }
118}
119
/// The type of bill document.
///
/// Currently only Public Laws are supported, but this enum allows for
/// future expansion to support other bill types.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum BillType {
    /// A Public Law (enacted legislation); serialized as `public_law`.
    PublicLaw,
}
130
/// The type of United States Code document.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum USCType {
    /// The entire US Code (container for all titles at a point in time).
    // Explicit rename: serde's `snake_case` rule would otherwise produce
    // `u_s_code` for this variant name.
    #[serde(rename = "us_code")]
    USCode,
    /// A standard USC Title.
    /// TODO remove this in favor of USCode
    Title,
    /// An appendix to a USC Title.
    TitleAppendix,
}
144
/// The hierarchical type of an element within a legislative document.
///
/// Legislative documents follow a strict hierarchy with various levels of organization.
/// This enum represents all possible element types that can appear in USLM documents.
///
/// # Hierarchy Examples
///
/// For US Code:
/// - Title > Subtitle > Chapter > Subchapter > Part > Section > Subsection > Paragraph
///
/// For Bills:
/// - Division > Title > Subtitle > Chapter > Section > Subsection > Paragraph
///
/// The `Level` type is a special structural element used when the hierarchy
/// doesn't follow the standard pattern. `Unknown` is used for unrecognized elements.
///
/// Variants serialize under their snake_case names (e.g. `public_law_document`).
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ElementType {
    /// The root element of a US Code document
    // Explicit rename: serde's `snake_case` rule would otherwise produce
    // `u_s_code_document` for this variant name.
    #[serde(rename = "us_code_document")]
    USCodeDocument,
    /// The root element of a Public Law document
    PublicLawDocument,
    /// A Title (top level division in USC, or subdivision in bills)
    Title,
    /// An Appendix to a title or section
    Appendix,
    /// A Subtitle (subdivision of a title)
    Subtitle,
    /// A Chapter (major subdivision)
    Chapter,
    /// A Subchapter (subdivision of a chapter)
    Subchapter,
    /// A Part (subdivision, often of a subchapter)
    Part,
    /// A Subpart (subdivision of a part)
    Subpart,
    /// A Section (the primary unit of law, e.g., "Section 174")
    Section,
    /// A Subsection (subdivision of a section, often lettered: a, b, c)
    Subsection,
    /// A Paragraph (subdivision of a subsection, often numbered: 1, 2, 3)
    Paragraph,
    /// A Subparagraph (subdivision of a paragraph, often lettered: A, B, C)
    Subparagraph,
    /// A Clause (subdivision of a subparagraph, often numbered: i, ii, iii)
    Clause,
    /// A Subclause (subdivision of a clause)
    Subclause,
    /// A Level element (generic structural container when hierarchy is non-standard)
    Level,
    /// An Item in an enumerated list
    Item,
    /// A Subitem (subdivision of an item)
    Subitem,
    /// A Subsubitem (subdivision of a subitem)
    Subsubitem,
    /// A Division (top-level subdivision in some bills)
    Division,
    /// A Subdivision
    Subdivision,
    /// An unknown or unrecognized element type
    Unknown,
}
209
210impl std::str::FromStr for ElementType {
211    type Err = USLMError;
212
213    /// Parse an element type from its string representation
214    ///
215    /// This implementation is case-insensitive and accepts various common names
216    /// for element types. Unknown strings are mapped to `ElementType::Unknown`
217    /// rather than returning an error.
218    fn from_str(s: &str) -> Result<ElementType, USLMError> {
219        match s.to_lowercase().as_str() {
220            "title" => Ok(Self::Title),
221            "subtitle" => Ok(Self::Subtitle),
222            "chapter" => Ok(Self::Chapter),
223            "subchapter" => Ok(Self::Subchapter),
224            "part" => Ok(Self::Part),
225            "subpart" => Ok(Self::Subpart),
226            "section" => Ok(Self::Section),
227            "subsection" => Ok(Self::Subsection),
228            "paragraph" => Ok(Self::Paragraph),
229            "subparagraph" => Ok(Self::Subparagraph),
230            "clause" => Ok(Self::Clause),
231            "subclause" => Ok(Self::Subclause),
232            "level" => Ok(Self::Level),
233            "item" => Ok(Self::Item),
234            "subitem" => Ok(Self::Subitem),
235            "subsubitem" => Ok(Self::Subsubitem),
236            "division" => Ok(Self::Division),
237            "subdivision" => Ok(Self::Subdivision),
238            "publiclaw" | "public_law" | "plaw" => Ok(Self::PublicLawDocument),
239            "uscode" | "us_code" | "uscdoc" => Ok(Self::USCodeDocument),
240            "appendix" => Ok(Self::Appendix),
241            _ => Ok(Self::Unknown),
242        }
243    }
244}
245
/// The different text content fields that can be present in a legislative element.
///
/// Legislative elements can have up to five distinct text fields, each serving
/// a specific purpose in the document structure. These fields are tracked
/// separately to enable precise change detection when comparing document versions.
///
/// - Heading: The heading or title of the element
/// - Chapeau: Opening text that appears before enumerated sub-elements
/// - Proviso: A conditional or qualifying clause (often starting with "Provided that")
/// - Content: The main text content of the element
/// - Continuation: Text that appears after all child elements
///
/// **IMPORTANT**: Because continuations appear _after_ child elements, the full
/// text of some elements requires the child elements to be present. For example,
/// loading a full section requires its subsections, which need their paragraphs,
/// which may need clauses, and so on.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TextContentField {
    /// The heading or title of the element (e.g., "Agricultural Programs")
    Heading,
    /// Opening text that appears before enumerated sub-elements
    Chapeau,
    /// A conditional or qualifying clause (often starting with "Provided that")
    Proviso,
    /// The main text content of the element
    Content,
    /// Text that appears after all child elements
    Continuation,
}
274
/// Types of amendments that can be made to existing law via a bill.
///
/// When a bill modifies existing United States Code, it uses specific
/// amending actions to describe the type of change being made.
///
/// Variants serialize under their snake_case names (e.g. `strike_and_insert`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum AmendingAction {
    /// Modify existing text
    Amend,
    /// Add new text or sections
    Add,
    /// Remove existing text or sections
    Delete,
    /// Insert new text at a specific location
    Insert,
    /// Change the designation or numbering of sections
    Redesignate,
    /// Remove an entire section or provision from the law
    Repeal,
    /// Relocate an element (may include redesignation)
    Move,
    /// Remove specific text within an element (finer than Delete)
    Strike,
    /// Remove specific text and replace with new text
    StrikeAndInsert,
}
301
302impl FromStr for AmendingAction {
303    type Err = USLMError;
304
305    /// Parse an amending action from its string representation
306    ///
307    /// This implementation is case-insensitive. Returns an error if the
308    /// action type is not recognized.
309    fn from_str(s: &str) -> std::result::Result<Self, <Self as std::str::FromStr>::Err> {
310        match s.to_lowercase().as_str() {
311            "amend" => Ok(AmendingAction::Amend),
312            "add" => Ok(AmendingAction::Add),
313            "delete" => Ok(AmendingAction::Delete),
314            "insert" => Ok(AmendingAction::Insert),
315            "redesignate" => Ok(AmendingAction::Redesignate),
316            "repeal" => Ok(AmendingAction::Repeal),
317            "move" => Ok(AmendingAction::Move),
318            "strike" => Ok(AmendingAction::Strike),
319            "strikeandinsert" | "strike_and_insert" => Ok(AmendingAction::StrikeAndInsert),
320            _ => Err(USLMError::UnknownAmendingAction(s.to_lowercase())),
321        }
322    }
323}
324
325impl AmendingAction {
326    /// Extract all text from a node and its descendants
327    #[allow(dead_code)]
328    fn extract_all_text(node: &roxmltree::Node) -> String {
329        let mut text = String::new();
330        for descendant in node.descendants() {
331            if let Some(t) = descendant.text() {
332                if !text.is_empty() {
333                    text.push(' ');
334                }
335                text.push_str(t);
336            }
337        }
338        text
339    }
340}
341
/// A reference to a USC section found in a bill.
///
/// Pairs the machine-readable target of the reference with the text shown
/// to a reader.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
pub struct UscReference {
    /// The USLM path being referenced (e.g., "/us/usc/t7/s2025/c/1/A/ii")
    pub path: String,
    /// The human-readable text of the reference (e.g., "7 U.S.C. 2025(c)(1)(A)(ii)")
    pub display_text: String,
}
350
/// An amending action found in a bill.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
pub struct BillAmendment {
    /// Content-based ID: sha256("{bill_id}:{amending_text}")
    /// This provides a stable, deterministic identifier that works regardless of source format.
    pub id: String,

    /// The kinds of action this amendment performs.
    ///
    /// Any `AmendingAction` may appear here (amend, add, delete, insert,
    /// redesignate, repeal, move, strike, strike-and-insert), and a single
    /// amendment can carry several.
    pub action_types: Vec<AmendingAction>,

    /// The text of the change
    pub amending_text: String,

    /// List of word-level changes that an amendment enacts
    pub changes: Vec<BillDiff>,
}
367
368impl BillAmendment {
369    pub fn update_changes(&self, changes: &[BillDiff]) -> Self {
370        BillAmendment {
371            id: self.id.clone(),
372            action_types: self.action_types.clone(),
373            amending_text: self.amending_text.clone(),
374            changes: changes.to_vec(),
375        }
376    }
377}
378
/// Actions caused by a bill amendment
///
/// This is designed to exist as a single entry for every logical
/// amending action. For example, given the following amending text:
/// ```ignore
///(B)
/// in subsection (b)--
///
///   (i)
///   by striking "specified research" and inserting "foreign research",
///
///
///   (ii)
///   by inserting "and which are attributable to foreign research (within the meaning of section 41(d)(4)(F))" before the period at the end, and
/// ```
/// we would annotate that with two Bill Diffs:
/// ```ignore
/// {
///  "removed": ["specified"],
///  "added": ["foreign"]
/// }
/// ```
/// and
/// ```ignore
/// {
///  "removed": [],
///  "added": [
///    "which",
///    "attributable",
///    "foreign",
///    "research",
///    "(within",
///    "meaning",
///    "section",
///    "41(d)(4)(F))"
///  ]
///}
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
pub struct BillDiff {
    /// Words introduced by the amending action (empty when nothing is added)
    pub added: Vec<String>,
    /// Words taken out by the amending action (empty when nothing is removed)
    pub removed: Vec<String>,
}
422
/// Source Credit Attribution
///
/// A source credit can contain multiple `<ref>` elements, and they are separated logically
/// as new sources by a `;` between them in the XML. So when you encounter a `<sourceCredit>` element,
/// you should split the element into multiple `<sourceCredit>` elements, each with a single `<ref>` element.
///
/// **IMPORTANT**: Source credits point to USLM ID shaped paths, for example:
/// ```xml
/// <sourceCredit id="id2ffb3c99-76ce-11f0-a3ab-d79a777afc56">(<ref href="/us/act/1954-08-16/ch736">Aug. 16, 1954, ch. 736</ref>, <ref href="/us/stat/68A/3">68A Stat. 3</ref>; <ref href="/us/pl/99/514/s2">Pub. L. 99–514, § 2</ref>, <date date="1986-10-22">Oct. 22, 1986</date>, <ref href="/us/stat/100/2095">100 Stat. 2095</ref>.)</sourceCredit>
/// ```
/// They do not actually state change information, and the source credits are not guaranteed
/// to cover all the bills that provided changes to the document. They are better thought of
/// as an incomplete list of pointers. While useful, they are easy to mistake for the full,
/// definitive listing of bills that created the Element.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct SourceCredit {
    /// The `<ref>` elements of the source credit
    pub ref_pairs: Vec<RefPair>,
}
440
/// A reference pair within a source credit.
///
/// Contains the identifier and description for a single reference within
/// a source credit attribution.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct RefPair {
    /// The ID of the `<ref>` source credit
    // NOTE(review): presumably the `href` of the `<ref>` element — confirm against the parser.
    pub ref_id: String,
    /// The description of the source credit
    pub description: String,
}
453
/// Metadata and content for a single element in a USLM document.
///
/// This struct contains all the information about a legislative element,
/// including its position in the document hierarchy, identification paths,
/// display information, and text content.
///
/// # Path Systems
///
/// Each element has two types of paths:
///
/// 1. **Structural Path** (`path`): Includes all hierarchy elements, even
///    non-USLM ones like `Level`. Example:
///    `uscode/title_26/subtitle_k/chapter_100/section_9834/level_1`
///
/// 2. **USLM ID** (`uslm_id`): Official USLM identifier following standard format.
///    Only present for elements in the USLM scheme. Example: `/us/usc/t26/s9834/a/1`
///
/// Combining the structural path with the date provides a unique identifier for
/// any element across all versions of the document.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct ElementData {
    /// The full structural path in the document for the element
    ///
    /// This includes all structural elements like Level that may not be part of the USLM identifier.
    /// Note that combining this with the date field gives a unique identifier for the document.
    /// For example:
    ///
    /// uscode/title_26/subtitle_k/chapter_100/subchapter_c/section_9834/level_1
    ///
    pub path: String,

    /// The type of this element in the legislative hierarchy
    pub element_type: ElementType,

    /// The type of document this element belongs to
    pub document_type: DocumentType,

    /// The date this version of the document was published
    pub date: Date,

    // Display
    /// The raw number or identifier value (e.g., "174", "a", "1")
    pub number_value: String,

    /// The formatted display version of the number (may include prefixes/suffixes)
    pub number_display: String,

    /// A human-readable name for this element (e.g., "Section 174")
    pub verbose_name: String,

    // Content Fields
    // These are the fields that we need to diff upon; each one corresponds
    // to a `TextContentField` variant.
    /// The heading or title text of the element
    pub heading: Option<String>,

    /// The words at the start of the element that appear before any enumerated items
    pub chapeau: Option<String>,

    /// A clause imposing a qualification, condition, or restriction
    pub proviso: Option<String>,

    /// The main text content of the element
    pub content: Option<String>,

    /// Text content that appears after all child elements
    pub continuation: Option<String>,

    // Metadata
    /// The USLM-standard identifier path for this element
    ///
    /// This follows the official USLM path format and excludes structural-only elements.
    /// For example: `/us/usc/t26/s1/a/1` or `/us/pl/119-21/s1/a`
    ///
    /// This is computed according to USLM standards for elements that are part of the
    /// USLM identifier scheme. If the XML provides an `identifier` attribute, it is
    /// validated to match this generated path.
    ///
    /// Structural-only elements like Level will have None here, as they are not part
    /// of the USLM identifier scheme.
    pub uslm_id: Option<String>,

    /// The USLM `id` attribute for an element
    ///
    /// Takes the form of a UUID, not guaranteed to exist
    pub uslm_uuid: Option<String>,

    /// Source credits and references for this element
    pub source_credits: Vec<SourceCredit>,
    //pub page_data: Option<PageData>, // TODO implement
}
545
546impl ElementData {
547    /// Retrieve the text content for a specific field
548    ///
549    /// # Arguments
550    ///
551    /// * `field` - The text content field to retrieve
552    ///
553    /// # Returns
554    ///
555    /// Returns `Some(String)` if the field has content, or `None` if the field
556    /// is empty for this element.
557    pub fn get_text_content(&self, field: TextContentField) -> Option<String> {
558        match field {
559            TextContentField::Heading => self.heading.clone(),
560            TextContentField::Chapeau => self.chapeau.clone(),
561            TextContentField::Proviso => self.proviso.clone(),
562            TextContentField::Content => self.content.clone(),
563            TextContentField::Continuation => self.continuation.clone(),
564        }
565    }
566}
567
/// A hierarchical element in a USLM document tree.
///
/// This struct represents a single element in a legislative document along with
/// all of its child elements, forming a tree structure that mirrors the document's
/// hierarchical organization.
///
/// # Structure
///
/// - `data`: Contains all metadata and text content for this element
/// - `children`: All direct child elements in document order
///
/// # Examples
///
/// A typical USC section might have a structure like:
///
/// ```text
/// Section 174 (USLMElement)
///   ├─ data: ElementData { element_type: Section, heading: "Research expenditures", ... }
///   └─ children:
///       ├─ Subsection (a) (USLMElement)
///       │   └─ children: [Paragraph (1), Paragraph (2), ...]
///       └─ Subsection (b) (USLMElement)
///           └─ children: [...]
/// ```
///
/// # Tree Navigation
///
/// Use the `find()` method to locate specific elements within the tree by their
/// structural path.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct USLMElement {
    /// The metadata and content for this element
    pub data: ElementData,

    /// Child elements in document order
    pub children: Vec<USLMElement>,
}
606
607impl USLMElement {
608    /// Search for an element by its structural path
609    ///
610    /// Recursively searches this element and all descendants for an element
611    /// with the specified path. The path must be a fully qualified structural
612    /// path (e.g., "uscode/title_7/chapter_1/section_1").
613    ///
614    /// # Arguments
615    ///
616    /// * `path` - The full structural path of the element to find
617    ///
618    /// # Returns
619    ///
620    /// Returns `Some(&USLMElement)` if an element with the matching path is found,
621    /// or `None` if no such element exists in this tree.
622    ///
623    /// # Examples
624    ///
625    /// ```
626    /// # use words_to_data::uslm::parser::parse;
627    /// # let element = parse("tests/test_data/usc/2025-07-18/usc07.xml", "2025-07-18").unwrap();
628    /// // Find a specific section
629    /// let section = element.find("uscode/title_7/chapter_1/section_2");
630    /// assert!(section.is_some());
631    ///
632    /// // Non-existent path returns None
633    /// let missing = element.find("uscode/title_99");
634    /// assert!(missing.is_none());
635    /// ```
636    pub fn find(&self, path: &str) -> Option<&USLMElement> {
637        if path == self.data.path.as_str() {
638            return Some(self);
639        }
640        let remaining_path = path.strip_prefix(self.data.path.as_str())?;
641        let next_step: Vec<&str> = remaining_path.split("/").collect();
642        assert!(next_step.len() > 1);
643
644        let child_id = next_step[1];
645        let child_vec: Vec<&USLMElement> = self
646            .children
647            .iter()
648            .filter(|c| c.data.path.ends_with(child_id))
649            .collect();
650        if child_vec.is_empty() {
651            None
652        } else {
653            assert!(child_vec.len() == 1);
654            child_vec[0].find(path)
655        }
656    }
657
658    /// Merge the children of one node into another
659    pub fn merge_children_mut(&mut self, other: &mut USLMElement) {
660        self.children.append(&mut other.children);
661    }
662}