words_to_data/uslm/mod.rs
1use std::str::FromStr;
2
3use serde::{Deserialize, Serialize};
4use thiserror::Error;
5use time::Date;
6
7pub mod bill_parser;
8pub mod parser;
9pub mod path;
10
11/// Errors that can occur when parsing or processing USLM documents
12#[derive(Error, Debug)]
13pub enum USLMError {
14 /// An unknown or unsupported document type was encountered
15 #[error("Unknown Document Type {0}")]
16 UnknownDocumentType(String),
17
18 /// An unknown or unsupported amending action was encountered
19 #[error("Unknown Amending Action {0}")]
20 UnknownAmendingAction(String),
21}
22
23/// The type of legislative document being parsed
24///
25/// USLM documents can be either US Code titles or Bills (such as Public Laws).
26/// Each type has associated metadata that provides additional context.
27#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
28#[serde(rename_all = "snake_case")]
29pub enum DocumentType {
30 /// United States Code document (e.g., Title 7, Title 26)
31 #[serde(rename = "us_code")]
32 USCode {
33 /// The specific type of USC document (Title or TitleAppendix)
34 usc_type: USCType,
35 },
36
37 /// Bill document (e.g., Public Law)
38 Bill {
39 /// The type of bill (currently only PublicLaw is supported)
40 bill_type: BillType,
41 /// The bill identifier (e.g., "119-21" for the 119th Congress, 21st law)
42 bill_id: String,
43 },
44}
45
46impl DocumentType {
47 /// Parse a document type from string representation with optional metadata
48 ///
49 /// # Arguments
50 ///
51 /// * `s` - The document type string (case-insensitive). Accepted values:
52 /// - For USC: "uscode", "us_code", "uscdoc"
53 /// - For Bills: "publiclaw", "public_law", "plaw"
54 /// * `meta_str` - Additional metadata required for type disambiguation:
55 /// - For USC: "usctitle" or "usctitleappendix"
56 /// - For Bills: the bill ID (e.g., "119-21")
57 ///
58 /// # Returns
59 ///
60 /// Returns `Ok(DocumentType)` if parsing succeeds, or `Err(USLMError)` if:
61 /// - The document type string is not recognized
62 /// - Required metadata is missing
63 /// - The metadata value is invalid
64 ///
65 /// # Examples
66 ///
67 /// ```
68 /// use words_to_data::uslm::{DocumentType, USCType, BillType};
69 ///
70 /// // Parse a USC Title
71 /// let usc = DocumentType::from_str("uscode", Some("usctitle")).unwrap();
72 /// match usc {
73 /// DocumentType::USCode { usc_type } => assert_eq!(usc_type, USCType::Title),
74 /// _ => panic!("Expected USCode variant"),
75 /// }
76 ///
77 /// // Parse a Public Law
78 /// let bill = DocumentType::from_str("publiclaw", Some("119-21")).unwrap();
79 /// match bill {
80 /// DocumentType::Bill { bill_type, bill_id } => {
81 /// assert_eq!(bill_type, BillType::PublicLaw);
82 /// assert_eq!(bill_id, "119-21");
83 /// },
84 /// _ => panic!("Expected Bill variant"),
85 /// }
86 /// ```
87 pub fn from_str(s: &str, meta_str: Option<&str>) -> Result<Self, USLMError> {
88 match s.to_lowercase().as_str() {
89 "publiclaw" | "public_law" | "plaw" => match meta_str {
90 Some(val) => Ok(Self::Bill {
91 bill_type: BillType::PublicLaw,
92 bill_id: val.to_string(),
93 }),
94 None => Err(USLMError::UnknownDocumentType(
95 "Bill types must pass the bill_id as the meta_str parameter".to_string(),
96 )),
97 },
98 "uscode" | "us_code" | "uscdoc" => match meta_str {
99 Some(val) => match val.to_lowercase().as_str() {
100 "usctitle" => Ok(DocumentType::USCode {
101 usc_type: USCType::Title,
102 }),
103 "usctitleappendix" => Ok(DocumentType::USCode {
104 usc_type: USCType::TitleAppendix,
105 }),
106 _ => Err(USLMError::UnknownDocumentType(format!(
107 "Unhandled type for USCode document: {}",
108 val.to_lowercase()
109 ))),
110 },
111 None => Err(USLMError::UnknownDocumentType(
112 "USCode types need to provide a type_str".to_string(),
113 )),
114 },
115 _ => Err(USLMError::UnknownDocumentType(s.to_string())),
116 }
117 }
118}
119
120/// The type of bill document
121///
122/// Currently only Public Laws are supported, but this enum allows for
123/// future expansion to support other bill types.
124#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
125#[serde(rename_all = "snake_case")]
126pub enum BillType {
127 /// A Public Law (enacted legislation)
128 PublicLaw,
129}
130
131/// The type of United States Code document
132#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
133#[serde(rename_all = "snake_case")]
134pub enum USCType {
135 /// The entire US Code (container for all titles at a point in time)
136 #[serde(rename = "us_code")]
137 USCode,
138 /// A standard USC Title
139 /// TODO remove this in favor of USCode
140 Title,
141 /// An appendix to a USC Title
142 TitleAppendix,
143}
144
145/// The hierarchical type of an element within a legislative document
146///
147/// Legislative documents follow a strict hierarchy with various levels of organization.
148/// This enum represents all possible element types that can appear in USLM documents.
149///
150/// # Hierarchy Examples
151///
152/// For US Code:
153/// - Title > Subtitle > Chapter > Subchapter > Part > Section > Subsection > Paragraph
154///
155/// For Bills:
156/// - Division > Title > Subtitle > Chapter > Section > Subsection > Paragraph
157///
158/// The `Level` type is a special structural element used when the hierarchy
159/// doesn't follow the standard pattern. `Unknown` is used for unrecognized elements.
160#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
161#[serde(rename_all = "snake_case")]
162pub enum ElementType {
163 /// The root element of a US Code document
164 #[serde(rename = "us_code_document")]
165 USCodeDocument,
166 /// The root element of a Public Law document
167 PublicLawDocument,
168 /// A Title (top level division in USC, or subdivision in bills)
169 Title,
170 /// An Appendix to a title or section
171 Appendix,
172 /// A Subtitle (subdivision of a title)
173 Subtitle,
174 /// A Chapter (major subdivision)
175 Chapter,
176 /// A Subchapter (subdivision of a chapter)
177 Subchapter,
178 /// A Part (subdivision, often of a subchapter)
179 Part,
180 /// A Subpart (subdivision of a part)
181 Subpart,
182 /// A Section (the primary unit of law, e.g., "Section 174")
183 Section,
184 /// A Subsection (subdivision of a section, often lettered: a, b, c)
185 Subsection,
186 /// A Paragraph (subdivision of a subsection, often numbered: 1, 2, 3)
187 Paragraph,
188 /// A Subparagraph (subdivision of a paragraph, often lettered: A, B, C)
189 Subparagraph,
190 /// A Clause (subdivision of a subparagraph, often numbered: i, ii, iii)
191 Clause,
192 /// A Subclause (subdivision of a clause)
193 Subclause,
194 /// A Level element (generic structural container when hierarchy is non-standard)
195 Level,
196 /// An Item in an enumerated list
197 Item,
198 /// A Subitem (subdivision of an item)
199 Subitem,
200 /// A Subsubitem (subdivision of a subitem)
201 Subsubitem,
202 /// A Division (top-level subdivision in some bills)
203 Division,
204 /// A Subdivision
205 Subdivision,
206 /// An unknown or unrecognized element type
207 Unknown,
208}
209
210impl std::str::FromStr for ElementType {
211 type Err = USLMError;
212
213 /// Parse an element type from its string representation
214 ///
215 /// This implementation is case-insensitive and accepts various common names
216 /// for element types. Unknown strings are mapped to `ElementType::Unknown`
217 /// rather than returning an error.
218 fn from_str(s: &str) -> Result<ElementType, USLMError> {
219 match s.to_lowercase().as_str() {
220 "title" => Ok(Self::Title),
221 "subtitle" => Ok(Self::Subtitle),
222 "chapter" => Ok(Self::Chapter),
223 "subchapter" => Ok(Self::Subchapter),
224 "part" => Ok(Self::Part),
225 "subpart" => Ok(Self::Subpart),
226 "section" => Ok(Self::Section),
227 "subsection" => Ok(Self::Subsection),
228 "paragraph" => Ok(Self::Paragraph),
229 "subparagraph" => Ok(Self::Subparagraph),
230 "clause" => Ok(Self::Clause),
231 "subclause" => Ok(Self::Subclause),
232 "level" => Ok(Self::Level),
233 "item" => Ok(Self::Item),
234 "subitem" => Ok(Self::Subitem),
235 "subsubitem" => Ok(Self::Subsubitem),
236 "division" => Ok(Self::Division),
237 "subdivision" => Ok(Self::Subdivision),
238 "publiclaw" | "public_law" | "plaw" => Ok(Self::PublicLawDocument),
239 "uscode" | "us_code" | "uscdoc" => Ok(Self::USCodeDocument),
240 "appendix" => Ok(Self::Appendix),
241 _ => Ok(Self::Unknown),
242 }
243 }
244}
245
246/// The different text content fields that can be present in a legislative element
247///
248/// Legislative elements can have up to five distinct text fields, each serving
249/// a specific purpose in the document structure. These fields are tracked
250/// separately to enable precise change detection when comparing document versions.
251/// One of five text fields that can appear in an element:
252///
253/// - Heading: Opening text that appears before enumerated sub-elements
254/// - Chapeau: A conditional or qualifying clause (often starting with "Provided that")
255/// - Proviso: The main text content of the element
256/// - Content: Text that appears after all child elements
257/// - Continuation: Text that appears after all child elements
258///
259/// **IMPORTANT**: Becuase continuations appear _after_ child elements, the full text of some elements require child elements to be present. This makes sense, to load a full section, you need the subsections, which need paragraphs which may need clauses, etc.
260#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
261#[serde(rename_all = "snake_case")]
262pub enum TextContentField {
263 /// The heading or title of the element (e.g., "Agricultural Programs")
264 Heading,
265 /// Opening text that appears before enumerated sub-elements
266 Chapeau,
267 /// A conditional or qualifying clause (often starting with "Provided that")
268 Proviso,
269 /// The main text content of the element
270 Content,
271 /// Text that appears after all child elements
272 Continuation,
273}
274
275/// Types of amendments that can be made to existing law via a bill
276///
277/// When a bill modifies existing United States Code, it uses specific
278/// amending actions to describe the type of change being made.
279#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
280#[serde(rename_all = "snake_case")]
281pub enum AmendingAction {
282 /// Modify existing text
283 Amend,
284 /// Add new text or sections
285 Add,
286 /// Remove existing text or sections
287 Delete,
288 /// Insert new text at a specific location
289 Insert,
290 /// Change the designation or numbering of sections
291 Redesignate,
292 /// Remove an entire section or provision from the law
293 Repeal,
294 /// Relocate an element (may include redesignation)
295 Move,
296 /// Remove specific text within an element (finer than Delete)
297 Strike,
298 /// Remove specific text and replace with new text
299 StrikeAndInsert,
300}
301
302impl FromStr for AmendingAction {
303 type Err = USLMError;
304
305 /// Parse an amending action from its string representation
306 ///
307 /// This implementation is case-insensitive. Returns an error if the
308 /// action type is not recognized.
309 fn from_str(s: &str) -> std::result::Result<Self, <Self as std::str::FromStr>::Err> {
310 match s.to_lowercase().as_str() {
311 "amend" => Ok(AmendingAction::Amend),
312 "add" => Ok(AmendingAction::Add),
313 "delete" => Ok(AmendingAction::Delete),
314 "insert" => Ok(AmendingAction::Insert),
315 "redesignate" => Ok(AmendingAction::Redesignate),
316 "repeal" => Ok(AmendingAction::Repeal),
317 "move" => Ok(AmendingAction::Move),
318 "strike" => Ok(AmendingAction::Strike),
319 "strikeandinsert" | "strike_and_insert" => Ok(AmendingAction::StrikeAndInsert),
320 _ => Err(USLMError::UnknownAmendingAction(s.to_lowercase())),
321 }
322 }
323}
324
325impl AmendingAction {
326 /// Extract all text from a node and its descendants
327 #[allow(dead_code)]
328 fn extract_all_text(node: &roxmltree::Node) -> String {
329 let mut text = String::new();
330 for descendant in node.descendants() {
331 if let Some(t) = descendant.text() {
332 if !text.is_empty() {
333 text.push(' ');
334 }
335 text.push_str(t);
336 }
337 }
338 text
339 }
340}
341
342/// A reference to a USC section found in a bill
343#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
344pub struct UscReference {
345 /// The USLM path being referenced (e.g., "/us/usc/t7/s2025/c/1/A/ii")
346 pub path: String,
347 /// The human-readable text of the reference (e.g., "7 U.S.C. 2025(c)(1)(A)(ii)")
348 pub display_text: String,
349}
350
351/// An amending action found in a bill
352#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
353pub struct BillAmendment {
354 /// Content-based ID: sha256("{bill_id}:{amending_text}")
355 /// This provides a stable, deterministic identifier that works regardless of source format.
356 pub id: String,
357
358 /// Type of action (amend, add, delete, insert, redesignate, repeal)
359 pub action_types: Vec<AmendingAction>,
360
361 /// The text of the change
362 pub amending_text: String,
363
364 /// List of word-level changes that an amendment enacts
365 pub changes: Vec<BillDiff>,
366}
367
368impl BillAmendment {
369 pub fn update_changes(&self, changes: &[BillDiff]) -> Self {
370 BillAmendment {
371 id: self.id.clone(),
372 action_types: self.action_types.clone(),
373 amending_text: self.amending_text.clone(),
374 changes: changes.to_vec(),
375 }
376 }
377}
378
379/// Actions caused by a bill amendment
380///
381/// This is designed to exist as single entries for every logical
382/// amending action. For example, given the following amending text:
383/// ```ignore
384///(B)
385/// in subsection (b)--
386///
387/// (i)
388/// by striking "specified research" and inserting "foreign research",
389///
390///
391/// (ii)
392/// by inserting "and which are attributable to foreign research (within the meaning of section 41(d)(4)(F))" before the period at the end, and
393/// ```
394/// we would annotate that with two Bill Diffs:
395/// ```ignore
396/// {
397/// "removed": ["specified"],
398/// "added": ["foreign"]
399/// }
400/// ```
401/// and
402/// ```ignore
403/// {
404/// "removed": [],
405/// "added": [
406/// "which",
407/// "attributable",
408/// "foreign",
409/// "research",
410/// "(within",
411/// "meaning",
412/// "section",
413/// "41(d)(4)(F))"
414/// ]
415///}
416/// ```
417#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
418pub struct BillDiff {
419 pub added: Vec<String>,
420 pub removed: Vec<String>,
421}
422
423/// Source Credit Attribution
424///
425/// The Source credit can contain multiple `<ref>` elements, and they are separated logically
426/// as new sources by a `;` between them in the XML. So when you encounter a `<SourceCredit>` element,
427/// you should split the element into multiple `<SourceCredit>` elements, each with a single `<ref>` element.
428///
429/// **IMPORTANT**: Source credits point to USLM ID shaped paths, for example:
430/// ```xml
431/// <sourceCredit id="id2ffb3c99-76ce-11f0-a3ab-d79a777afc56">(<ref href="/us/act/1954-08-16/ch736">Aug. 16, 1954, ch. 736</ref>, <ref href="/us/stat/68A/3">68A Stat. 3</ref>; <ref href="/us/pl/99/514/s2">Pub. L. 99–514, § 2</ref>, <date date="1986-10-22">Oct. 22, 1986</date>, <ref href="/us/stat/100/2095">100 Stat. 2095</ref>.)</sourceCredit>
432/// ```
433/// They do not actually state change information, and the source credits are not guaranteed to cover all the bills that provided changes to the document. They are better thought of as an incomplete list of pointers. While useful, it is easy to confuse these with the full, definitive listing of bills that created the Element.
434#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
435#[serde(rename_all = "snake_case")]
436pub struct SourceCredit {
437 /// The `<ref>` elements of the source credit
438 pub ref_pairs: Vec<RefPair>,
439}
440
441/// A reference pair within a source credit
442///
443/// Contains the identifier and description for a single reference within
444/// a source credit attribution.
445#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
446#[serde(rename_all = "snake_case")]
447pub struct RefPair {
448 /// The ID of the `<ref>` source credit
449 pub ref_id: String,
450 /// The description of the source credit
451 pub description: String,
452}
453
454/// Metadata and content for a single element in a USLM document
455///
456/// This struct contains all the information about a legislative element,
457/// including its position in the document hierarchy, identification paths,
458/// display information, and text content.
459///
460/// # Path Systems
461///
462/// Each element has two types of paths:
463///
464/// 1. **Structural Path** (`path`): Includes all hierarchy elements, even
465/// non-USLM ones like `Level`. Example:
466/// `uscode/title_26/subtitle_k/chapter_100/section_9834/level_1`
467///
468/// 2. **USLM ID** (`uslm_id`): Official USLM identifier following standard format.
469/// Only present for elements in the USLM scheme. Example: `/us/usc/t26/s9834/a/1`
470///
471/// Combining the structural path with the date provides a unique identifier for
472/// any element across all versions of the document.
473#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
474#[serde(rename_all = "snake_case")]
475pub struct ElementData {
476 /// The full structural path in the document for the element
477 ///
478 /// This includes all structural elements like Level that may not be part of the USLM identifier.
479 /// Note that combining this with the date field gives a unique identifier for the document
480 /// For example:
481 ///
482 /// uscode/title_26/subtitle_k/chapter_100/subchapter_c/section_9834/level_1
483 ///
484 pub path: String,
485
486 /// The type of this element in the legislative hierarchy
487 pub element_type: ElementType,
488
489 /// The type of document this element belongs to
490 pub document_type: DocumentType,
491
492 /// The date this version of the document was published
493 pub date: Date,
494
495 // Display
496 /// The raw number or identifier value (e.g., "174", "a", "1")
497 pub number_value: String,
498
499 /// The formatted display version of the number (may include prefixes/suffixes)
500 pub number_display: String,
501
502 /// A human-readable name for this element (e.g., "Section 174")
503 pub verbose_name: String,
504
505 // Content Fields
506 // These are the fields that we need to diff upon
507 /// The heading or title text of the element
508 pub heading: Option<String>,
509
510 /// The words at the start of the element that appear before any enumerated items
511 pub chapeau: Option<String>,
512
513 /// A clause imposing a qualification, condition, or restriction
514 pub proviso: Option<String>,
515
516 /// The main text content of the element
517 pub content: Option<String>,
518
519 /// Text content that appears after all child elements
520 pub continuation: Option<String>,
521
522 // Metadata
523 /// The USLM-standard identifier path for this element
524 ///
525 /// This follows the official USLM path format and excludes structural-only elements.
526 /// For example: `/us/usc/t26/s1/a/1` or `/us/pl/119-21/s1/a`
527 ///
528 /// This is computed according to USLM standards for elements that are part of the
529 /// USLM identifier scheme. If the XML provides an `identifier` attribute, it is
530 /// validated to match this generated path.
531 ///
532 /// Structural-only elements like Level will have None here, as they are not part
533 /// of the USLM identifier scheme.
534 pub uslm_id: Option<String>,
535
536 /// The USLM `id` attribute for an element
537 ///
538 /// Takes the form of a UUID, not guaranteed to exist
539 pub uslm_uuid: Option<String>,
540
541 /// Source credits and references for this element
542 pub source_credits: Vec<SourceCredit>,
543 //pub page_data: Option<PageData>, // TODO implement
544}
545
546impl ElementData {
547 /// Retrieve the text content for a specific field
548 ///
549 /// # Arguments
550 ///
551 /// * `field` - The text content field to retrieve
552 ///
553 /// # Returns
554 ///
555 /// Returns `Some(String)` if the field has content, or `None` if the field
556 /// is empty for this element.
557 pub fn get_text_content(&self, field: TextContentField) -> Option<String> {
558 match field {
559 TextContentField::Heading => self.heading.clone(),
560 TextContentField::Chapeau => self.chapeau.clone(),
561 TextContentField::Proviso => self.proviso.clone(),
562 TextContentField::Content => self.content.clone(),
563 TextContentField::Continuation => self.continuation.clone(),
564 }
565 }
566}
567
568/// A hierarchical element in a USLM document tree
569///
570/// This struct represents a single element in a legislative document along with
571/// all of its child elements, forming a tree structure that mirrors the document's
572/// hierarchical organization.
573///
574/// # Structure
575///
576/// - `data`: Contains all metadata and text content for this element
577/// - `children`: All direct child elements in document order
578///
579/// # Examples
580///
581/// A typical USC section might have a structure like:
582///
583/// ```text
584/// Section 174 (USLMElement)
585/// ├─ data: ElementData { element_type: Section, heading: "Research expenditures", ... }
586/// └─ children:
587/// ├─ Subsection (a) (USLMElement)
588/// │ └─ children: [Paragraph (1), Paragraph (2), ...]
589/// └─ Subsection (b) (USLMElement)
590/// └─ children: [...]
591/// ```
592///
593/// # Tree Navigation
594///
595/// Use the `find()` method to locate specific elements within the tree by their
596/// structural path.
597#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
598#[serde(rename_all = "snake_case")]
599pub struct USLMElement {
600 /// The metadata and content for this element
601 pub data: ElementData,
602
603 /// Child elements in document order
604 pub children: Vec<USLMElement>,
605}
606
607impl USLMElement {
608 /// Search for an element by its structural path
609 ///
610 /// Recursively searches this element and all descendants for an element
611 /// with the specified path. The path must be a fully qualified structural
612 /// path (e.g., "uscode/title_7/chapter_1/section_1").
613 ///
614 /// # Arguments
615 ///
616 /// * `path` - The full structural path of the element to find
617 ///
618 /// # Returns
619 ///
620 /// Returns `Some(&USLMElement)` if an element with the matching path is found,
621 /// or `None` if no such element exists in this tree.
622 ///
623 /// # Examples
624 ///
625 /// ```
626 /// # use words_to_data::uslm::parser::parse;
627 /// # let element = parse("tests/test_data/usc/2025-07-18/usc07.xml", "2025-07-18").unwrap();
628 /// // Find a specific section
629 /// let section = element.find("uscode/title_7/chapter_1/section_2");
630 /// assert!(section.is_some());
631 ///
632 /// // Non-existent path returns None
633 /// let missing = element.find("uscode/title_99");
634 /// assert!(missing.is_none());
635 /// ```
636 pub fn find(&self, path: &str) -> Option<&USLMElement> {
637 if path == self.data.path.as_str() {
638 return Some(self);
639 }
640 let remaining_path = path.strip_prefix(self.data.path.as_str())?;
641 let next_step: Vec<&str> = remaining_path.split("/").collect();
642 assert!(next_step.len() > 1);
643
644 let child_id = next_step[1];
645 let child_vec: Vec<&USLMElement> = self
646 .children
647 .iter()
648 .filter(|c| c.data.path.ends_with(child_id))
649 .collect();
650 if child_vec.is_empty() {
651 None
652 } else {
653 assert!(child_vec.len() == 1);
654 child_vec[0].find(path)
655 }
656 }
657
658 /// Merge the children of one node into another
659 pub fn merge_children_mut(&mut self, other: &mut USLMElement) {
660 self.children.append(&mut other.children);
661 }
662}