use std::str::FromStr;
use thiserror::Error;
use crate::{
io::load_xml_file,
uslm::{
self, BillType, DocumentType, ElementData, ElementType, RefPair, SourceCredit, USLMElement,
USLMError, path::should_include_in_uslm_path,
},
};
pub use crate::uslm::path::generate_structural_path;
/// Errors that can be produced while loading and parsing a USLM XML document.
///
/// Variants carrying `#[from]` are created automatically by `?` from the wrapped
/// error type.
#[derive(Error, Debug)]
pub enum ParseError {
/// The XML text was rejected by `roxmltree`.
#[error("XML parsing error: {0}")]
Xml(#[from] roxmltree::Error),
/// Wraps a `std::io::Error`.
#[error("IO error: {0}")]
Io(#[from] std::io::Error),
/// Wraps a `std::str::Utf8Error`.
#[error("Invalid UTF-8: {0}")]
Utf8(#[from] std::str::Utf8Error),
/// Wraps a `USLMError` raised by the `uslm` module.
/// Note that the display message does not include the wrapped error.
#[error("USLM Error")]
USLMDataError(#[from] USLMError),
/// The document root could not be resolved, or a `pLaw` document is missing
/// the preface / `docNumber` information needed to identify it.
#[error("Unsupported Document Type {0}")]
UnsupportedDocumentType(String),
/// A date value was invalid.
// NOTE(review): not constructed in the visible part of this file; presumably
// produced by the date-parsing helper — confirm.
#[error("Invalid Date")]
InvalidDate,
/// Wraps a `serde_json::Error`.
#[error("Serialization Error")]
SerializationError(#[from] serde_json::Error),
/// An element of a known type lacked data required to build it (for example
/// a missing `<num>` tag or a missing XML identifier); the payload describes
/// what was missing.
#[error("Unable to parse element {0}")]
UnableToParseElement(String),
/// The element's type resolved to `ElementType::Unknown`. `parse_element`
/// skips children that fail with this error.
#[error("Unknown element")]
UnknownElement,
/// The element carries `status="repealed"`. `parse_element` skips children
/// that fail with this error.
#[error("Repealed element")]
RepealedElement,
/// The element carries `status="reserved"`. `parse_element` skips children
/// that fail with this error.
#[error("Reserved element")]
ReservedElement,
}
/// Text pulled from the optional textual child elements of a single node.
///
/// Each field holds the text of the first direct child with the matching tag
/// name, or `None` when that child is absent or has no text.
struct TextContents {
/// Text of the `<heading>` child.
pub heading: Option<String>,
/// Text of the `<chapeau>` child.
pub chapeau: Option<String>,
/// Text of the `<proviso>` child.
pub proviso: Option<String>,
/// Text of the `<content>` child.
pub content: Option<String>,
/// Text of the `<continuation>` child.
pub continuation: Option<String>,
}
/// The number of an element as produced by `extract_number`.
pub struct Number {
/// Machine-oriented value: the `value` attribute of `<num>` when present,
/// otherwise a value derived from document metadata or the element's `id`.
pub value: String,
/// Display form: the text of `<num>` when present; empty for document-level
/// elements, and `"Level <id>"` for `<num>`-less level elements.
pub display: String,
}
/// Result alias used throughout this module, with [`ParseError`] as the error type.
pub type Result<T> = std::result::Result<T, ParseError>;
/// Returns `true` when `node` has an attribute named `attr` whose value is
/// exactly `val`; a missing attribute yields `false`.
fn check_attr(node: &roxmltree::Node, attr: &str, val: &str) -> bool {
    matches!(node.attribute(attr), Some(found) if found == val)
}
/// Parses USLM XML held in memory into a [`USLMElement`] tree.
///
/// The first descendant named `uscDoc` or `pLaw` is used as the document root.
/// For `pLaw` the document type is a public-law bill identified by the text of
/// `<preface>/<docNumber>`; otherwise it is resolved from the root tag name and
/// the optional text of `<meta>/<type>`.
///
/// `date` is forwarded unchanged to `parse_element`.
///
/// # Errors
///
/// * [`ParseError::Xml`] when the text is not well-formed XML.
/// * [`ParseError::UnsupportedDocumentType`] when no root can be found or a
///   `pLaw` root lacks `<preface>`, `<docNumber>`, or the `docNumber` text.
/// * Any error produced by `DocumentType::from_str` or `parse_element`.
pub fn parse_from_str(xml_str: &str, date: &str) -> Result<USLMElement> {
    let doc = roxmltree::Document::parse(xml_str)?;

    let Some(root) = doc
        .descendants()
        .find(|n| n.tag_name().name() == "uscDoc" || n.has_tag_name("pLaw"))
    else {
        return Err(ParseError::UnsupportedDocumentType(
            "Can't resolve top-level document".to_string(),
        ));
    };

    let document_type = if root.tag_name().name() == "pLaw" {
        let unsupported = |reason: &str| ParseError::UnsupportedDocumentType(reason.to_string());

        // Checks run outermost-first so the reported reason names the first
        // missing piece.
        let preface = root
            .children()
            .find(|n| n.has_tag_name("preface"))
            .ok_or_else(|| unsupported("pLaw missing preface tag"))?;
        let doc_number = preface
            .children()
            .find(|n| n.has_tag_name("docNumber"))
            .ok_or_else(|| unsupported("pLaw missing docNumber"))?;
        let bill_id = doc_number
            .text()
            .ok_or_else(|| unsupported("pLaw missing docNumber text"))?;

        DocumentType::Bill {
            bill_type: BillType::PublicLaw,
            bill_id: bill_id.to_string(),
        }
    } else {
        // The `<meta>/<type>` text is optional; its absence is passed on as `None`.
        let type_str: Option<&str> = root
            .children()
            .find(|n| n.has_tag_name("meta"))
            .and_then(|meta| meta.children().find(|n| n.has_tag_name("type")))
            .and_then(|dc_type| dc_type.text());
        DocumentType::from_str(root.tag_name().name(), type_str)?
    };

    parse_element(root, &document_type, date, None, None, None, 0)
}
/// Loads the XML file at `path` with `load_xml_file` and parses it with
/// [`parse_from_str`], forwarding `date` unchanged.
///
/// # Errors
///
/// Returns the loader's error (converted into [`ParseError`]) or any error
/// from [`parse_from_str`].
pub fn parse(path: &str, date: &str) -> Result<USLMElement> {
    parse_from_str(&load_xml_file(path)?, date)
}
/// Converts an optional borrowed string into an optional owned `String`.
fn rewrap_str(s: Option<&str>) -> Option<String> {
    match s {
        Some(text) => Some(text.to_owned()),
        None => None,
    }
}
fn extract_source_credits(node: &roxmltree::Node) -> Vec<SourceCredit> {
let source_credit_nodes = node.children().filter(|n| n.has_tag_name("sourceCredit"));
let mut result = Vec::new();
for sc_node in source_credit_nodes {
let mut current_refs = Vec::new();
for descendant in sc_node.descendants() {
if descendant.has_tag_name("ref") {
if let Some(href) = descendant.attribute("href") {
let description = descendant.text().unwrap_or("").to_string();
current_refs.push(RefPair {
ref_id: href.to_string(),
description,
});
}
} else if let Some(text) = descendant.text()
&& text.contains(';')
{
if !current_refs.is_empty() {
result.push(SourceCredit {
ref_pairs: current_refs.clone(),
});
current_refs.clear();
}
}
}
if !current_refs.is_empty() {
result.push(SourceCredit {
ref_pairs: current_refs,
});
}
}
result
}
/// Recursively converts `node` and its descendants into a [`USLMElement`].
///
/// * `document_type` – cloned into every element's data.
/// * `date` – parsed with `crate::date::date_str_to_date` for every element.
/// * `parent_name` – the parent's verbose name; when present it is prefixed to
///   this element's display number to form the verbose name.
/// * `parent_structural_path` – passed to `generate_structural_path`.
/// * `parent_uslm_path` – only forwarded to children, and only when this
///   element has no USLM id of its own.
/// * `_depth` – nesting depth; only incremented and forwarded to children.
///
/// For a `USCodeDocument` the children are read from its `<main>` child when
/// one exists, otherwise from the node itself.
///
/// # Errors
///
/// Returns [`ParseError::RepealedElement`], [`ParseError::ReservedElement`] or
/// [`ParseError::UnknownElement`] for nodes with the matching status / type;
/// children failing with one of these three are silently skipped. Any other
/// error (from this element or a child) aborts the parse and is returned.
///
/// # Panics
///
/// Panics if `ElementType::from_str` returns an error.
fn parse_element(
    node: roxmltree::Node,
    document_type: &DocumentType,
    date: &str,
    parent_name: Option<&str>,
    parent_structural_path: Option<String>,
    parent_uslm_path: Option<String>,
    _depth: usize,
) -> Result<USLMElement> {
    match node.attribute("status") {
        Some("repealed") => return Err(ParseError::RepealedElement),
        Some("reserved") => return Err(ParseError::ReservedElement),
        _ => {}
    }

    let element_type = ElementType::from_str(node.tag_name().name())
        .expect("When this expect was written, all match cases were Ok()");
    if let ElementType::Unknown = element_type {
        return Err(ParseError::UnknownElement);
    }

    let xml_identifier = node.attribute("identifier").map(String::from);
    let uslm_uuid = node.attribute("id").map(String::from);
    let number = extract_number(element_type, &node)?;

    let verbose_name = match parent_name {
        Some(prefix) => format!("{} {}", prefix, number.display),
        None => number.display.clone(),
    };

    let text_contents = extract_text_contents(&node);
    let structural_path = generate_structural_path(
        element_type,
        &number.value,
        parent_structural_path.as_deref(),
    );

    // Elements that take part in the USLM path need an identifier; only a
    // public-law document may synthesise one from its number.
    let uslm_id = if !should_include_in_uslm_path(element_type) {
        None
    } else if let Some(identifier) = xml_identifier {
        Some(identifier)
    } else if matches!(element_type, ElementType::PublicLawDocument) {
        Some(format!("/us/pl/{}", number.value))
    } else {
        return Err(ParseError::UnableToParseElement(format!(
            "XML identifier missing for element type {:?}",
            element_type
        )));
    };

    let d = crate::date::date_str_to_date(date)?;
    let source_credits = extract_source_credits(&node);

    let element_data = ElementData {
        path: structural_path.clone(),
        uslm_id: uslm_id.clone(),
        uslm_uuid,
        document_type: document_type.clone(),
        element_type,
        date: d,
        number_value: number.value,
        number_display: number.display,
        verbose_name: verbose_name.clone(),
        heading: text_contents.heading,
        chapeau: text_contents.chapeau,
        proviso: text_contents.proviso,
        content: text_contents.content,
        continuation: text_contents.continuation,
        source_credits,
    };

    // Children inherit this element's USLM id, falling back to the one
    // handed down from further up the tree.
    let inherited_uslm_path = uslm_id.or(parent_uslm_path);

    let container = if matches!(element_type, ElementType::USCodeDocument) {
        node.children()
            .find(|c| c.has_tag_name("main"))
            .unwrap_or(node)
    } else {
        node
    };

    let mut children: Vec<USLMElement> = Vec::new();
    for child in container.children() {
        let parsed = parse_element(
            child,
            document_type,
            date,
            Some(verbose_name.as_str()),
            Some(structural_path.clone()),
            inherited_uslm_path.clone(),
            _depth + 1,
        );
        match parsed {
            Ok(element) => children.push(element),
            // These three mark nodes that are intentionally left out of the tree.
            Err(
                ParseError::UnknownElement
                | ParseError::RepealedElement
                | ParseError::ReservedElement,
            ) => {}
            Err(fatal) => return Err(fatal),
        }
    }

    Ok(USLMElement {
        data: element_data,
        children,
    })
}
/// Determines the [`Number`] of `node`.
///
/// When the node has a direct `<num>` child, `value` is that child's `value`
/// attribute (empty if absent) and `display` is its text (empty if absent).
///
/// Without a `<num>` child the result depends on `element_type`:
///
/// * `USCodeDocument` – `value` is the text of `<meta>/<docNumber>`.
/// * `PublicLawDocument` – `value` is `"<congress>-<docNumber>"`, both read
///   from `<meta>`.
/// * `Level` – `value` is the `id` attribute and `display` is `"Level <id>"`.
///
/// `display` is empty for the two document types.
///
/// # Errors
///
/// Returns [`ParseError::UnableToParseElement`] when the required `<num>`,
/// `<meta>` child, metadata text, or `id` attribute is missing. Malformed
/// documents are reported this way rather than by panicking.
pub fn extract_number(element_type: ElementType, node: &roxmltree::Node) -> Result<Number> {
    if let Some(num) = node.children().find(|n| n.has_tag_name("num")) {
        return Ok(Number {
            value: num.attribute("value").map(String::from).unwrap_or_default(),
            display: num.text().map(String::from).unwrap_or_default(),
        });
    }

    match element_type {
        ElementType::USCodeDocument => Ok(Number {
            value: required_meta_text(element_type, node, "docNumber")?,
            display: String::new(),
        }),
        ElementType::PublicLawDocument => {
            let doc_number = required_meta_text(element_type, node, "docNumber")?;
            let congress = required_meta_text(element_type, node, "congress")?;
            Ok(Number {
                value: format!("{}-{}", congress, doc_number),
                display: String::new(),
            })
        }
        ElementType::Level => match node.attribute("id") {
            Some(id) => Ok(Number {
                value: String::from(id),
                display: format!("Level {}", id),
            }),
            None => Err(ParseError::UnableToParseElement(
                "<Level> element has neither a <num> or <id> field".to_string(),
            )),
        },
        _ => Err(ParseError::UnableToParseElement(format!(
            "'{:?}': missing <num> tag",
            element_type
        ))),
    }
}

/// Returns the text of the `<meta>/<tag>` element beneath `node`, or an
/// [`ParseError::UnableToParseElement`] naming the first missing piece.
fn required_meta_text(
    element_type: ElementType,
    node: &roxmltree::Node,
    tag: &str,
) -> Result<String> {
    let missing = |what: String| {
        ParseError::UnableToParseElement(format!("'{:?}': missing {}", element_type, what))
    };

    let meta = node
        .children()
        .find(|n| n.has_tag_name("meta"))
        .ok_or_else(|| missing("<meta> tag".to_string()))?;
    let field = meta
        .children()
        .find(|n| n.has_tag_name(tag))
        .ok_or_else(|| missing(format!("<{}> tag in <meta>", tag)))?;
    field
        .text()
        .map(String::from)
        .ok_or_else(|| missing(format!("text in <meta>/<{}>", tag)))
}
/// Returns an owned copy of the node's text, or `None` when there is no node
/// or the node has no text.
fn extract_text(node: Option<roxmltree::Node>) -> Option<String> {
    node.and_then(|n| n.text()).map(String::from)
}
/// Gathers the text of the `heading`, `chapeau`, `proviso`, `content` and
/// `continuation` children of `node`, using the first direct child with each
/// tag name.
fn extract_text_contents(node: &roxmltree::Node) -> TextContents {
    // Text of the first direct child named `tag`, if any.
    let child_text = |tag: &str| extract_text(node.children().find(|n| n.has_tag_name(tag)));

    let heading = child_text("heading");
    let chapeau = child_text("chapeau");
    let proviso = child_text("proviso");
    let content = child_text("content");
    let continuation = child_text("continuation");

    TextContents {
        heading,
        chapeau,
        proviso,
        content,
        continuation,
    }
}