use std::str::FromStr;
use thiserror::Error;
use crate::{
uslm::{self, BillType, DocumentType, ElementData, ElementType, USLMElement, USLMError},
utils::load_xml_file,
};
/// Errors that can be produced while loading and parsing a USLM XML document.
///
/// Variants carrying `#[from]` are created automatically by the `?` operator
/// from the wrapped error type.
#[derive(Error, Debug)]
pub enum ParseError {
/// The XML text was rejected by `roxmltree`.
#[error("XML parsing error: {0}")]
Xml(#[from] roxmltree::Error),
/// Wraps a `std::io::Error`.
#[error("IO error: {0}")]
Io(#[from] std::io::Error),
/// Wraps a `std::str::Utf8Error`.
#[error("Invalid UTF-8: {0}")]
Utf8(#[from] std::str::Utf8Error),
/// Wraps a `USLMError` from the `uslm` module. Note that the display
/// message does not include the wrapped error's text.
#[error("USLM Error")]
USLMDataError(#[from] USLMError),
/// The document's top-level element could not be resolved, or a `pLaw`
/// document lacks the `preface`/`docNumber` data needed to identify it.
/// The payload is a human-readable reason.
#[error("Unsupported Document Type {0}")]
UnsupportedDocumentType(String),
/// A date could not be interpreted.
/// NOTE(review): not constructed in this part of the file; presumably
/// raised by the date-parsing helper - confirm.
#[error("Invalid Date")]
InvalidDate,
/// Wraps a `serde_json::Error`.
#[error("Serialization Error")]
SerializationError(#[from] serde_json::Error),
/// An element was recognised but lacks data required to build it
/// (for example a missing `<num>` tag or a missing XML identifier).
/// The payload describes what was missing.
#[error("Unable to parse element {0}")]
UnableToParseElement(String),
/// The element's type resolved to `ElementType::Unknown`. `parse_element`
/// silently skips children that fail with this variant.
#[error("Unknown element")]
UnknownElement,
/// The element carries `status="repealed"`. `parse_element` silently skips
/// children that fail with this variant.
#[error("Repealed element")]
RepealedElement,
/// The element carries `status="reserved"`. `parse_element` silently skips
/// children that fail with this variant.
#[error("Reserved element")]
ReservedElement,
}
/// Text of the well-known textual child tags of an element.
///
/// Each field holds the text of the first direct child with the matching tag
/// name (see `extract_text_contents`), or `None` when that child is absent or
/// has no text.
struct TextContents {
/// Text of the first `<heading>` child.
pub heading: Option<String>,
/// Text of the first `<chapeau>` child.
pub chapeau: Option<String>,
/// Text of the first `<proviso>` child.
pub proviso: Option<String>,
/// Text of the first `<content>` child.
pub content: Option<String>,
/// Text of the first `<continuation>` child.
pub continuation: Option<String>,
}
/// The designation of an element, as produced by [`extract_number`].
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so values can be logged,
/// duplicated and compared in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Number {
    /// Machine-oriented value; used when building structural paths.
    pub value: String,
    /// Human-oriented form; used when building verbose names.
    pub display: String,
}
/// Convenience alias for results whose error type is [`ParseError`].
pub type Result<T> = std::result::Result<T, ParseError>;
/// Returns `true` when `node` has an attribute named `attr` whose value is
/// exactly `val`; a missing attribute yields `false`.
fn check_attr(node: &roxmltree::Node, attr: &str, val: &str) -> bool {
    node.attribute(attr) == Some(val)
}
/// Tells whether an element of this type gets a USLM identifier.
///
/// `Level` and `Unknown` elements are excluded; every other type is included.
fn should_include_in_uslm_path(element_type: ElementType) -> bool {
    let is_excluded = matches!(element_type, ElementType::Level | ElementType::Unknown);
    !is_excluded
}
/// Builds the structural path of an element.
///
/// The element's own segment is `<type>_<number_value>`, where `<type>` is the
/// lower-cased `Debug` name of `element_type`. When a parent path is given the
/// segment is appended to it after a `/`; otherwise the segment is the path.
pub fn generate_structural_path(
    element_type: ElementType,
    number_value: &str,
    parent_structural_path: Option<&str>,
) -> String {
    let kind = format!("{:?}", element_type).to_lowercase();
    let segment = format!("{}_{}", kind, number_value);
    if let Some(parent) = parent_structural_path {
        format!("{}/{}", parent, segment)
    } else {
        segment
    }
}
/// Loads the XML file at `path` and parses it into a tree of [`USLMElement`]s.
///
/// The top-level element is the first descendant named `uscDoc` or `pLaw`.
/// For `pLaw` the document type is a public-law bill whose id is the text of
/// `preface/docNumber`; otherwise it is derived from the tag name and the text
/// of `meta/type` (if present). `date` is forwarded unchanged to
/// `parse_element`.
///
/// # Errors
///
/// Returns [`ParseError::UnsupportedDocumentType`] when no top-level element is
/// found or a `pLaw` lacks `preface`, `docNumber` or its text, and propagates
/// any error from file loading, XML parsing, document-type resolution or
/// element parsing.
pub fn parse(path: &str, date: &str) -> Result<USLMElement> {
    let xml_str = load_xml_file(path)?;
    let doc = roxmltree::Document::parse(&xml_str)?;

    // Small factory so every failure below reads as a single line.
    let unsupported = |reason: &str| ParseError::UnsupportedDocumentType(reason.to_string());

    let root = doc
        .descendants()
        .find(|n| n.tag_name().name() == "uscDoc" || n.has_tag_name("pLaw"))
        .ok_or_else(|| unsupported("Can't resolve top-level document"))?;

    let document_type = if root.tag_name().name() == "pLaw" {
        let preface = root
            .children()
            .find(|n| n.has_tag_name("preface"))
            .ok_or_else(|| unsupported("pLaw missing preface tag"))?;
        let doc_number = preface
            .children()
            .find(|n| n.has_tag_name("docNumber"))
            .ok_or_else(|| unsupported("pLaw missing docNumber"))?;
        let bill_id = doc_number
            .text()
            .ok_or_else(|| unsupported("pLaw missing docNumber text"))?;
        DocumentType::Bill {
            bill_type: BillType::PublicLaw,
            bill_id: bill_id.to_string(),
        }
    } else {
        // The `meta/type` text is optional; its absence is passed on as `None`.
        let type_str: Option<&str> = root
            .children()
            .find(|n| n.has_tag_name("meta"))
            .and_then(|meta| meta.children().find(|n| n.has_tag_name("type")))
            .and_then(|n| n.text());
        DocumentType::from_str(root.tag_name().name(), type_str)?
    };

    parse_element(root, &document_type, date, None, None, None, 0)
}
/// Converts an optional borrowed string into an optional owned `String`.
fn rewrap_str(s: Option<&str>) -> Option<String> {
    s.map(str::to_owned)
}
/// Recursively converts an XML node and its descendants into a [`USLMElement`].
///
/// * `node` - the XML node to convert.
/// * `document_type` - cloned into every element produced.
/// * `date` - date string, converted with `crate::utils::date_str_to_date`.
/// * `parent_name` - verbose name of the parent; prefixed to this element's
///   display number to form its own verbose name.
/// * `parent_structural_path` - structural path of the parent, if any.
/// * `parent_uslm_path` - nearest ancestor USLM id; handed down to children
///   when this element has no USLM id of its own.
/// * `_depth` - recursion depth, incremented for each level of children.
///
/// For a `USCodeDocument` the children are read from its `<main>` child when
/// one exists; for every other type they are read from the node itself.
/// Children that fail with `UnknownElement`, `RepealedElement` or
/// `ReservedElement` are skipped; any other child error aborts the parse.
///
/// # Errors
///
/// * [`ParseError::RepealedElement`] / [`ParseError::ReservedElement`] when the
///   node's `status` attribute is `repealed` / `reserved`.
/// * [`ParseError::UnknownElement`] when the tag maps to `ElementType::Unknown`.
/// * [`ParseError::UnableToParseElement`] when the tag name cannot be converted
///   to an `ElementType`, or when an element that needs a USLM id has no
///   `identifier` attribute (public-law documents get a synthesised id).
/// * Any error from `extract_number` or the date conversion.
fn parse_element(
    node: roxmltree::Node,
    document_type: &DocumentType,
    date: &str,
    parent_name: Option<&str>,
    parent_structural_path: Option<String>,
    parent_uslm_path: Option<String>,
    _depth: usize,
) -> Result<USLMElement> {
    if check_attr(&node, "status", "repealed") {
        return Err(ParseError::RepealedElement);
    }
    if check_attr(&node, "status", "reserved") {
        return Err(ParseError::ReservedElement);
    }

    // Report a failed tag conversion as an error instead of panicking: this
    // function already returns `Result`, and the input is external XML.
    let tag = node.tag_name().name();
    let element_type = ElementType::from_str(tag).map_err(|_| {
        ParseError::UnableToParseElement(format!("unrecognised tag <{}>", tag))
    })?;
    if matches!(element_type, ElementType::Unknown) {
        return Err(ParseError::UnknownElement);
    }

    let xml_identifier = rewrap_str(node.attribute("identifier"));
    let uslm_uuid = rewrap_str(node.attribute("id"));
    let number = extract_number(element_type, &node)?;
    let verbose_name = match parent_name {
        Some(parent) => format!("{} {}", parent, number.display),
        None => number.display.clone(),
    };
    let text_contents = extract_text_contents(&node);
    let structural_path = generate_structural_path(
        element_type,
        &number.value,
        parent_structural_path.as_deref(),
    );

    let uslm_id = if should_include_in_uslm_path(element_type) {
        match (xml_identifier, element_type) {
            (Some(xml_id), _) => Some(xml_id),
            // Public-law documents carry no identifier; synthesise one.
            (None, ElementType::PublicLawDocument) => Some(format!("/us/pl/{}", number.value)),
            (None, _) => {
                return Err(ParseError::UnableToParseElement(format!(
                    "XML identifier missing for element type {:?}",
                    element_type
                )));
            }
        }
    } else {
        None
    };

    let d = crate::utils::date_str_to_date(date)?;

    // US Code documents keep their content under <main>; fall back to the
    // node itself when <main> is absent.
    let cont_node = if matches!(element_type, uslm::ElementType::USCodeDocument) {
        node.children()
            .find(|n| n.has_tag_name("main"))
            .unwrap_or(node)
    } else {
        node
    };

    // Loop-invariant: children inherit this element's USLM id, or failing
    // that the one inherited from further up the tree.
    let inherited_uslm_path = uslm_id.as_ref().or(parent_uslm_path.as_ref());

    let mut children: Vec<USLMElement> = Vec::new();
    for child in cont_node.children() {
        let parsed = parse_element(
            child,
            document_type,
            date,
            Some(verbose_name.as_str()),
            Some(structural_path.clone()),
            inherited_uslm_path.cloned(),
            _depth + 1,
        );
        match parsed {
            Ok(element) => children.push(element),
            // These outcomes mean "nothing to record for this child".
            Err(ParseError::UnknownElement)
            | Err(ParseError::RepealedElement)
            | Err(ParseError::ReservedElement) => {}
            Err(other) => return Err(other),
        }
    }

    Ok(USLMElement {
        data: ElementData {
            path: structural_path,
            uslm_id,
            uslm_uuid,
            document_type: document_type.clone(),
            element_type,
            date: d,
            number_value: number.value,
            number_display: number.display,
            verbose_name,
            heading: text_contents.heading,
            chapeau: text_contents.chapeau,
            proviso: text_contents.proviso,
            content: text_contents.content,
            continuation: text_contents.continuation,
            source_credits: Vec::new(),
        },
        children,
    })
}
/// Determines the [`Number`] (machine value and display text) of an element.
///
/// When the node has a `<num>` child, `value` is that child's `value`
/// attribute (empty when absent) and `display` is its text (empty when
/// absent). Without a `<num>` child the result depends on `element_type`:
///
/// * `USCodeDocument` - `value` is the text of `meta/docNumber`.
/// * `PublicLawDocument` - `value` is `<congress>-<docNumber>`, both read from
///   `meta`.
/// * `Level` - `value` is the `id` attribute and `display` is `Level <id>`.
///
/// # Errors
///
/// Returns [`ParseError::UnableToParseElement`] when the data listed above is
/// missing (including a missing `<meta>`, `<docNumber>` or `<congress>` tag or
/// one without text), or when any other element type has no `<num>` child.
/// Malformed documents are reported this way rather than causing a panic.
pub fn extract_number(element_type: ElementType, node: &roxmltree::Node) -> Result<Number> {
    if let Some(num) = node.children().find(|n| n.has_tag_name("num")) {
        return Ok(Number {
            value: num.attribute("value").map(String::from).unwrap_or_default(),
            display: extract_text(Some(num)).unwrap_or_default(),
        });
    }

    match element_type {
        ElementType::USCodeDocument => Ok(Number {
            value: meta_field_text(node, "docNumber", element_type)?,
            display: String::new(),
        }),
        ElementType::PublicLawDocument => {
            let doc_number = meta_field_text(node, "docNumber", element_type)?;
            let congress = meta_field_text(node, "congress", element_type)?;
            Ok(Number {
                value: format!("{}-{}", congress, doc_number),
                display: String::new(),
            })
        }
        ElementType::Level => node
            .attribute("id")
            .map(|id| Number {
                value: String::from(id),
                display: format!("Level {}", id),
            })
            .ok_or_else(|| {
                ParseError::UnableToParseElement(
                    "<Level> element has neither a <num> or <id> field".to_string(),
                )
            }),
        _ => Err(ParseError::UnableToParseElement(format!(
            "'{:?}': missing <num> tag",
            element_type
        ))),
    }
}

/// Returns the text of the `<field>` child inside `node`'s `<meta>` child,
/// or an `UnableToParseElement` error naming whichever piece is missing.
fn meta_field_text(
    node: &roxmltree::Node,
    field: &str,
    element_type: ElementType,
) -> Result<String> {
    let meta = node
        .children()
        .find(|n| n.has_tag_name("meta"))
        .ok_or_else(|| {
            ParseError::UnableToParseElement(format!(
                "'{:?}': missing <meta> tag",
                element_type
            ))
        })?;
    let field_node = meta
        .children()
        .find(|n| n.has_tag_name(field))
        .ok_or_else(|| {
            ParseError::UnableToParseElement(format!(
                "'{:?}': <meta> is missing <{}> tag",
                element_type, field
            ))
        })?;
    extract_text(Some(field_node)).ok_or_else(|| {
        ParseError::UnableToParseElement(format!(
            "'{:?}': <{}> tag in <meta> has no text",
            element_type, field
        ))
    })
}
/// Returns an owned copy of the node's text as reported by `Node::text`, or
/// `None` when no node is given or the node has no text.
fn extract_text(node: Option<roxmltree::Node>) -> Option<String> {
    node.and_then(|n| n.text()).map(String::from)
}
/// Collects the text of the `heading`, `chapeau`, `proviso`, `content` and
/// `continuation` children of `node`.
///
/// For each tag only the first matching direct child is consulted; a missing
/// child or a child without text yields `None` for that field.
fn extract_text_contents(node: &roxmltree::Node) -> TextContents {
    let first_child_text =
        |tag: &str| extract_text(node.children().find(|child| child.has_tag_name(tag)));

    TextContents {
        heading: first_child_text("heading"),
        chapeau: first_child_text("chapeau"),
        proviso: first_child_text("proviso"),
        content: first_child_text("content"),
        continuation: first_child_text("continuation"),
    }
}