pub mod batch;
pub mod bloom;
pub mod error;
pub mod index;
pub mod parallel;
pub mod persist;
#[doc(hidden)]
pub mod simd;
pub mod xpath;
pub use bloom::TagBloom;
pub use error::{Result, SimdXmlError};
pub use index::XmlIndex;
pub use persist::OwnedXmlIndex;
pub use xpath::CompiledXPath;
pub use xpath::XPathResult;
/// Parses `input` into an [`XmlIndex`], choosing the parser from a quick look at the data.
///
/// At most the first 4096 bytes are sampled. When that sample contains more than five
/// `"` bytes per `<` byte, parsing is delegated to `index::structural::parse_two_stage`;
/// otherwise `index::structural::parse_scalar` is used. A sample without any `<` is
/// counted as having one, so the ratio is always well defined.
///
/// # Errors
///
/// Returns whatever error the selected parser reports.
pub fn parse(input: &[u8]) -> Result<XmlIndex<'_>> {
    /// Upper bound on how many leading bytes are inspected.
    const SAMPLE_LIMIT: usize = 4096;
    /// Quotes-per-`<` ratio above which the two-stage parser is selected.
    const QUOTE_RATIO_THRESHOLD: f64 = 5.0;

    let head = &input[..input.len().min(SAMPLE_LIMIT)];
    let occurrences = |byte: u8| memchr::memchr_iter(byte, head).count();

    // Clamp the divisor to 1 so a sample with no `<` cannot divide by zero.
    let tag_opens = occurrences(b'<').max(1);
    let quotes = occurrences(b'"');
    let ratio = quotes as f64 / tag_opens as f64;

    if ratio > QUOTE_RATIO_THRESHOLD {
        return index::structural::parse_two_stage(input);
    }
    index::structural::parse_scalar(input)
}
/// Parses `input` for use with the XPath expression `xpath_str`.
///
/// The expression is compiled first. If the compiled query reports a set of
/// interesting names, parsing goes through `index::lazy::parse_for_query` with those
/// names; if it reports none, this falls back to [`parse`].
///
/// # Errors
///
/// Returns the compile error when `xpath_str` cannot be compiled, or the error of
/// whichever parser ends up being called.
pub fn parse_for_xpath<'a>(input: &'a [u8], xpath_str: &str) -> Result<XmlIndex<'a>> {
    let query = CompiledXPath::compile(xpath_str)?;

    let Some(names) = query.interesting_names() else {
        // No name set available for this query: use the general entry point.
        return parse(input);
    };
    index::lazy::parse_for_query(input, &names)
}
/// Loads the index stored next to `xml_path`, or parses the XML and stores a fresh one.
///
/// The index file is `xml_path` with its extension replaced by `sxi`.
///
/// * If that file exists, `persist::load_index` is tried. Success is returned as is.
///   A `SimdXmlError::StaleSxi` failure falls through to re-parsing; every other
///   failure is returned to the caller.
/// * Otherwise (or after a stale index) the XML file is read, parsed with [`parse`],
///   its name index is built, the result is written with `persist::serialize_index`,
///   and the index is loaded back through `persist::load_index_with_bytes`, which
///   takes ownership of the XML bytes.
///
/// # Errors
///
/// Returns an error when reading the XML file fails, when parsing fails, or when
/// any of the `persist` calls fails (other than the stale-index case above).
pub fn load_or_parse(xml_path: impl AsRef<std::path::Path>) -> Result<OwnedXmlIndex> {
    let source = xml_path.as_ref();
    let cache = source.with_extension("sxi");

    if cache.exists() {
        match persist::load_index(&cache, source) {
            // A stale index is not fatal: rebuild it below.
            Err(SimdXmlError::StaleSxi) => {}
            // Success and all other failures go straight back to the caller.
            outcome => return outcome,
        }
    }

    let bytes = std::fs::read(source)?;
    let mut parsed = parse(&bytes)?;
    parsed.build_name_index();
    persist::serialize_index(&parsed, &bytes, &cache)?;
    // `parsed` borrows `bytes`; it is not used past this point, so the bytes can move.
    persist::load_index_with_bytes(&cache, bytes)
}
// Convenience XPath entry points on the index.
impl<'a> XmlIndex<'a> {
    /// Parses and evaluates `xpath_expr`, returning a full [`xpath::XPathResult`].
    ///
    /// `ensure_indices` is called first when [`Self::needs_indices`] says the
    /// expression requires it. A relative location path is evaluated with the document
    /// element as context node and wrapped in `XPathResult::NodeSet`; everything else
    /// (including a relative path on an index with no document element) is handed to
    /// `xpath::eval_xpath`.
    ///
    /// # Errors
    ///
    /// Returns the parse error for a malformed expression, or the evaluator's error.
    pub fn eval(&mut self, xpath_expr: &str) -> Result<xpath::XPathResult> {
        let expr = xpath::parse_xpath(xpath_expr)?;
        if Self::needs_indices(&expr) {
            self.ensure_indices();
        }

        let is_relative_path =
            matches!(&expr, xpath::XPathExpr::LocationPath(path) if !path.absolute);
        if is_relative_path {
            if let Some(root) = self.document_element() {
                let nodes =
                    xpath::evaluate_from_context(self, &expr, xpath::XPathNode::Element(root))?;
                return Ok(xpath::XPathResult::NodeSet(nodes));
            }
        }
        xpath::eval_xpath(self, &expr)
    }

    /// Evaluates `xpath_expr` and returns one borrowed string slice per matched node.
    ///
    /// * Element nodes yield `raw_xml` of the element index.
    /// * Text nodes yield `text_content` of the corresponding text range.
    /// * Attribute and namespace nodes both yield `raw_tag` of the tag index they carry.
    ///
    /// # Errors
    ///
    /// Propagates any error from [`Self::xpath`].
    pub fn xpath_raw(&'a self, xpath_expr: &str) -> Result<Vec<&'a str>> {
        let nodes = self.xpath(xpath_expr)?;
        let mut slices = Vec::with_capacity(nodes.len());
        for node in &nodes {
            let slice = match *node {
                xpath::XPathNode::Element(idx) => self.raw_xml(idx),
                xpath::XPathNode::Text(idx) => self.text_content(&self.text_ranges[idx]),
                xpath::XPathNode::Attribute(tag_idx, _)
                | xpath::XPathNode::Namespace(tag_idx, _) => self.raw_tag(tag_idx),
            };
            slices.push(slice);
        }
        Ok(slices)
    }

    /// Evaluates `xpath_expr` and returns an owned string per matched node.
    ///
    /// * Element nodes yield `all_text` of the element index.
    /// * Text nodes yield a copy of their text content.
    /// * Attribute nodes yield a copy of `raw_tag` for the tag index they carry.
    ///   NOTE(review): this is the tag text, not an attribute value — confirm intended.
    /// * Namespace nodes contribute nothing, so the result can be shorter than the
    ///   matched node set.
    ///
    /// # Errors
    ///
    /// Propagates any error from [`Self::xpath`].
    pub fn xpath_string(&self, xpath_expr: &str) -> Result<Vec<String>> {
        let nodes = self.xpath(xpath_expr)?;
        let mut strings = Vec::with_capacity(nodes.len());
        strings.extend(nodes.iter().filter_map(|node| match *node {
            xpath::XPathNode::Element(idx) => Some(self.all_text(idx)),
            xpath::XPathNode::Text(idx) => {
                Some(self.text_content(&self.text_ranges[idx]).to_string())
            }
            xpath::XPathNode::Attribute(tag_idx, _) => Some(self.raw_tag(tag_idx).to_string()),
            xpath::XPathNode::Namespace(_, _) => None,
        }));
        Ok(strings)
    }

    /// Evaluates `xpath_expr` and passes the matched nodes to `xpath::extract_text`.
    ///
    /// # Errors
    ///
    /// Returns the parse error, the evaluation error, or the extraction error.
    pub fn xpath_text(&'a self, xpath_expr: &str) -> Result<Vec<&'a str>> {
        let parsed = xpath::parse_xpath(xpath_expr)?;
        let matched = self.eval_with_doc_context(&parsed)?;
        xpath::extract_text(self, &matched)
    }

    /// Evaluates `xpath_expr` and returns the matched nodes.
    ///
    /// Relative location paths are evaluated from the document element; see
    /// [`Self::eval_with_doc_context`].
    ///
    /// # Errors
    ///
    /// Returns the parse error for a malformed expression, or the evaluator's error.
    pub fn xpath(&self, xpath_expr: &str) -> Result<Vec<xpath::XPathNode>> {
        let parsed = xpath::parse_xpath(xpath_expr)?;
        self.eval_with_doc_context(&parsed)
    }

    /// Evaluates an already parsed expression, anchoring relative paths at the
    /// document element.
    ///
    /// * A relative location path is evaluated with the document element as context
    ///   node, or through `xpath::evaluate` when there is no document element.
    /// * A union evaluates every branch through this same method, concatenates the
    ///   results, stably sorts them by the index each node carries (at equal index:
    ///   element, then text, then attribute, then namespace) and removes adjacent
    ///   duplicates. Only element and text nodes are compared for duplicates;
    ///   attribute and namespace nodes are always kept.
    /// * Any other expression goes to `xpath::evaluate` unchanged.
    fn eval_with_doc_context(&self, expr: &xpath::XPathExpr) -> Result<Vec<xpath::XPathNode>> {
        match expr {
            xpath::XPathExpr::LocationPath(path) if !path.absolute => {
                match self.document_element() {
                    Some(root) => {
                        xpath::evaluate_from_context(self, expr, xpath::XPathNode::Element(root))
                    }
                    None => xpath::evaluate(self, expr),
                }
            }
            xpath::XPathExpr::Union(branches) => {
                let mut merged = Vec::new();
                for branch in branches {
                    merged.extend(self.eval_with_doc_context(branch)?);
                }

                // Stable sort: nodes with equal keys keep their evaluation order.
                merged.sort_by_key(|node| match *node {
                    xpath::XPathNode::Element(at) => (at, 0u32),
                    xpath::XPathNode::Text(at) => (at, 1),
                    xpath::XPathNode::Attribute(at, _) => (at, 2),
                    xpath::XPathNode::Namespace(at, _) => (at, 3),
                });

                merged.dedup_by(|next, prev| match (&*next, &*prev) {
                    (xpath::XPathNode::Element(x), xpath::XPathNode::Element(y))
                    | (xpath::XPathNode::Text(x), xpath::XPathNode::Text(y)) => x == y,
                    // Mixed kinds, attributes and namespaces are never merged.
                    _ => false,
                });
                Ok(merged)
            }
            _ => xpath::evaluate(self, expr),
        }
    }

    /// Reports whether evaluating `expr` requires `ensure_indices` to have run.
    ///
    /// True for a location path with more than one step, and for a union, filter
    /// path, global filter or function call that contains such a path among its
    /// inspected sub-expressions. Every other expression kind reports `false`.
    fn needs_indices(expr: &xpath::XPathExpr) -> bool {
        match expr {
            xpath::XPathExpr::LocationPath(path) => path.steps.len() > 1,
            xpath::XPathExpr::Union(branches) => branches.iter().any(Self::needs_indices),
            xpath::XPathExpr::FunctionCall(_, arguments) => {
                arguments.iter().any(Self::needs_indices)
            }
            xpath::XPathExpr::FilterPath(base, _) => Self::needs_indices(base),
            xpath::XPathExpr::GlobalFilter(base, _) => Self::needs_indices(base),
            _ => false,
        }
    }

    /// Returns the index of the first depth-0 tag that is an open or self-closing
    /// tag, or `None` when the index contains no such tag.
    fn document_element(&self) -> Option<usize> {
        for i in 0..self.tag_count() {
            if self.depths[i] != 0 {
                continue;
            }
            let kind = &self.tag_types[i];
            if *kind == index::TagType::Open || *kind == index::TagType::SelfClose {
                return Some(i);
            }
        }
        None
    }

    /// Evaluates `expr_str` through `xpath::eval_expr_with_doc` on this index.
    ///
    /// # Errors
    ///
    /// Returns whatever error that evaluator reports.
    pub fn eval_expr(&self, expr_str: &str) -> Result<xpath::StandaloneResult> {
        xpath::eval_expr_with_doc(self, expr_str)
    }

    /// Evaluates `expr_str` with the element at `context_idx` as context node.
    ///
    /// `context_idx` is passed on as given; no validation happens here.
    ///
    /// # Errors
    ///
    /// Returns whatever error `xpath::eval_expr_with_context` reports.
    pub fn eval_expr_from(
        &self,
        expr_str: &str,
        context_idx: usize,
    ) -> Result<xpath::StandaloneResult> {
        let context = xpath::XPathNode::Element(context_idx);
        xpath::eval_expr_with_context(self, expr_str, context)
    }

    /// Evaluates `xpath_expr` with the element at `context_idx` as context node and
    /// returns the matched nodes.
    ///
    /// `context_idx` is passed on as given; no validation happens here.
    ///
    /// # Errors
    ///
    /// Returns the parse error for a malformed expression, or the evaluator's error.
    pub fn xpath_from(
        &self,
        xpath_expr: &str,
        context_idx: usize,
    ) -> Result<Vec<xpath::XPathNode>> {
        let parsed = xpath::parse_xpath(xpath_expr)?;
        xpath::evaluate_from_context(self, &parsed, xpath::XPathNode::Element(context_idx))
    }
}