#[doc(hidden)]
pub mod lazy;
#[doc(hidden)]
pub mod structural;
pub(crate) mod tags;
pub struct XmlIndex<'a> {
pub(crate) input: &'a [u8],
pub tag_starts: Vec<u64>,
pub tag_ends: Vec<u64>,
pub(crate) tag_types: Vec<TagType>,
pub(crate) tag_names: Vec<(u64, u16)>,
pub(crate) depths: Vec<u16>,
pub parents: Vec<u32>,
pub text_ranges: Vec<TextRange>,
pub(crate) child_offsets: Vec<u32>,
pub(crate) child_data: Vec<u32>,
pub(crate) text_child_offsets: Vec<u32>,
pub(crate) text_child_data: Vec<u32>,
pub(crate) close_map: Vec<u32>,
pub(crate) post_order: Vec<u32>,
pub name_ids: Vec<u16>,
pub name_table: Vec<(u64, u16)>,
pub name_posting: Vec<Vec<u32>>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum TagType {
Open = 0,
Close = 1,
SelfClose = 2,
Comment = 3,
CData = 4,
PI = 5,
}
impl TagType {
pub fn from_u8(v: u8) -> Option<Self> {
match v {
0 => Some(TagType::Open),
1 => Some(TagType::Close),
2 => Some(TagType::SelfClose),
3 => Some(TagType::Comment),
4 => Some(TagType::CData),
5 => Some(TagType::PI),
_ => None,
}
}
}
pub(crate) struct NameInterner<'a> {
input: &'a [u8],
table: Vec<(u64, u16)>,
map: Option<std::collections::HashMap<u64, u16>>,
}
impl<'a> NameInterner<'a> {
pub fn new(input: &'a [u8]) -> Self {
Self { input, table: Vec::with_capacity(64), map: None }
}
#[inline]
fn fnv1a(bytes: &[u8]) -> u64 {
let mut h: u64 = 0xcbf29ce484222325;
for &b in bytes { h ^= b as u64; h = h.wrapping_mul(0x100000001b3); }
h
}
#[inline]
pub fn intern(&mut self, name_bytes: &[u8], offset: u64, len: u16) -> u16 {
if let Some(ref map) = self.map {
let hash = Self::fnv1a(name_bytes);
if let Some(&id) = map.get(&hash) {
let (off, l) = self.table[id as usize];
if l == len && &self.input[off as usize..off as usize + l as usize] == name_bytes {
return id;
}
for (i, &(off2, l2)) in self.table.iter().enumerate() {
if l2 == len && &self.input[off2 as usize..off2 as usize + l2 as usize] == name_bytes {
return i as u16;
}
}
let new_id = self.table.len().min(u16::MAX as usize) as u16;
self.table.push((offset, len));
return new_id;
}
let id = self.table.len().min(u16::MAX as usize) as u16;
self.table.push((offset, len));
self.map.as_mut().unwrap().insert(hash, id);
return id;
}
for (id, &(off, l)) in self.table.iter().enumerate() {
if l == len && &self.input[off as usize..off as usize + l as usize] == name_bytes {
return id as u16;
}
}
let id = self.table.len() as u16;
self.table.push((offset, len));
if self.table.len() == 256 {
let mut map = std::collections::HashMap::with_capacity(512);
for (i, &(off, l)) in self.table.iter().enumerate() {
let bytes = &self.input[off as usize..off as usize + l as usize];
map.insert(Self::fnv1a(bytes), i as u16);
}
self.map = Some(map);
}
id
}
pub fn into_table(self) -> Vec<(u64, u16)> {
self.table
}
}
#[derive(Debug, Clone, Copy)]
#[repr(C)]
pub struct TextRange {
pub start: u64,
pub end: u64,
pub parent_tag: u32,
}
impl<'a> XmlIndex<'a> {
pub fn ensure_indices(&mut self) {
if !self.has_indices() && self.tag_count() >= 1 {
self.build_indices();
}
}
pub(crate) fn build_indices(&mut self) {
let n = self.tag_count();
let need_close_map = self.close_map.is_empty();
if n < 10_000 {
let (co, cd) = build_csr_children(&self.tag_types, &self.parents, n);
let (tco, tcd) = build_csr_text_children(&self.text_ranges, n);
self.child_offsets = co;
self.child_data = cd;
self.text_child_offsets = tco;
self.text_child_data = tcd;
if need_close_map {
let (cm, po) = build_close_map_and_post_order(&self.tag_types, n);
self.close_map = cm;
self.post_order = po;
}
return;
}
let tag_types = &self.tag_types;
let parents = &self.parents;
let text_ranges = &self.text_ranges;
let (child_offsets, child_data, text_child_offsets, text_child_data,
close_map, post_order) = std::thread::scope(|scope| {
let csr_children = scope.spawn(move || {
build_csr_children(tag_types, parents, n)
});
let csr_text = scope.spawn(move || {
build_csr_text_children(text_ranges, n)
});
let (cm, po) = if need_close_map {
build_close_map_and_post_order(tag_types, n)
} else {
(Vec::new(), Vec::new())
};
let (co, cd) = csr_children.join().unwrap();
let (tco, tcd) = csr_text.join().unwrap();
(co, cd, tco, tcd, cm, po)
});
self.child_offsets = child_offsets;
self.child_data = child_data;
self.text_child_offsets = text_child_offsets;
self.text_child_data = text_child_data;
if need_close_map {
self.close_map = close_map;
self.post_order = post_order;
}
}
#[inline]
pub(crate) fn child_tag_slice(&self, parent_idx: usize) -> &[u32] {
if self.child_offsets.len() < 2 || parent_idx + 1 >= self.child_offsets.len() {
return &[];
}
let start = self.child_offsets[parent_idx] as usize;
let end = self.child_offsets[parent_idx + 1] as usize;
&self.child_data[start..end]
}
#[inline]
pub fn child_text_slice(&self, parent_idx: usize) -> &[u32] {
if self.text_child_offsets.len() < 2 || parent_idx + 1 >= self.text_child_offsets.len() {
return &[];
}
let start = self.text_child_offsets[parent_idx] as usize;
let end = self.text_child_offsets[parent_idx + 1] as usize;
&self.text_child_data[start..end]
}
#[inline]
pub(crate) fn has_indices(&self) -> bool {
!self.child_offsets.is_empty()
}
#[inline]
pub(crate) fn is_ancestor(&self, ancestor_idx: usize, descendant_idx: usize) -> bool {
if self.post_order.is_empty() { return false; }
ancestor_idx < descendant_idx
&& self.post_order[ancestor_idx] > self.post_order[descendant_idx]
}
pub fn build_name_index(&mut self) {
if !self.name_posting.is_empty() { return; }
let n = self.tag_count();
let mut interner = NameInterner::new(self.input);
self.name_ids = Vec::with_capacity(n);
for i in 0..n {
let (off, len) = self.tag_names[i];
if len > 0 {
let name_bytes = &self.input[off as usize..off as usize + len as usize];
self.name_ids.push(interner.intern(name_bytes, off, len));
} else {
self.name_ids.push(u16::MAX);
}
}
self.name_table = interner.into_table();
let num_names = self.name_table.len();
let mut posting: Vec<Vec<u32>> = vec![Vec::new(); num_names];
for i in 0..n {
let nid = self.name_ids[i];
if nid != u16::MAX && (nid as usize) < num_names {
let tt = self.tag_types[i];
if tt == TagType::Open || tt == TagType::SelfClose {
posting[nid as usize].push(i as u32);
}
}
}
self.name_posting = posting;
}
#[inline]
pub fn name_id(&self, name: &str) -> Option<u16> {
let name_bytes = name.as_bytes();
for (id, &(off, len)) in self.name_table.iter().enumerate() {
if len as usize == name_bytes.len()
&& &self.input[off as usize..off as usize + len as usize] == name_bytes
{
return Some(id as u16);
}
}
None
}
#[inline]
pub fn tags_by_name(&self, name: &str) -> &[u32] {
if let Some(id) = self.name_id(name) {
if (id as usize) < self.name_posting.len() {
return &self.name_posting[id as usize];
}
}
&[]
}
#[inline(always)]
pub fn tag_name_eq(&self, tag_idx: usize, name: &str) -> bool {
if tag_idx >= self.tag_names.len() { return false; }
let (off, len) = self.tag_names[tag_idx];
let name_bytes = name.as_bytes();
if name_bytes.len() != len as usize { return false; }
&self.input[off as usize..off as usize + len as usize] == name_bytes
}
#[inline]
pub fn tag_name(&self, tag_idx: usize) -> &'a str {
if tag_idx >= self.tag_names.len() {
return "";
}
let (offset, len) = self.tag_names[tag_idx];
let bytes = &self.input[offset as usize..(offset + len as u64) as usize];
unsafe { std::str::from_utf8_unchecked(bytes) }
}
#[inline]
pub fn text_by_index(&self, text_idx: usize) -> &'a str {
self.text_content(&self.text_ranges[text_idx])
}
#[inline]
pub fn text_content(&self, range: &TextRange) -> &'a str {
let bytes = &self.input[range.start as usize..range.end as usize];
unsafe { std::str::from_utf8_unchecked(bytes) }
}
pub fn decode_entities(s: &str) -> std::borrow::Cow<'_, str> {
if !s.contains('&') {
return std::borrow::Cow::Borrowed(s);
}
let mut result = String::with_capacity(s.len());
let mut chars = s.chars().peekable();
while let Some(c) = chars.next() {
if c == '&' {
let mut entity = String::new();
for ec in chars.by_ref() {
if ec == ';' { break; }
entity.push(ec);
}
match entity.as_str() {
"amp" => result.push('&'),
"lt" => result.push('<'),
"gt" => result.push('>'),
"apos" => result.push('\''),
"quot" => result.push('"'),
e if e.starts_with('#') => {
let num = &e[1..];
let code = if let Some(hex) = num.strip_prefix('x') {
u32::from_str_radix(hex, 16).ok()
} else {
num.parse::<u32>().ok()
};
if let Some(ch) = code.and_then(char::from_u32) {
result.push(ch);
}
}
_ => { result.push('&'); result.push_str(&entity); result.push(';'); }
}
} else {
result.push(c);
}
}
std::borrow::Cow::Owned(result)
}
pub fn tag_count(&self) -> usize {
self.tag_starts.len()
}
pub fn text_count(&self) -> usize {
self.text_ranges.len()
}
#[inline]
pub fn tag_type(&self, idx: usize) -> TagType {
self.tag_types[idx]
}
#[inline]
pub fn depth(&self, idx: usize) -> u16 {
self.depths[idx]
}
pub fn max_depth(&self) -> u16 {
self.depths.iter().copied().max().unwrap_or(0)
}
pub fn matching_close(&self, open_idx: usize) -> Option<usize> {
if open_idx >= self.tag_count() {
return None;
}
if !self.close_map.is_empty() {
let close = self.close_map[open_idx];
return if close != u32::MAX { Some(close as usize) } else { None };
}
if self.tag_types[open_idx] == TagType::SelfClose {
return Some(open_idx);
}
if self.tag_types[open_idx] != TagType::Open {
return None;
}
let depth = self.depths[open_idx];
let name = self.tag_name(open_idx);
for i in (open_idx + 1)..self.tag_count() {
if self.tag_types[i] == TagType::Close
&& self.depths[i] == depth
&& self.tag_name(i) == name
{
return Some(i);
}
}
None
}
pub fn children(&self, parent_idx: usize) -> Vec<usize> {
if self.has_indices() {
self.child_tag_slice(parent_idx).iter().map(|&i| i as usize).collect()
} else {
(0..self.tag_count())
.filter(|&i| self.parents[i] == parent_idx as u32
&& (self.tag_types[i] == TagType::Open || self.tag_types[i] == TagType::SelfClose))
.collect()
}
}
pub fn direct_text(&self, tag_idx: usize) -> Vec<&'a str> {
if self.has_indices() {
self.child_text_slice(tag_idx).iter()
.map(|&ti| self.text_content(&self.text_ranges[ti as usize]))
.collect()
} else {
self.text_ranges.iter()
.filter(|r| r.parent_tag == tag_idx as u32)
.map(|r| self.text_content(r))
.collect()
}
}
pub fn raw_xml(&self, tag_idx: usize) -> &'a str {
let start = self.tag_starts[tag_idx] as usize;
if self.tag_types[tag_idx] == TagType::SelfClose {
let end = self.tag_ends[tag_idx] as usize + 1;
return unsafe { std::str::from_utf8_unchecked(&self.input[start..end]) };
}
if let Some(close_idx) = self.matching_close(tag_idx) {
let end = self.tag_ends[close_idx] as usize + 1;
return unsafe { std::str::from_utf8_unchecked(&self.input[start..end]) };
}
let end = self.tag_ends[tag_idx] as usize + 1;
unsafe { std::str::from_utf8_unchecked(&self.input[start..end]) }
}
pub fn raw_tag(&self, tag_idx: usize) -> &'a str {
let start = self.tag_starts[tag_idx] as usize;
let end = self.tag_ends[tag_idx] as usize + 1;
unsafe { std::str::from_utf8_unchecked(&self.input[start..end]) }
}
pub fn all_text(&self, tag_idx: usize) -> String {
let close_idx = self.matching_close(tag_idx).unwrap_or(tag_idx);
let tag_start = self.tag_starts[tag_idx];
let tag_end = if close_idx == tag_idx {
self.tag_ends[tag_idx]
} else {
self.tag_starts[close_idx]
};
let mut result = String::new();
let start_idx = self.text_ranges
.partition_point(|r| r.start < tag_start);
for range in &self.text_ranges[start_idx..] {
if range.start > tag_end { break; }
if range.end <= tag_end {
result.push_str(self.text_content(range));
}
}
result
}
pub fn parent(&self, tag_idx: usize) -> Option<usize> {
if tag_idx >= self.parents.len() {
return None;
}
let p = self.parents[tag_idx];
if p == u32::MAX { None } else { Some(p as usize) }
}
pub fn child_position(&self, tag_idx: usize) -> Option<usize> {
let parent = self.parent(tag_idx)?;
if self.has_indices() {
let slice = self.child_tag_slice(parent);
slice.iter().position(|&c| c as usize == tag_idx)
} else {
let children = self.children(parent);
children.iter().position(|&c| c == tag_idx)
}
}
pub fn child_slice(&self, parent_idx: usize) -> &[u32] {
self.child_tag_slice(parent_idx)
}
pub fn child_count(&self, parent_idx: usize) -> usize {
if self.has_indices() {
self.child_tag_slice(parent_idx).len()
} else {
self.children(parent_idx).len()
}
}
pub fn child_at(&self, parent_idx: usize, pos: usize) -> Option<usize> {
if self.has_indices() {
let slice = self.child_tag_slice(parent_idx);
slice.get(pos).map(|&c| c as usize)
} else {
let children = self.children(parent_idx);
children.get(pos).copied()
}
}
pub fn direct_text_first(&self, tag_idx: usize) -> Option<&'a str> {
if self.has_indices() {
let text_children = self.child_text_slice(tag_idx);
if let Some(&first_idx) = text_children.first() {
let range = &self.text_ranges[first_idx as usize];
let text = self.text_content(range);
if !text.is_empty() {
return Some(text);
}
}
None
} else {
for range in &self.text_ranges {
if range.parent_tag == tag_idx as u32 {
let text = self.text_content(range);
if !text.is_empty() {
return Some(text);
}
}
}
None
}
}
pub fn tail_text(&self, tag_idx: usize) -> Option<&'a str> {
let parent = self.parent(tag_idx)?;
let close_idx = self.matching_close(tag_idx).unwrap_or(tag_idx);
let close_end = self.tag_ends[close_idx] + 1;
let start_idx = self
.text_ranges
.partition_point(|r| r.start < close_end);
for range in &self.text_ranges[start_idx..] {
if range.start > close_end {
break; }
if range.start == close_end && range.parent_tag == parent as u32 {
let text = self.text_content(range);
if !text.is_empty() {
return Some(text);
}
}
}
None
}
pub fn itertext_collect(&self, tag_idx: usize) -> Vec<&'a str> {
let start_offset = self.tag_starts[tag_idx];
let close_idx = self.matching_close(tag_idx).unwrap_or(tag_idx);
let end_offset = self.tag_ends[close_idx] + 1;
let first = self.text_ranges.partition_point(|r| r.start < start_offset);
let mut result = Vec::new();
for range in &self.text_ranges[first..] {
if range.start >= end_offset {
break;
}
let text = self.text_content(range);
if !text.is_empty() {
result.push(text);
}
}
result
}
pub fn canonicalize(&self, tag_idx: usize) -> String {
let mut out = String::new();
self.c14n_element(tag_idx, &mut out);
out
}
fn c14n_element(&self, tag_idx: usize, out: &mut String) {
let tag = self.tag_name(tag_idx);
out.push('<');
out.push_str(tag);
let mut attrs = self.get_all_attribute_names(tag_idx)
.into_iter()
.filter_map(|name| {
self.get_attribute(tag_idx, name).map(|val| (name, val))
})
.collect::<Vec<_>>();
attrs.sort_by_key(|&(name, _)| name);
for (name, val) in &attrs {
out.push(' ');
out.push_str(name);
out.push_str("=\"");
c14n_escape_attr(val, out);
out.push('"');
}
out.push('>');
if let Some(text) = self.direct_text_first(tag_idx) {
c14n_escape_text(text, out);
}
for &child in self.child_slice(tag_idx) {
let child = child as usize;
self.c14n_element(child, out);
if let Some(tail) = self.tail_text(child) {
c14n_escape_text(tail, out);
}
}
out.push_str("</");
out.push_str(tag);
out.push('>');
}
}
fn c14n_escape_text(text: &str, out: &mut String) {
for c in text.chars() {
match c {
'&' => out.push_str("&"),
'<' => out.push_str("<"),
'>' => out.push_str(">"),
'\r' => out.push_str("
"),
_ => out.push(c),
}
}
}
fn c14n_escape_attr(text: &str, out: &mut String) {
for c in text.chars() {
match c {
'&' => out.push_str("&"),
'<' => out.push_str("<"),
'"' => out.push_str("""),
'\t' => out.push_str("	"),
'\n' => out.push_str("
"),
'\r' => out.push_str("
"),
_ => out.push(c),
}
}
}
pub(crate) fn build_csr_children(
tag_types: &[TagType],
parents: &[u32],
n: usize,
) -> (Vec<u32>, Vec<u32>) {
let mut child_counts = vec![0u32; n + 1];
for i in 0..n {
let tt = tag_types[i];
if tt == TagType::Close || tt == TagType::CData {
continue;
}
let parent = parents[i];
if parent != u32::MAX && (parent as usize) < n {
child_counts[parent as usize] += 1;
}
}
let mut child_offsets = vec![0u32; n + 1];
for i in 0..n {
child_offsets[i + 1] = child_offsets[i] + child_counts[i];
}
let total_children = child_offsets[n] as usize;
let mut child_data = vec![0u32; total_children];
let mut write_pos = child_offsets.clone();
for i in 0..n {
let tt = tag_types[i];
if tt == TagType::Close || tt == TagType::CData {
continue;
}
let parent = parents[i];
if parent != u32::MAX && (parent as usize) < n {
let p = parent as usize;
child_data[write_pos[p] as usize] = i as u32;
write_pos[p] += 1;
}
}
(child_offsets, child_data)
}
pub(crate) fn build_csr_text_children(
text_ranges: &[TextRange],
n: usize,
) -> (Vec<u32>, Vec<u32>) {
let mut text_counts = vec![0u32; n + 1];
for range in text_ranges {
let parent = range.parent_tag;
if parent != u32::MAX && (parent as usize) < n {
text_counts[parent as usize] += 1;
}
}
let mut text_child_offsets = vec![0u32; n + 1];
for i in 0..n {
text_child_offsets[i + 1] = text_child_offsets[i] + text_counts[i];
}
let total_text = text_child_offsets[n] as usize;
let mut text_child_data = vec![0u32; total_text];
let mut text_write_pos = text_child_offsets.clone();
for (ti, range) in text_ranges.iter().enumerate() {
let parent = range.parent_tag;
if parent != u32::MAX && (parent as usize) < n {
let p = parent as usize;
text_child_data[text_write_pos[p] as usize] = ti as u32;
text_write_pos[p] += 1;
}
}
(text_child_offsets, text_child_data)
}
fn build_close_map_and_post_order(
tag_types: &[TagType],
n: usize,
) -> (Vec<u32>, Vec<u32>) {
let mut close_map = vec![u32::MAX; n];
let mut post_order = vec![0u32; n];
let mut stack: Vec<usize> = Vec::new();
let mut post_counter: u32 = 0;
for i in 0..n {
match tag_types[i] {
TagType::Open => {
stack.push(i);
}
TagType::Close => {
if let Some(open_idx) = stack.pop() {
close_map[open_idx] = i as u32;
post_order[open_idx] = post_counter;
}
post_order[i] = post_counter;
post_counter += 1;
}
TagType::SelfClose => {
close_map[i] = i as u32;
post_order[i] = post_counter;
post_counter += 1;
}
TagType::Comment | TagType::PI | TagType::CData => {
post_order[i] = post_counter;
post_counter += 1;
}
}
}
(close_map, post_order)
}