use crate::error::Result;
use crate::xpath::CompiledXPath;
/// Evaluates `xpath` against every document in `docs` and returns the matched
/// text values, one inner vector per document, in input order.
///
/// Each document is parsed with `crate::parse` before evaluation.
///
/// # Errors
///
/// Returns the first parse or evaluation error encountered; documents after
/// the failing one are not processed.
pub fn eval_batch_text(
    docs: &[&[u8]],
    xpath: &CompiledXPath,
) -> Result<Vec<Vec<String>>> {
    docs.iter()
        .map(|&bytes| -> Result<Vec<String>> {
            let parsed = crate::parse(bytes)?;
            let matches = xpath.eval_text(&parsed)?;
            // Copy the matches into owned strings so they outlive `parsed`.
            let owned: Vec<String> = matches.into_iter().map(|text| text.to_string()).collect();
            Ok(owned)
        })
        .collect()
}
/// Same contract as `eval_batch_text`, but chooses the parser per query.
///
/// When `xpath.interesting_names()` yields a set of names, every document is
/// parsed with `crate::index::lazy::parse_for_query` using those names;
/// otherwise the regular `crate::parse` is used.
///
/// # Errors
///
/// Returns the first parse or evaluation error encountered; documents after
/// the failing one are not processed.
pub fn eval_batch_text_lazy(
    docs: &[&[u8]],
    xpath: &CompiledXPath,
) -> Result<Vec<Vec<String>>> {
    let wanted = xpath.interesting_names();
    docs.iter()
        .map(|&bytes| -> Result<Vec<String>> {
            let parsed = if let Some(names) = &wanted {
                crate::index::lazy::parse_for_query(bytes, names)?
            } else {
                crate::parse(bytes)?
            };
            let matches = xpath.eval_text(&parsed)?;
            // Copy the matches into owned strings so they outlive `parsed`.
            let owned: Vec<String> = matches.into_iter().map(|text| text.to_string()).collect();
            Ok(owned)
        })
        .collect()
}
/// Evaluates `xpath` against every document in `docs`, using a tag bloom
/// filter to skip documents that cannot match.
///
/// When `xpath.interesting_names()` yields a non-empty set of names, each
/// document is first pre-scanned with `crate::bloom::TagBloom::from_prescan`.
/// If the filter reports that none of the names may be present, the document
/// gets an empty result and is never parsed. As a consequence, a malformed
/// document that is skipped this way does not produce an error.
///
/// Documents that pass the filter (or all documents, when there are no names
/// to filter on) are handled exactly like in `eval_batch_text_lazy`.
///
/// Returns one inner vector of matched text per document, in input order.
///
/// # Errors
///
/// Returns the first parse or evaluation error encountered; documents after
/// the failing one are not processed.
pub fn eval_batch_text_bloom(
    docs: &[&[u8]],
    xpath: &CompiledXPath,
) -> Result<Vec<Vec<String>>> {
    let interesting = xpath.interesting_names();
    let target_names: Vec<Vec<u8>> = interesting
        .as_ref()
        .map(|names| names.iter().map(|n| n.as_bytes().to_vec()).collect())
        .unwrap_or_default();
    // The slice view of the target names is identical for every document, so
    // build it once instead of allocating it inside the loop.
    let target_refs: Vec<&[u8]> = target_names.iter().map(|n| n.as_slice()).collect();
    let use_bloom = !target_refs.is_empty();

    let mut all_results = Vec::with_capacity(docs.len());
    for &doc in docs {
        if use_bloom {
            let bloom = crate::bloom::TagBloom::from_prescan(doc);
            if !bloom.may_contain_any(&target_refs) {
                // Filter says no target tag can be present: skip parsing.
                all_results.push(Vec::new());
                continue;
            }
        }
        let index = match &interesting {
            Some(names) => crate::index::lazy::parse_for_query(doc, names)?,
            None => crate::parse(doc)?,
        };
        let texts: Vec<String> = xpath
            .eval_text(&index)?
            .into_iter()
            .map(|s| s.to_string())
            .collect();
        all_results.push(texts);
    }
    Ok(all_results)
}
/// Counts the nodes selected by `xpath` in every document of `docs`.
///
/// Each document is parsed with `crate::parse` and evaluated with
/// `xpath.eval`; the returned vector holds one count per document, in input
/// order.
///
/// # Errors
///
/// Returns the first parse or evaluation error encountered; documents after
/// the failing one are not processed.
pub fn count_batch(
    docs: &[&[u8]],
    xpath: &CompiledXPath,
) -> Result<Vec<usize>> {
    docs.iter()
        .map(|&bytes| -> Result<usize> {
            let parsed = crate::parse(bytes)?;
            let selected = xpath.eval(&parsed)?;
            Ok(selected.len())
        })
        .collect()
}
/// Documents at least this many bytes long are parsed with
/// `crate::parallel::parse_parallel` instead of the single-threaded parsers.
const LARGE_DOC_THRESHOLD: usize = 256 * 1024;

/// Evaluates `xpath` against every document in `docs` using a bounded pool of
/// worker threads, returning the matched text per document in input order.
///
/// At most `max_threads` worker threads are started (a value of `0` is treated
/// as `1`, and never more workers than documents). Workers pull document
/// indices from a shared counter, so the number of OS threads does not grow
/// with the size of the batch.
///
/// Per document:
/// * if `xpath.interesting_names()` yields a non-empty set of names, the
///   document is pre-scanned with `crate::bloom::TagBloom`; when none of the
///   names may be present the result is empty and the document is not parsed;
/// * documents of at least `LARGE_DOC_THRESHOLD` bytes are parsed with
///   `crate::parallel::parse_parallel`, using
///   `max(2, max_threads / worker_count)` threads. NOTE(review): the floor of
///   two means a large document can push the total thread count above
///   `max_threads`; this is kept from the previous implementation because the
///   contract of `parse_parallel` for a single thread is not visible here;
/// * smaller documents use `crate::index::lazy::parse_for_query` when names
///   are available and `crate::parse` otherwise.
///
/// # Errors
///
/// Every document is processed; if any of them failed, the error of the
/// failing document with the lowest index is returned.
///
/// # Panics
///
/// A panic inside a worker thread is re-raised on the calling thread.
pub fn eval_batch_parallel(
    docs: &[&[u8]],
    xpath: &CompiledXPath,
    max_threads: usize,
) -> Result<Vec<Vec<String>>> {
    if docs.is_empty() {
        return Ok(Vec::new());
    }
    let max_threads = max_threads.max(1);
    let interesting = xpath.interesting_names();
    let target_names: Vec<Vec<u8>> = interesting
        .as_ref()
        .map(|names| names.iter().map(|n| n.as_bytes().to_vec()).collect())
        .unwrap_or_default();
    // Shared, read-only slice view of the names; built once for all workers.
    let target_refs: Vec<&[u8]> = target_names.iter().map(|n| n.as_slice()).collect();
    let use_bloom = !target_refs.is_empty();

    let worker_count = max_threads.min(docs.len());
    let doc_threads = (max_threads / worker_count).max(2);

    // Full pipeline for a single document: bloom pre-check, parse, evaluate.
    let process = |doc: &[u8]| -> Result<Vec<String>> {
        if use_bloom {
            let bloom = crate::bloom::TagBloom::from_prescan(doc);
            if !bloom.may_contain_any(&target_refs) {
                return Ok(Vec::new());
            }
        }
        let index = if doc.len() >= LARGE_DOC_THRESHOLD {
            let mut idx = crate::parallel::parse_parallel(doc, doc_threads)?;
            idx.ensure_indices();
            idx
        } else {
            match &interesting {
                Some(names) => crate::index::lazy::parse_for_query(doc, names)?,
                None => crate::parse(doc)?,
            }
        };
        let texts: Vec<String> = xpath
            .eval_text(&index)?
            .into_iter()
            .map(|s| s.to_string())
            .collect();
        Ok(texts)
    };

    // Next document index to hand out. Relaxed ordering is sufficient: the
    // counter only distributes work, and results are published to the caller
    // through `join`, which synchronizes with the worker's completion.
    let next_doc = std::sync::atomic::AtomicUsize::new(0);

    let mut tagged: Vec<(usize, Result<Vec<String>>)> = std::thread::scope(|scope| {
        let handles: Vec<_> = (0..worker_count)
            .map(|_| {
                let process = &process;
                let next_doc = &next_doc;
                scope.spawn(move || {
                    let mut finished: Vec<(usize, Result<Vec<String>>)> = Vec::new();
                    loop {
                        let i = next_doc.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
                        let doc = match docs.get(i) {
                            Some(&doc) => doc,
                            None => break,
                        };
                        finished.push((i, process(doc)));
                    }
                    finished
                })
            })
            .collect();
        handles
            .into_iter()
            .flat_map(|handle| {
                handle
                    .join()
                    .unwrap_or_else(|payload| std::panic::resume_unwind(payload))
            })
            .collect()
    });

    // Workers finish in arbitrary order; restore input order before merging so
    // both the output and the reported error are deterministic.
    tagged.sort_unstable_by_key(|entry| entry.0);
    tagged.into_iter().map(|(_, outcome)| outcome).collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Compiles an expression the tests rely on being valid; panics otherwise.
    fn compiled(expr: &'static str) -> CompiledXPath {
        CompiledXPath::compile(expr).unwrap()
    }

    // A single-document batch yields a single result vector.
    #[test]
    fn batch_of_one() {
        let only: &[u8] = b"<root><claim>A device</claim></root>";
        let query = compiled("//claim");
        let got = eval_batch_text(&[only], &query).unwrap();
        assert_eq!(got, vec![vec!["A device"]]);
    }

    // Results line up with the input documents, including a non-matching one.
    #[test]
    fn batch_multiple_docs() {
        let inputs: [&[u8]; 3] = [
            b"<r><claim>First</claim></r>",
            b"<r><claim>Second</claim><claim>Third</claim></r>",
            b"<r><other>No claims</other></r>",
        ];
        let query = compiled("//claim");
        let got = eval_batch_text(&inputs, &query).unwrap();
        assert_eq!(got.len(), 3);
        assert_eq!(got[0], vec!["First"]);
        assert_eq!(got[1], vec!["Second", "Third"]);
        assert!(got[2].is_empty());
    }

    // The batch API agrees with parsing and evaluating each document by hand.
    #[test]
    fn batch_matches_individual() {
        let inputs: [&[u8]; 3] = [
            b"<r><a>1</a><b>2</b></r>",
            b"<r><a>3</a></r>",
            b"<r><b>4</b></r>",
        ];
        let query = compiled("//a");
        let batched = eval_batch_text(&inputs, &query).unwrap();
        for (i, (&raw, from_batch)) in inputs.iter().zip(batched.iter()).enumerate() {
            let parsed = crate::parse(raw).unwrap();
            let individual: Vec<String> = query
                .eval_text(&parsed)
                .unwrap()
                .into_iter()
                .map(|s| s.to_string())
                .collect();
            assert_eq!(&individual, from_batch, "doc {} mismatch", i);
        }
    }

    // The lazy-parse variant returns the same text as the full-parse variant.
    #[test]
    fn batch_lazy_matches_full() {
        let inputs: [&[u8]; 2] = [
            b"<r><claim>A</claim><other>skip</other></r>",
            b"<r><claim>B</claim></r>",
        ];
        let query = compiled("//claim");
        let via_full = eval_batch_text(&inputs, &query).unwrap();
        let via_lazy = eval_batch_text_lazy(&inputs, &query).unwrap();
        assert_eq!(via_full, via_lazy);
    }

    // A document without the queried tag yields an empty result.
    #[test]
    fn batch_bloom_skips_irrelevant() {
        let inputs: [&[u8]; 3] = [
            b"<r><claim>A</claim></r>",
            b"<r><other>no claims</other></r>",
            b"<r><claim>B</claim></r>",
        ];
        let query = compiled("//claim");
        let got = eval_batch_text_bloom(&inputs, &query).unwrap();
        assert_eq!(got.len(), 3);
        assert_eq!(got[0], vec!["A"]);
        assert!(got[1].is_empty());
        assert_eq!(got[2], vec!["B"]);
    }

    // An empty batch produces an empty result.
    #[test]
    fn batch_empty() {
        let inputs: [&[u8]; 0] = [];
        let query = compiled("//claim");
        let got = eval_batch_text(&inputs, &query).unwrap();
        assert!(got.is_empty());
    }

    // Attribute predicates are honoured per document.
    #[test]
    fn batch_predicate() {
        let inputs: [&[u8]; 2] = [
            br#"<r><claim type="independent">A</claim><claim type="dependent">B</claim></r>"#,
            br#"<r><claim type="dependent">C</claim></r>"#,
        ];
        let query = compiled("//claim[@type='independent']");
        let got = eval_batch_text(&inputs, &query).unwrap();
        assert_eq!(got[0], vec!["A"]);
        assert!(got[1].is_empty());
    }

    // Node counts are reported per document.
    #[test]
    fn count_batch_works() {
        let inputs: [&[u8]; 3] = [
            b"<r><a/><a/><b/></r>",
            b"<r><a/></r>",
            b"<r><b/></r>",
        ];
        let query = compiled("//a");
        let tallies = count_batch(&inputs, &query).unwrap();
        assert_eq!(tallies, vec![2, 1, 0]);
    }

    // When every document matches, the bloom variant equals the plain variant.
    #[test]
    fn batch_bloom_all_match() {
        let inputs: [&[u8]; 2] = [
            b"<r><claim>A</claim></r>",
            b"<r><claim>B</claim></r>",
        ];
        let query = compiled("//claim");
        let via_bloom = eval_batch_text_bloom(&inputs, &query).unwrap();
        let via_full = eval_batch_text(&inputs, &query).unwrap();
        assert_eq!(via_bloom, via_full);
    }

    // The parallel variant returns the same results, in the same order.
    #[test]
    fn parallel_batch_matches_sequential() {
        let inputs: [&[u8]; 3] = [
            b"<r><claim>A</claim><other>skip</other></r>",
            b"<r><claim>B</claim><claim>C</claim></r>",
            b"<r><other>no claims</other></r>",
        ];
        let query = compiled("//claim");
        let sequential = eval_batch_text(&inputs, &query).unwrap();
        let parallel = eval_batch_parallel(&inputs, &query, 4).unwrap();
        assert_eq!(sequential, parallel);
    }

    // A thread budget of one still processes every document.
    #[test]
    fn parallel_batch_single_thread() {
        let inputs: [&[u8]; 2] = [
            b"<r><a>1</a></r>",
            b"<r><a>2</a></r>",
        ];
        let query = compiled("//a");
        let got = eval_batch_parallel(&inputs, &query, 1).unwrap();
        assert_eq!(got, vec![vec!["1"], vec!["2"]]);
    }

    // An empty batch short-circuits to an empty result.
    #[test]
    fn parallel_batch_empty() {
        let inputs: [&[u8]; 0] = [];
        let query = compiled("//a");
        let got = eval_batch_parallel(&inputs, &query, 4).unwrap();
        assert!(got.is_empty());
    }
}