pub(crate) mod lines;
mod resolver;
pub mod verifier;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use resolver::resolve_doc;
#[cfg(feature = "rayon")]
use rayon::prelude::*;
use regex::bytes::RegexBuilder;
use roaring::RoaringBitmap;
use crate::index::IndexSnapshot;
use crate::path::filter::{build_filter, matches_path_filter};
use crate::query::{literal_grams, route_query, GramQuery, QueryRoute};
use crate::{Config, IndexError, SearchMatch, SearchOptions};
pub(crate) const REGEX_SIZE_LIMIT: usize = 10 * 1024 * 1024;
use verifier::{verify_literal, verify_regex};
pub fn search(
snap: Arc<IndexSnapshot>,
config: &Config,
canonical_root: &std::path::Path,
pattern: &str,
opts: &SearchOptions,
) -> Result<Vec<SearchMatch>, IndexError> {
let route = route_query(pattern, opts.case_insensitive).map_err(IndexError::InvalidPattern)?;
let compiled_re = match &route {
QueryRoute::Literal => None,
_ => {
let re = RegexBuilder::new(pattern)
.case_insensitive(opts.case_insensitive)
.size_limit(REGEX_SIZE_LIMIT)
.dfa_size_limit(REGEX_SIZE_LIMIT)
.build()
.map_err(|e| IndexError::InvalidPattern(e.to_string()))?;
Some(re)
}
};
let candidates: Vec<u32> = match &route {
QueryRoute::Literal => match literal_grams(pattern) {
Some(hashes) => {
if should_use_index(&hashes, &snap)? {
let indexed = execute_query(&GramQuery::Grams(hashes), &snap)?;
if indexed.is_empty() {
all_doc_ids(&snap)
} else {
indexed
}
} else {
all_doc_ids(&snap)
}
}
None => all_doc_ids(&snap),
},
QueryRoute::IndexedRegex(query) => {
let indexed = execute_query(query, &snap)?;
if indexed.is_empty() {
all_doc_ids(&snap)
} else {
indexed
}
}
_ => all_doc_ids(&snap),
};
if std::env::var_os("SYNTEXT_LOG_SELECTIVITY").is_some() {
let total = snap.all_doc_ids().len() as usize;
let pct = if total > 0 {
candidates.len() as f64 / total as f64 * 100.0
} else {
0.0
};
eprintln!(
"selectivity: {:.2}% ({}/{}) route={:?} pattern={:?}",
pct,
candidates.len(),
total,
route,
pattern
);
}
let path_filter_bitmap = build_filter(
&snap.path_index,
opts.file_type.as_deref(),
opts.exclude_type.as_deref(),
opts.path_filter.as_deref(),
);
let match_count = AtomicUsize::new(0);
let do_match = |&global_id: &u32| -> Option<Vec<SearchMatch>> {
if let Some(limit) = opts.max_results {
if match_count.load(Ordering::Relaxed) >= limit {
return None;
}
}
let (rel_path, content) = resolve_doc(
&snap,
global_id,
canonical_root,
config.max_file_size,
config.verbose,
)?;
if let Some(ref pf) = path_filter_bitmap {
let file_id_opt = if (global_id as usize) < snap.base_doc_to_file_id.len() {
snap.base_doc_to_file_id
.get(global_id as usize)
.copied()
.filter(|&fid| fid != u32::MAX)
} else {
snap.overlay_doc_to_file_id.get(&global_id).copied()
};
if let Some(file_id) = file_id_opt {
if !pf.file_ids.contains(file_id) {
return None;
}
} else {
if !matches_path_filter(
&rel_path,
opts.file_type.as_deref(),
opts.exclude_type.as_deref(),
opts.path_filter.as_deref(),
) {
return None;
}
}
}
let file_path = rel_path.as_path();
let file_matches = match &route {
QueryRoute::Literal => verify_literal(pattern, file_path, &content),
_ => verify_regex(compiled_re.as_ref().unwrap(), file_path, &content),
};
if let Some(_limit) = opts.max_results {
if !file_matches.is_empty() {
match_count.fetch_add(file_matches.len(), Ordering::Relaxed);
}
}
Some(file_matches)
};
#[cfg(feature = "rayon")]
let all_matches: Vec<SearchMatch> = candidates
.par_iter()
.filter_map(do_match)
.flatten()
.collect();
#[cfg(not(feature = "rayon"))]
let all_matches: Vec<SearchMatch> = candidates.iter().filter_map(do_match).flatten().collect();
let mut matches = sort_matches(all_matches);
if let Some(max) = opts.max_results {
matches.truncate(max);
}
Ok(matches)
}
fn sort_matches(mut matches: Vec<SearchMatch>) -> Vec<SearchMatch> {
matches.sort_unstable_by(|a, b| {
a.path
.cmp(&b.path)
.then_with(|| a.line_number.cmp(&b.line_number))
});
matches
}
fn all_doc_ids(snap: &IndexSnapshot) -> Vec<u32> {
snap.all_doc_ids().iter().collect()
}
fn should_use_index(hashes: &[u64], snap: &IndexSnapshot) -> Result<bool, IndexError> {
if hashes.is_empty() {
return Ok(false);
}
let total_docs = snap.all_doc_ids().len();
if total_docs == 0 {
return Ok(false);
}
let mut ordered = hashes.to_vec();
ordered.sort_unstable_by_key(|&hash| gram_cardinality(hash, snap));
let smallest = ordered
.first()
.map(|&hash| gram_cardinality(hash, snap))
.unwrap_or(0);
if is_selective_enough(u64::from(smallest), total_docs, snap.scan_threshold) {
return Ok(true);
}
if ordered.len() == 1 {
return Ok(false);
}
let first = posting_bitmap(ordered[0], snap)?;
let second = posting_bitmap(ordered[1], snap)?;
let mut acc: RoaringBitmap = first.as_ref() & second.as_ref();
if acc.is_empty() || is_selective_enough(acc.len(), total_docs, snap.scan_threshold) {
return Ok(true);
}
if ordered.len() > 2 {
let third = posting_bitmap(ordered[2], snap)?;
acc &= third.as_ref();
if acc.is_empty() || is_selective_enough(acc.len(), total_docs, snap.scan_threshold) {
return Ok(true);
}
}
Ok(false)
}
fn execute_query(query: &GramQuery, snap: &IndexSnapshot) -> Result<Vec<u32>, IndexError> {
Ok(execute_query_bitmap(query, snap)?.iter().collect())
}
fn execute_query_bitmap(
query: &GramQuery,
snap: &IndexSnapshot,
) -> Result<RoaringBitmap, IndexError> {
match query {
GramQuery::And(children) => {
let mut ordered: Vec<_> = children.iter().collect();
ordered.sort_unstable_by_key(|child| query_cardinality_upper_bound(child, snap));
let mut iter = ordered.into_iter();
let Some(first) = iter.next() else {
return Ok(snap.all_doc_ids().clone());
};
let mut acc = execute_query_bitmap(first, snap)?;
for child in iter {
let child_bitmap = execute_query_bitmap(child, snap)?;
acc &= &child_bitmap;
if acc.is_empty() {
break;
}
}
Ok(acc)
}
GramQuery::Or(children) => {
let mut acc = RoaringBitmap::new();
for child in children {
let child_bitmap = execute_query_bitmap(child, snap)?;
acc |= &child_bitmap;
}
Ok(acc)
}
GramQuery::Grams(hashes) => {
let mut ordered = hashes.to_vec();
ordered.sort_unstable_by_key(|&hash| gram_cardinality(hash, snap));
let mut iter = ordered.into_iter();
let Some(first) = iter.next() else {
return Ok(snap.all_doc_ids().clone());
};
let first_bm = posting_bitmap(first, snap)?;
let mut acc: RoaringBitmap = if let Some(second) = iter.next() {
let second_bm = posting_bitmap(second, snap)?;
first_bm.as_ref() & second_bm.as_ref()
} else {
first_bm.as_ref().clone()
};
for hash in iter {
if acc.is_empty() {
break;
}
let postings = posting_bitmap(hash, snap)?;
acc &= postings.as_ref();
}
Ok(acc)
}
GramQuery::All => Ok(snap.all_doc_ids().clone()),
GramQuery::None => Ok(RoaringBitmap::new()),
}
}
fn gram_cardinality(gram_hash: u64, snap: &IndexSnapshot) -> u32 {
let base_total: u32 = snap
.base_segments()
.iter()
.filter_map(|seg| seg.gram_cardinality(gram_hash))
.sum();
let overlay_total = snap
.overlay
.gram_index
.get(&gram_hash)
.map_or(0, |ids| ids.len() as u32);
base_total.saturating_add(overlay_total)
}
fn query_cardinality_upper_bound(query: &GramQuery, snap: &IndexSnapshot) -> u32 {
let total_docs = snap.all_doc_ids().len() as u32;
match query {
GramQuery::And(children) => children
.iter()
.map(|child| query_cardinality_upper_bound(child, snap))
.min()
.unwrap_or(total_docs),
GramQuery::Or(children) => children
.iter()
.fold(0u32, |acc, child| {
acc.saturating_add(query_cardinality_upper_bound(child, snap))
})
.min(total_docs),
GramQuery::Grams(hashes) => hashes
.iter()
.map(|&hash| gram_cardinality(hash, snap))
.min()
.unwrap_or(total_docs),
GramQuery::All => total_docs,
GramQuery::None => 0,
}
}
fn is_selective_enough(candidate_count: u64, total_docs: u64, threshold: f64) -> bool {
(candidate_count as f64) <= (total_docs as f64) * threshold
}
fn posting_bitmap(gram_hash: u64, snap: &IndexSnapshot) -> Result<Arc<RoaringBitmap>, IndexError> {
if let Some(bitmap) = snap.cached_posting_bitmap(gram_hash) {
return Ok(bitmap);
}
let mut bitmap = RoaringBitmap::new();
for seg in snap.base_segments() {
if let Some(postings) = seg.lookup_gram(gram_hash) {
let ids = postings
.to_vec()
.map_err(|err| IndexError::CorruptIndex(err.to_string()))?;
bitmap.extend(ids);
}
}
if let Some(ids) = snap.overlay.gram_index.get(&gram_hash) {
bitmap.extend(ids.iter().copied());
}
bitmap -= &snap.delete_set;
Ok(snap.store_posting_bitmap(gram_hash, Arc::new(bitmap)))
}
#[cfg(test)]
mod tests;