mod executor;
pub(crate) mod lines;
mod resolver;
pub mod verifier;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use resolver::resolve_doc;
#[cfg(feature = "rayon")]
use rayon::prelude::*;
use regex::bytes::RegexBuilder;
use roaring::RoaringBitmap;
use crate::index::IndexSnapshot;
use crate::path::filter::{build_filter, matches_path_filter};
use crate::query::{literal_grams, route_query, GramQuery, QueryRoute};
use crate::{Config, IndexError, SearchMatch, SearchOptions};
use executor::{execute_query, gram_cardinality, is_selective_enough, posting_bitmap};
/// Size cap (10 * 1024 * 1024, i.e. 10 MiB) handed to the regex builder.
///
/// `search` passes this value to both `size_limit` and `dfa_size_limit` when it
/// compiles a user-supplied pattern, so an oversized pattern is rejected as an
/// invalid pattern instead of consuming unbounded memory.
pub(crate) const REGEX_SIZE_LIMIT: usize = 10 * 1024 * 1024;
use verifier::{verify_literal, verify_regex};
/// Runs `pattern` against the snapshot and returns the verified matches.
///
/// The pipeline is:
/// 1. Route the pattern with `route_query` and, for every non-literal route,
///    compile it once into a byte regex bounded by [`REGEX_SIZE_LIMIT`].
/// 2. Build the candidate document list, either from the index
///    (`execute_query`) or, when the index is not used, from every document in
///    the snapshot.
/// 3. Verify each candidate (in parallel when the `rayon` feature is enabled),
///    applying the path/type filters from `opts` along the way.
/// 4. Sort the matches by path and line number and truncate to
///    `opts.max_results`.
///
/// Setting the `SYNTEXT_LOG_SELECTIVITY` environment variable prints the
/// candidate/total ratio to stderr.
///
/// # Errors
///
/// Returns `IndexError::InvalidPattern` when the pattern cannot be routed or
/// compiled, and propagates any error raised by `should_use_index` or
/// `execute_query`.
pub fn search(
    snap: Arc<IndexSnapshot>,
    config: &Config,
    canonical_root: &std::path::Path,
    pattern: &str,
    opts: &SearchOptions,
) -> Result<Vec<SearchMatch>, IndexError> {
    let route = route_query(pattern, opts.case_insensitive).map_err(IndexError::InvalidPattern)?;

    // Literal routes are verified with `verify_literal` and need no regex. Every
    // other route compiles the pattern once here so all candidates share it.
    let compiled_re = match &route {
        QueryRoute::Literal => None,
        _ => Some(
            RegexBuilder::new(pattern)
                .case_insensitive(opts.case_insensitive)
                .size_limit(REGEX_SIZE_LIMIT)
                .dfa_size_limit(REGEX_SIZE_LIMIT)
                .build()
                .map_err(|e| IndexError::InvalidPattern(e.to_string()))?,
        ),
    };

    // An empty result from the index is taken at face value, exactly as a
    // non-empty one is: both rely on the index not dropping matching documents.
    // Falling back to a full scan on "empty" would turn the cheapest possible
    // answer (no candidates) into the most expensive one, and would contradict
    // `should_use_index`, which deliberately selects the index when the posting
    // intersection is empty.
    let candidates: Vec<u32> = match &route {
        QueryRoute::Literal => match literal_grams(pattern) {
            Some(hashes) => {
                if should_use_index(&hashes, &snap)? {
                    execute_query(&GramQuery::Grams(hashes), &snap)?
                } else {
                    all_doc_ids(&snap)
                }
            }
            None => all_doc_ids(&snap),
        },
        QueryRoute::IndexedRegex(query) => execute_query(query, &snap)?,
        _ => all_doc_ids(&snap),
    };

    if std::env::var_os("SYNTEXT_LOG_SELECTIVITY").is_some() {
        let total = snap.all_doc_ids().len();
        let pct = if total > 0 {
            candidates.len() as f64 / total as f64 * 100.0
        } else {
            0.0
        };
        eprintln!(
            "selectivity: {:.2}% ({}/{}) route={:?} pattern={:?}",
            pct,
            candidates.len(),
            total,
            route,
            pattern
        );
    }

    let path_filter_bitmap = build_filter(
        &snap.path_index,
        opts.file_type.as_deref(),
        opts.exclude_type.as_deref(),
        opts.path_filter.as_deref(),
    );

    // Running total of matches found so far. It is only a best-effort cutoff for
    // skipping further candidates; the exact limit is enforced by the final
    // `truncate`, so relaxed ordering is sufficient.
    let match_count = AtomicUsize::new(0);

    let do_match = |&global_id: &u32| -> Option<Vec<SearchMatch>> {
        if let Some(limit) = opts.max_results {
            if match_count.load(Ordering::Relaxed) >= limit {
                return None;
            }
        }

        // Apply the file-id half of the path filter BEFORE resolving the
        // document: it needs only the doc id, so rejected documents are never
        // resolved. Documents without a known file id are re-checked by path
        // once `resolve_doc` has produced it.
        let needs_path_check = match &path_filter_bitmap {
            None => false,
            Some(pf) => {
                let file_id = match snap.base_doc_to_file_id.get(global_id as usize) {
                    // `u32::MAX` marks a base slot with no file id.
                    Some(&fid) => Some(fid).filter(|&fid| fid != u32::MAX),
                    // Ids past the base table are looked up in the overlay map.
                    None => snap.overlay_doc_to_file_id.get(&global_id).copied(),
                };
                match file_id {
                    Some(fid) => {
                        if !pf.file_ids.contains(fid) {
                            return None;
                        }
                        false
                    }
                    None => true,
                }
            }
        };

        let (rel_path, content) = resolve_doc(
            &snap,
            global_id,
            canonical_root,
            config.max_file_size,
            config.verbose,
        )?;

        if needs_path_check
            && !matches_path_filter(
                &rel_path,
                opts.file_type.as_deref(),
                opts.exclude_type.as_deref(),
                opts.path_filter.as_deref(),
            )
        {
            return None;
        }

        let file_path = rel_path.as_path();
        // `compiled_re` is `None` exactly for the literal route, so matching on
        // it selects the right verifier without an `unwrap`.
        let file_matches = match compiled_re.as_ref() {
            Some(re) => verify_regex(re, file_path, &content),
            None => verify_literal(pattern, file_path, &content),
        };

        if opts.max_results.is_some() && !file_matches.is_empty() {
            match_count.fetch_add(file_matches.len(), Ordering::Relaxed);
        }
        Some(file_matches)
    };

    #[cfg(feature = "rayon")]
    let all_matches: Vec<SearchMatch> = candidates
        .par_iter()
        .filter_map(do_match)
        .flatten()
        .collect();
    #[cfg(not(feature = "rayon"))]
    let all_matches: Vec<SearchMatch> = candidates.iter().filter_map(do_match).flatten().collect();

    let mut matches = sort_matches(all_matches);
    if let Some(max) = opts.max_results {
        matches.truncate(max);
    }
    Ok(matches)
}
/// Orders matches by file path, then by line number within each file, and
/// hands the vector back to the caller.
///
/// The sort is unstable: entries that share both path and line number may end
/// up in any relative order.
fn sort_matches(mut matches: Vec<SearchMatch>) -> Vec<SearchMatch> {
    // Comparing `(path, line)` tuples is the same lexicographic order as
    // comparing the path first and falling back to the line number on a tie.
    matches.sort_unstable_by(|lhs, rhs| {
        let left_key = (&lhs.path, &lhs.line_number);
        let right_key = (&rhs.path, &rhs.line_number);
        left_key.cmp(&right_key)
    });
    matches
}
/// Materialises every document id known to the snapshot as a `Vec<u32>`,
/// in the order the snapshot's id set iterates them.
fn all_doc_ids(snap: &IndexSnapshot) -> Vec<u32> {
    let id_set = snap.all_doc_ids();
    let mut ids = Vec::with_capacity(id_set.len() as usize);
    ids.extend(id_set.iter());
    ids
}
/// Decides whether a literal query should be answered from the gram index
/// (`Ok(true)`) or by scanning every document (`Ok(false)`).
///
/// The decision is made in stages, cheapest first:
/// 1. No grams, or an empty snapshot: do not use the index.
/// 2. If the rarest gram alone passes `is_selective_enough`, use the index.
/// 3. Otherwise intersect the posting lists of the two rarest grams, then the
///    third rarest if there is one. Use the index as soon as the intersection
///    is empty or passes `is_selective_enough`.
/// 4. If none of that is selective enough, do not use the index.
///
/// # Errors
///
/// Propagates any error returned by `posting_bitmap`.
fn should_use_index(hashes: &[u64], snap: &IndexSnapshot) -> Result<bool, IndexError> {
    if hashes.is_empty() {
        return Ok(false);
    }
    let total_docs = snap.all_doc_ids().len();
    if total_docs == 0 {
        return Ok(false);
    }

    // Look up each gram's cardinality exactly once and sort the pairs, instead
    // of re-querying it on every comparison made by the sort. Grams with equal
    // cardinality end up in an unspecified relative order.
    let mut ranked: Vec<_> = hashes
        .iter()
        .map(|&hash| (gram_cardinality(hash, snap), hash))
        .collect();
    ranked.sort_unstable_by_key(|&(cardinality, _)| cardinality);

    let smallest = ranked.first().map_or(0, |&(cardinality, _)| cardinality);
    if is_selective_enough(u64::from(smallest), total_docs, snap.scan_threshold) {
        return Ok(true);
    }
    if ranked.len() == 1 {
        return Ok(false);
    }

    // At least two grams remain here, so both indices are in bounds.
    let first = posting_bitmap(ranked[0].1, snap)?;
    let second = posting_bitmap(ranked[1].1, snap)?;
    let mut acc: RoaringBitmap = first.as_ref() & second.as_ref();
    // An empty intersection means the index yields no candidates at all, which
    // is the best possible selectivity.
    if acc.is_empty() || is_selective_enough(acc.len(), total_docs, snap.scan_threshold) {
        return Ok(true);
    }

    if let Some(&(_, third_hash)) = ranked.get(2) {
        let third = posting_bitmap(third_hash, snap)?;
        acc &= third.as_ref();
        if acc.is_empty() || is_selective_enough(acc.len(), total_docs, snap.scan_threshold) {
            return Ok(true);
        }
    }
    Ok(false)
}
#[cfg(test)]
mod tests;