use rayon::prelude::*;
use std::collections::BTreeSet;
use std::sync::OnceLock;
use crate::rules::{is_word_byte, AcMeta, ResidualShard, RuleSet};
use crate::scan_format::{
build_line_index, end_in_line_indexed, format_hit, is_likely_binary, line_and_col_indexed,
};
/// Scans `content` against every rule in `rs` and returns one formatted hit
/// string (built by `format_hit`) per reported match.
///
/// Returns an empty vector when `is_likely_binary(content)` is true.
///
/// Hits are emitted in this order:
/// 1. literal matches from the `rs.ac` automaton, in automaton match order;
/// 2. matches of regex rules whose prefix was seen by either automaton, in
///    ascending rule-position order;
/// 3. matches from `rs.residual_shards`, in shard order.
///
/// The line index used to translate byte offsets into line/column pairs is
/// built at most once, and only if at least one hit has to be formatted.
pub fn scan_content(path: &str, content: &[u8], rs: &RuleSet) -> Vec<String> {
    let mut hits: Vec<String> = Vec::new();
    if is_likely_binary(content) {
        return hits;
    }

    // Positions (into `rs.regex_rules`) of regex rules whose literal prefix
    // was found; a `BTreeSet` de-duplicates and keeps them sorted.
    let mut prefix_matched: BTreeSet<usize> = BTreeSet::new();
    // Lazily built and shared with the rayon workers below.
    let line_index: OnceLock<Vec<usize>> = OnceLock::new();

    if let Some(ac) = &rs.ac {
        for m in ac.find_overlapping_iter(content) {
            let pid = m.pattern().as_usize();
            match &rs.ac_meta[pid] {
                AcMeta::Literal {
                    idx,
                    bound_left,
                    bound_right,
                } => {
                    // A bounded side must not be directly adjacent to a word byte.
                    let left_blocked =
                        *bound_left && m.start() > 0 && is_word_byte(content[m.start() - 1]);
                    let right_blocked = *bound_right
                        && m.end() < content.len()
                        && is_word_byte(content[m.end()]);
                    if left_blocked || right_blocked {
                        continue;
                    }
                    let li = line_index.get_or_init(|| build_line_index(content));
                    let (line, col_start) = line_and_col_indexed(li, m.start());
                    let end = end_in_line_indexed(li, m.start(), m.end());
                    // `end` is used as an exclusive offset: the end column is
                    // taken from the byte before it (clamped at offset 0).
                    let (_, col_end) = line_and_col_indexed(li, end.saturating_sub(1));
                    hits.push(format_hit(path, line, col_start, col_end, *idx));
                }
                AcMeta::RegexPrefix { rule_pos } => {
                    prefix_matched.insert(*rule_pos);
                }
            }
        }
    }

    if let Some(ac_ci) = &rs.ac_ci {
        for m in ac_ci.find_overlapping_iter(content) {
            let pid = m.pattern().as_usize();
            match &rs.ac_meta_ci[pid] {
                // NOTE(review): literal entries of the `ac_ci` automaton produce
                // no hit here — confirm this is intentional.
                AcMeta::Literal { .. } => {}
                AcMeta::RegexPrefix { rule_pos } => {
                    prefix_matched.insert(*rule_pos);
                }
            }
        }
    }

    if !prefix_matched.is_empty() {
        let positions: Vec<usize> = prefix_matched.iter().copied().collect();
        let regex_hits: Vec<String> = positions
            .par_iter()
            .flat_map_iter(|&pos| {
                let mut local: Vec<String> = Vec::new();
                append_regex_rule_hits(path, content, rs, pos, &line_index, &mut local);
                local
            })
            .collect();
        hits.extend(regex_hits);
    }

    for shard in &rs.residual_shards {
        match shard {
            ResidualShard::Single { rule_pos } => {
                append_regex_rule_hits(path, content, rs, *rule_pos, &line_index, &mut hits);
            }
            ResidualShard::Combined { gate, positions } => {
                // The individual rules of the shard run only if the gate matches.
                if gate.is_match(content) {
                    let regex_hits: Vec<String> = positions
                        .par_iter()
                        .flat_map_iter(|&pos| {
                            let mut local: Vec<String> = Vec::new();
                            append_regex_rule_hits(
                                path,
                                content,
                                rs,
                                pos,
                                &line_index,
                                &mut local,
                            );
                            local
                        })
                        .collect();
                    hits.extend(regex_hits);
                }
            }
        }
    }

    hits
}

/// Runs the regex rule at `rs.regex_rules[pos]` over `content` and appends one
/// formatted hit to `out` for every non-empty match.
///
/// `line_index` is initialised on the first non-empty match, so a rule that
/// reports nothing never triggers `build_line_index`. If `find_all` returns an
/// error the rule contributes no hits and the error is dropped.
fn append_regex_rule_hits(
    path: &str,
    content: &[u8],
    rs: &RuleSet,
    pos: usize,
    line_index: &OnceLock<Vec<usize>>,
    out: &mut Vec<String>,
) {
    let rr = &rs.regex_rules[pos];
    let Ok(matches) = rr.re.find_all(content) else {
        return;
    };
    for m in matches {
        // Zero-width matches are not reported.
        if m.start == m.end {
            continue;
        }
        let li = line_index.get_or_init(|| build_line_index(content));
        let (line, col_start) = line_and_col_indexed(li, m.start);
        let end = end_in_line_indexed(li, m.start, m.end);
        // `end` is used as an exclusive offset: the end column is taken from
        // the byte before it (clamped at offset 0).
        let (_, col_end) = line_and_col_indexed(li, end.saturating_sub(1));
        out.push(format_hit(path, line, col_start, col_end, rr.idx));
    }
}