use anyhow::{Context, Result};
use memchr::memmem;
use regex::{Regex, RegexBuilder};
use crate::heading::parse_atx_heading;
use crate::scanner::{FileVisitor, ScanAction};
use crate::types::ContentMatch;
/// Matching strategy used by `ContentSearchVisitor` to test a single line.
#[derive(Debug)]
enum SearchMode {
/// ASCII case-insensitive substring search.
Substring {
/// The search pattern after ASCII lowercasing.
lowered: String,
/// Owned substring finder built from the bytes of `lowered`.
finder: memmem::Finder<'static>,
},
/// Regular-expression search using an already compiled `Regex`.
Regex(Regex),
}
/// Visitor that records every visited line matching a search pattern,
/// together with the heading section the line appeared under.
#[derive(Debug)]
pub struct ContentSearchVisitor {
/// How a line is tested for a match (substring or regex).
mode: SearchMode,
/// Most recent heading seen on a body line, stored as the `#` markers, a
/// space, then the heading text; empty until the first heading is seen.
current_section: String,
/// Matches recorded so far, in the order the lines were visited.
matches: Vec<ContentMatch>,
/// Reusable buffer for the ASCII-lowercased bytes of the line currently
/// being tested in substring mode.
line_scratch: Vec<u8>,
}
impl ContentSearchVisitor {
    /// Creates a visitor that performs an ASCII case-insensitive substring search.
    ///
    /// The pattern is ASCII-lowercased once here, and every tested line is
    /// ASCII-lowercased before being searched. Only ASCII letters are folded;
    /// all other bytes must match exactly.
    #[must_use]
    pub fn new(pattern: &str) -> Self {
        let lowered = pattern.to_ascii_lowercase();
        // `into_owned` gives the finder its own copy of the needle so it can
        // be stored with a `'static` lifetime next to `lowered`.
        let finder = memmem::Finder::new(lowered.as_bytes()).into_owned();
        Self {
            mode: SearchMode::Substring { lowered, finder },
            current_section: String::new(),
            matches: Vec::new(),
            line_scratch: Vec::new(),
        }
    }

    /// Creates a visitor that matches lines against the regular expression `pattern`.
    ///
    /// Matching is case-insensitive by default; a pattern can opt out with the
    /// inline flag `(?-i)`. The compiled program is limited to `1 << 20` bytes.
    ///
    /// # Errors
    ///
    /// Returns an error (with the context `invalid regular expression: <pattern>`)
    /// if the pattern fails to compile, including when it exceeds the size limit.
    #[must_use = "returns a compiled regex visitor; call has no side effects"]
    pub fn regex(pattern: &str) -> Result<Self> {
        // Set case-insensitivity through the builder rather than by prefixing
        // "(?i)" to the text: the default is the same, but regex diagnostics
        // then quote exactly what the user typed, with correct offsets.
        let re = RegexBuilder::new(pattern)
            .case_insensitive(true)
            .size_limit(1 << 20)
            .build()
            .with_context(|| format!("invalid regular expression: {pattern}"))?;
        Ok(Self::from_compiled(re))
    }

    /// Creates a visitor from an already compiled regex.
    ///
    /// The regex is used exactly as given; no flags are added here.
    #[must_use]
    pub fn from_compiled(re: Regex) -> Self {
        Self {
            mode: SearchMode::Regex(re),
            current_section: String::new(),
            matches: Vec::new(),
            line_scratch: Vec::new(),
        }
    }

    /// Returns `true` once at least one match has been recorded.
    #[must_use]
    pub fn has_matches(&self) -> bool {
        !self.matches.is_empty()
    }

    /// Consumes the visitor and returns the recorded matches in visit order.
    #[must_use]
    pub fn into_matches(self) -> Vec<ContentMatch> {
        self.matches
    }

    /// Returns the ASCII-lowercased pattern bytes in substring mode, or
    /// `None` in regex mode.
    #[must_use]
    pub fn pattern_bytes(&self) -> Option<&[u8]> {
        match &self.mode {
            SearchMode::Substring { lowered, .. } => Some(lowered.as_bytes()),
            SearchMode::Regex(_) => None,
        }
    }

    /// Tests a single line against the configured search mode.
    ///
    /// Takes `&mut self` because substring mode reuses `line_scratch` for the
    /// lowercased copy of the line instead of allocating per call.
    fn is_match(&mut self, line: &str) -> bool {
        match &self.mode {
            SearchMode::Substring { finder, .. } => {
                self.line_scratch.clear();
                // Bulk copy, then fold in place. ASCII folding leaves every
                // non-ASCII byte untouched, so multi-byte UTF-8 is unaffected.
                self.line_scratch.extend_from_slice(line.as_bytes());
                self.line_scratch.make_ascii_lowercase();
                finder.find(&self.line_scratch).is_some()
            }
            SearchMode::Regex(re) => re.is_match(line),
        }
    }
}
/// Reports whether `pattern` does not occur anywhere in the ASCII-lowercased
/// copy of `file_data`.
///
/// `scratch` is cleared and then filled with the lowercased copy of
/// `file_data`. `pattern` is searched as-is and is not lowercased here, so a
/// case-insensitive check requires an already lowercased pattern.
///
/// Returns `true` when there is no occurrence and `false` otherwise. An empty
/// `pattern` is always found, so it never rejects.
pub fn fast_reject(file_data: &[u8], pattern: &[u8], scratch: &mut Vec<u8>) -> bool {
    scratch.clear();
    scratch.extend_from_slice(file_data);
    scratch.make_ascii_lowercase();
    memmem::find(scratch.as_slice(), pattern).is_none()
}
/// Scanner callbacks: track the current heading on body lines and record
/// every line (body or code block) that matches the search pattern.
impl FileVisitor for ContentSearchVisitor {
    /// Handles a body line. Matching uses `raw`; the cleaned text is ignored.
    fn on_body_line(&mut self, raw: &str, _cleaned: &str, line_num: usize) -> ScanAction {
        // Update the section before matching, so a heading line that itself
        // matches is attributed to its own heading.
        if let Some((level, heading_text)) = parse_atx_heading(raw) {
            self.current_section = format!("{} {}", "#".repeat(level as usize), heading_text);
        }
        self.record_if_match(raw, line_num);
        ScanAction::Continue
    }

    /// Handles a line inside a code block. The line is matched like any other,
    /// but never parsed as a heading, so the current section is unchanged.
    fn on_code_block_line(&mut self, raw: &str, line_num: usize) -> ScanAction {
        self.record_if_match(raw, line_num);
        ScanAction::Continue
    }

    /// This visitor always answers `false` here.
    fn needs_frontmatter(&self) -> bool {
        false
    }
}

impl ContentSearchVisitor {
    /// Records `raw` as a match under the current section when it satisfies
    /// the search pattern; does nothing otherwise.
    fn record_if_match(&mut self, raw: &str, line_num: usize) {
        if !self.is_match(raw) {
            return;
        }
        let section = self.current_section.clone();
        self.matches.push(ContentMatch {
            line: line_num,
            section,
            text: raw.to_owned(),
        });
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Feeds every line of `content` to `visitor` as a body line, numbering
    /// lines from 1. The raw text doubles as the "cleaned" text.
    fn feed_body_lines(visitor: &mut ContentSearchVisitor, content: &str) {
        for (idx, line) in content.lines().enumerate() {
            visitor.on_body_line(line, line, idx + 1);
        }
    }

    /// Runs a substring search over `content`, treating every line as a body line.
    fn run_visitor(content: &str, pattern: &str) -> Vec<ContentMatch> {
        let mut visitor = ContentSearchVisitor::new(pattern);
        feed_body_lines(&mut visitor, content);
        visitor.into_matches()
    }

    /// Runs a regex search over `content`, treating every line as a body line.
    fn run_regex_visitor(content: &str, pattern: &str) -> Vec<ContentMatch> {
        let mut visitor = ContentSearchVisitor::regex(pattern).unwrap();
        feed_body_lines(&mut visitor, content);
        visitor.into_matches()
    }

    // --- Substring mode: basic matching -----------------------------------

    #[test]
    fn finds_exact_match() {
        let matches = run_visitor("Hello world\nnothing here\n", "world");
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].line, 1);
        assert_eq!(matches[0].text, "Hello world");
    }

    #[test]
    fn case_insensitive_match() {
        let matches = run_visitor("Hello WORLD\nGoodbye world\n", "world");
        assert_eq!(matches.len(), 2);
    }

    #[test]
    fn uppercase_pattern_matches_lowercase_line() {
        let matches = run_visitor("foo bar baz\n", "BAR");
        assert_eq!(matches.len(), 1);
    }

    #[test]
    fn no_match_returns_empty() {
        let matches = run_visitor("Nothing relevant here\n", "zzz");
        assert!(matches.is_empty());
    }

    #[test]
    fn has_matches_false_when_empty() {
        let visitor = ContentSearchVisitor::new("x");
        assert!(!visitor.has_matches());
    }

    #[test]
    fn has_matches_true_after_match() {
        let mut visitor = ContentSearchVisitor::new("hello");
        visitor.on_body_line("say hello", "say hello", 1);
        assert!(visitor.has_matches());
    }

    #[test]
    fn correct_line_numbers() {
        let content = "line one\nline two\nline three\n";
        let matches = run_visitor(content, "two");
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].line, 2);
    }

    // --- Section tracking ---------------------------------------------------

    #[test]
    fn section_tracking_updates_on_heading() {
        let content = "## Design\nsome text\n### Sub\nother text\n";
        let matches = run_visitor(content, "text");
        assert_eq!(matches.len(), 2);
        assert_eq!(matches[0].section, "## Design");
        assert_eq!(matches[1].section, "### Sub");
    }

    #[test]
    fn section_empty_before_first_heading() {
        let matches = run_visitor("intro text\n## Section\n", "intro");
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].section, "");
    }

    #[test]
    fn heading_line_itself_can_be_matched() {
        let matches = run_visitor("## Design Goals\n", "design");
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].text, "## Design Goals");
        assert_eq!(matches[0].section, "## Design Goals");
    }

    // A heading that does not contain the pattern is not reported itself, but
    // still becomes the section of the lines that follow it.
    #[test]
    fn heading_not_matched_when_no_pattern() {
        let matches = run_visitor("## Design\nsome content\n", "content");
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].section, "## Design");
    }

    #[test]
    fn into_matches_consumes_visitor() {
        let mut visitor = ContentSearchVisitor::new("hello");
        visitor.on_body_line("say hello", "say hello", 1);
        visitor.on_body_line("hello again", "hello again", 2);
        let matches = visitor.into_matches();
        assert_eq!(matches.len(), 2);
    }

    #[test]
    fn level_1_heading_tracked() {
        let matches = run_visitor("# Top Level\nbody\n", "body");
        assert_eq!(matches[0].section, "# Top Level");
    }

    #[test]
    fn invalid_atx_heading_not_tracked() {
        let matches = run_visitor("#NoSpace\nbody\n", "body");
        assert_eq!(matches[0].section, "");
    }

    #[test]
    fn heading_with_inline_code_span_preserved_in_section() {
        let content = "## The `versions` field\nsome text\n";
        let matches = run_visitor(content, "text");
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].section, "## The `versions` field");
    }

    #[test]
    fn heading_with_inline_code_span_is_matchable() {
        let content = "## The `versions` field\n";
        let matches = run_visitor(content, "versions");
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].text, "## The `versions` field");
    }

    // --- Regex mode ---------------------------------------------------------

    #[test]
    fn regex_simple_match() {
        let matches = run_regex_visitor("Hello world\nnothing here\n", "wor.d");
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].text, "Hello world");
    }

    #[test]
    fn regex_case_insensitive_by_default() {
        let matches = run_regex_visitor("Hello WORLD\nGoodbye world\n", "world");
        assert_eq!(matches.len(), 2);
    }

    #[test]
    fn regex_alternation() {
        let matches = run_regex_visitor("TODO fix this\nFIXME later\nall good\n", "TODO|FIXME");
        assert_eq!(matches.len(), 2);
    }

    #[test]
    fn regex_anchored() {
        let matches = run_regex_visitor("## Design\nnot a heading\n", r"^##\s");
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].text, "## Design");
    }

    #[test]
    fn regex_explicit_case_sensitive() {
        let matches = run_regex_visitor("Hello WORLD\nGoodbye world\n", "(?-i)WORLD");
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].text, "Hello WORLD");
    }

    #[test]
    fn regex_user_flag_overrides_default() {
        let matches = run_regex_visitor("Hello WORLD\nGoodbye world\n", "(?-i)world");
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].text, "Goodbye world");
    }

    #[test]
    fn regex_section_tracking() {
        let content = "## Tasks\n- TODO item\n### Done\n- completed\n";
        let matches = run_regex_visitor(content, "TODO|completed");
        assert_eq!(matches.len(), 2);
        assert_eq!(matches[0].section, "## Tasks");
        assert_eq!(matches[1].section, "### Done");
    }

    #[test]
    fn regex_invalid_returns_error() {
        let result = ContentSearchVisitor::regex("[invalid");
        assert!(result.is_err());
        let err = result.unwrap_err().to_string();
        assert!(err.contains("invalid regular expression"), "got: {err}");
    }

    #[test]
    fn regex_non_capturing_group_still_case_insensitive() {
        let matches = run_regex_visitor("Hello WORLD\nGoodbye world\n", "(?:world)");
        assert_eq!(matches.len(), 2);
    }

    #[test]
    fn regex_empty_pattern_matches_everything() {
        let matches = run_regex_visitor("line one\nline two\n", "");
        assert_eq!(matches.len(), 2);
    }

    #[test]
    fn regex_no_match_returns_empty() {
        let matches = run_regex_visitor("Nothing here\n", r"\d{4}-\d{2}-\d{2}");
        assert!(matches.is_empty());
    }

    #[test]
    fn regex_rejects_oversized_pattern() {
        let huge = (0..50_000)
            .map(|i| format!("word{i}"))
            .collect::<Vec<_>>()
            .join("|");
        let result = ContentSearchVisitor::regex(&huge);
        assert!(result.is_err(), "oversized pattern should be rejected");
    }

    // `from_compiled` must use the regex exactly as given: unlike `regex`, it
    // does not make the search case-insensitive.
    #[test]
    fn from_compiled_uses_regex_as_given() {
        let mut visitor = ContentSearchVisitor::from_compiled(Regex::new("world").unwrap());
        feed_body_lines(&mut visitor, "Hello WORLD\nGoodbye world\n");
        let matches = visitor.into_matches();
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].text, "Goodbye world");
    }

    // --- Full scanner integration (code blocks) -----------------------------

    /// Runs a substring search through the real scanner, so code-block lines
    /// are routed by the scanner itself.
    fn run_full_scan(content: &str, pattern: &str) -> Vec<ContentMatch> {
        use crate::scanner::scan_reader_multi;
        let mut visitor = ContentSearchVisitor::new(pattern);
        scan_reader_multi(content.as_bytes(), &mut [&mut visitor]).unwrap();
        visitor.into_matches()
    }

    #[test]
    fn finds_match_inside_code_block() {
        let content = "---\n---\n## Code\n```rust\nlet typescript = 42;\n```\n";
        let matches = run_full_scan(content, "typescript");
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].line, 5);
        assert_eq!(matches[0].section, "## Code");
    }

    #[test]
    fn finds_match_inside_code_block_regex() {
        use crate::scanner::scan_reader_multi;
        let content = "---\n---\n```\nfoo_bar_baz\n```\n";
        let mut visitor = ContentSearchVisitor::regex("foo.*baz").unwrap();
        scan_reader_multi(content.as_bytes(), &mut [&mut visitor]).unwrap();
        let matches = visitor.into_matches();
        assert_eq!(matches.len(), 1);
    }

    #[test]
    fn code_block_match_outside_and_inside() {
        let content = "---\n---\nhello world\n```\nhello code\n```\n";
        let matches = run_full_scan(content, "hello");
        assert_eq!(matches.len(), 2);
    }

    #[test]
    fn heading_inside_code_block_not_tracked_as_section() {
        let content = "---\n---\n## Real Section\n```\n# not a heading\nfoo\n```\nbar\n";
        let matches = run_full_scan(content, "bar");
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].section, "## Real Section");
    }

    #[test]
    fn no_match_inside_code_block_when_pattern_absent() {
        let content = "---\n---\n```\nsome code here\n```\n";
        let matches = run_full_scan(content, "zzz");
        assert!(matches.is_empty());
    }

    // --- Substring finder edge cases ----------------------------------------

    #[test]
    fn finder_empty_needle_matches_everything() {
        assert_eq!(run_visitor("anything", "").len(), 1);
        // Empty content yields no lines at all, so nothing can match.
        assert!(run_visitor("", "").is_empty());
    }

    #[test]
    fn finder_needle_longer_than_haystack() {
        let matches = run_visitor("ab", "abc");
        assert!(matches.is_empty());
    }

    #[test]
    fn finder_exact_match() {
        let matches = run_visitor("hello", "hello");
        assert_eq!(matches.len(), 1);
    }

    #[test]
    fn finder_mixed_case_match() {
        let matches = run_visitor("Hello WORLD", "lo wor");
        assert_eq!(matches.len(), 1);
    }

    #[test]
    fn finder_no_match() {
        let matches = run_visitor("hello world", "xyz");
        assert!(matches.is_empty());
    }

    #[test]
    fn finder_multibyte_utf8_in_haystack() {
        let matches = run_visitor("café latte", "latte");
        assert_eq!(matches.len(), 1);
        let matches2 = run_visitor("über cool", "cool");
        assert_eq!(matches2.len(), 1);
    }

    #[test]
    fn finder_match_at_end() {
        let matches = run_visitor("say HELLO", "hello");
        assert_eq!(matches.len(), 1);
    }

    #[test]
    fn finder_single_char() {
        assert_eq!(run_visitor("A", "a").len(), 1);
        assert!(run_visitor("A", "b").is_empty());
    }

    // --- pattern_bytes ------------------------------------------------------

    #[test]
    fn pattern_bytes_substring_mode() {
        let visitor = ContentSearchVisitor::new("Hello");
        assert_eq!(visitor.pattern_bytes(), Some(b"hello".as_ref()));
    }

    #[test]
    fn pattern_bytes_regex_mode() {
        let visitor = ContentSearchVisitor::regex("world").unwrap();
        assert_eq!(visitor.pattern_bytes(), None);
    }

    // --- fast_reject --------------------------------------------------------

    #[test]
    fn fast_reject_no_match_returns_true() {
        let mut scratch = Vec::new();
        assert!(fast_reject(b"hello world", b"xyz", &mut scratch));
    }

    #[test]
    fn fast_reject_match_returns_false() {
        let mut scratch = Vec::new();
        assert!(!fast_reject(b"hello world", b"world", &mut scratch));
    }

    #[test]
    fn fast_reject_case_insensitive() {
        let mut scratch = Vec::new();
        assert!(!fast_reject(b"Hello WORLD", b"world", &mut scratch));
        assert!(!fast_reject(b"Hello WORLD", b"hello", &mut scratch));
    }

    #[test]
    fn fast_reject_empty_pattern_never_rejects() {
        let mut scratch = Vec::new();
        assert!(!fast_reject(b"anything", b"", &mut scratch));
    }

    #[test]
    fn fast_reject_empty_data_with_nonempty_pattern() {
        let mut scratch = Vec::new();
        assert!(fast_reject(b"", b"abc", &mut scratch));
    }
}