Skip to main content

seekr_code/search/
text.rs

1//! Text regex search.
2//!
3//! Full-text regex matching using the `regex` crate.
4//! Supports case sensitivity and context line configuration.
5
6use std::path::Path;
7
8use regex::RegexBuilder;
9
10use crate::error::SearchError;
11use crate::index::store::SeekrIndex;
12use crate::parser::CodeChunk;
13
/// Tunable parameters for a text (regex) search.
#[derive(Debug, Clone)]
pub struct TextSearchOptions {
    /// When `true`, the pattern is matched case-sensitively;
    /// when `false`, letter case is ignored.
    pub case_sensitive: bool,

    /// How many lines of surrounding context to include on each side of a match.
    pub context_lines: usize,

    /// Upper bound on the number of results returned.
    pub top_k: usize,
}
26
27impl Default for TextSearchOptions {
28    fn default() -> Self {
29        Self {
30            case_sensitive: false,
31            context_lines: 2,
32            top_k: 20,
33        }
34    }
35}
36
/// The outcome of matching a pattern against one code chunk.
#[derive(Debug, Clone)]
pub struct TextMatch {
    /// Identifier of the chunk in which the pattern was found.
    pub chunk_id: u64,

    /// 0-based indices of the matching lines, counted from the first line
    /// of the chunk body.
    pub matched_lines: Vec<usize>,

    /// Relevance score; grows with both the number of matching lines and
    /// the fraction of the chunk's lines that match.
    pub score: f32,
}
49
50/// Perform text regex search across the index.
51///
52/// Searches through all indexed code chunks using regex pattern matching.
53/// Results are scored by the number and density of matches.
54pub fn search_text_regex(
55    index: &SeekrIndex,
56    query: &str,
57    options: &TextSearchOptions,
58) -> Result<Vec<TextMatch>, SearchError> {
59    let regex = RegexBuilder::new(query)
60        .case_insensitive(!options.case_sensitive)
61        .build()
62        .map_err(|e| SearchError::InvalidRegex(e.to_string()))?;
63
64    let mut matches: Vec<TextMatch> = Vec::new();
65
66    for (chunk_id, chunk) in &index.chunks {
67        let mut matched_lines = Vec::new();
68
69        for (line_idx, line) in chunk.body.lines().enumerate() {
70            if regex.is_match(line) {
71                matched_lines.push(line_idx);
72            }
73        }
74
75        if !matched_lines.is_empty() {
76            let total_lines = chunk.body.lines().count().max(1) as f32;
77            let match_count = matched_lines.len() as f32;
78
79            // Score combines match count with density
80            // More matches and higher density = higher score
81            let density = match_count / total_lines;
82            let score = match_count + density * 10.0;
83
84            matches.push(TextMatch {
85                chunk_id: *chunk_id,
86                matched_lines,
87                score,
88            });
89        }
90    }
91
92    // Sort by score descending
93    matches.sort_by(|a, b| {
94        b.score
95            .partial_cmp(&a.score)
96            .unwrap_or(std::cmp::Ordering::Equal)
97    });
98
99    // Truncate to top-k
100    matches.truncate(options.top_k);
101
102    Ok(matches)
103}
104
105/// Perform text regex search directly on files (without index).
106///
107/// Scans the file system for regex matches. Useful for ad-hoc searches
108/// before an index is built.
109pub fn search_text_in_file(
110    file_path: &Path,
111    query: &str,
112    case_sensitive: bool,
113) -> Result<Vec<(usize, String)>, SearchError> {
114    let regex = RegexBuilder::new(query)
115        .case_insensitive(!case_sensitive)
116        .build()
117        .map_err(|e| SearchError::InvalidRegex(e.to_string()))?;
118
119    let content = std::fs::read_to_string(file_path)
120        .map_err(|e| SearchError::Index(crate::error::IndexError::Io(e)))?;
121
122    let mut results = Vec::new();
123    for (line_idx, line) in content.lines().enumerate() {
124        if regex.is_match(line) {
125            results.push((line_idx, line.to_string()));
126        }
127    }
128
129    Ok(results)
130}
131
132/// Get context lines around matched lines.
133///
134/// Returns a list of (line_number, line_content, is_match) tuples.
135pub fn get_match_context(
136    chunk: &CodeChunk,
137    matched_lines: &[usize],
138    context_lines: usize,
139) -> Vec<(usize, String, bool)> {
140    let lines: Vec<&str> = chunk.body.lines().collect();
141    let total = lines.len();
142    let mut result: Vec<(usize, String, bool)> = Vec::new();
143    let mut included: std::collections::HashSet<usize> = std::collections::HashSet::new();
144
145    for &match_line in matched_lines {
146        let start = match_line.saturating_sub(context_lines);
147        let end = (match_line + context_lines + 1).min(total);
148
149        for (line_idx, line) in lines.iter().enumerate().take(end).skip(start) {
150            if included.insert(line_idx) {
151                let is_match = matched_lines.contains(&line_idx);
152                result.push((
153                    line_idx + chunk.line_range.start, // absolute line number
154                    line.to_string(),
155                    is_match,
156                ));
157            }
158        }
159    }
160
161    result.sort_by_key(|(line, _, _)| *line);
162    result
163}
164
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::ChunkKind;
    use std::path::PathBuf;

    /// Builds a function-kind chunk for `test.rs` whose line range starts at 0
    /// and spans the lines of `body`.
    fn make_chunk(id: u64, body: &str) -> CodeChunk {
        let line_count = body.lines().count();
        CodeChunk {
            id,
            file_path: PathBuf::from("test.rs"),
            language: "rust".to_string(),
            kind: ChunkKind::Function,
            name: Some("test_fn".to_string()),
            signature: None,
            doc_comment: None,
            body: body.to_string(),
            byte_range: 0..body.len(),
            line_range: 0..line_count,
        }
    }

    /// Creates an index via `SeekrIndex::new(4)` holding one chunk (id 1)
    /// with the given body.
    fn single_chunk_index(body: &str) -> SeekrIndex {
        let mut index = SeekrIndex::new(4);
        let chunk = make_chunk(1, body);
        let entry = crate::index::IndexEntry {
            chunk_id: 1,
            embedding: vec![0.1; 4],
            text_tokens: vec!["authenticate".to_string()],
        };
        index.add_entry(entry, chunk);
        index
    }

    #[test]
    fn test_text_search_regex() {
        let index =
            single_chunk_index("fn authenticate(user: &str) {\n    validate(user);\n}\n");
        let options = TextSearchOptions {
            case_sensitive: false,
            context_lines: 0,
            top_k: 10,
        };

        let found = search_text_regex(&index, "authenticate", &options).unwrap();

        assert_eq!(found.len(), 1);
        let first = &found[0];
        assert_eq!(first.chunk_id, 1);
        assert!(!first.matched_lines.is_empty());
    }

    #[test]
    fn test_text_search_case_insensitive() {
        // The body capitalises the name; the lowercase query must still hit.
        let index = single_chunk_index("fn Authenticate(user: &str) {}");
        let options = TextSearchOptions {
            case_sensitive: false,
            ..Default::default()
        };

        let found = search_text_regex(&index, "authenticate", &options).unwrap();

        assert_eq!(found.len(), 1);
    }

    #[test]
    fn test_context_lines() {
        let chunk = make_chunk(1, "line 0\nline 1\nMATCH line 2\nline 3\nline 4\n");

        let context = get_match_context(&chunk, &[2], 1);

        // At least the match plus one line before and one line after.
        assert!(context.len() >= 3);
        let line_nums: Vec<usize> = context.iter().map(|entry| entry.0).collect();
        for expected in [1usize, 2, 3] {
            assert!(line_nums.contains(&expected));
        }
    }

    #[test]
    fn test_invalid_regex() {
        let index = SeekrIndex::new(4);
        let options = TextSearchOptions::default();

        // An unterminated character class is not a valid pattern.
        let outcome = search_text_regex(&index, "[invalid", &options);

        assert!(outcome.is_err());
    }
}