siftdb_core/
query.rs

1use anyhow::Result;
2use crate::storage::{SegmentReader, decode_line_table, byte_to_line};
3use crate::types::*;
4use globset::Glob;
5use crate::index::{PathIndex, HandlesMap};
6use regex::Regex;
7
8impl Snapshot {
9    /// Find substring matches in the collection with optimization
10    pub fn find(&mut self, query: &str, path_glob: Option<&str>, limit: Option<usize>) -> Result<Vec<Hit>> {
11        // Use cached indexes instead of reloading from disk every time
12        let path_index = &self.path_index;
13        let handles_map = &self.handles_map;
14        
15        let limit = limit.unwrap_or(1000);
16        let mut hits = Vec::new();
17        
18        // Compile path filter if provided
19        let path_filter = if let Some(glob) = path_glob {
20            Some(globset::Glob::new(glob)?.compile_matcher())
21        } else {
22            None
23        };
24
25        // O(1) term lookup using inverted index
26        let candidate_files = if self.inverted_index.term_count() > 0 {
27            // Use inverted index for O(1) lookup
28            self.inverted_index.find_files_with_term(query)
29        } else {
30            // Fallback to old O(n) scanning if no inverted index
31            let mut all_files = std::collections::HashSet::new();
32            for &handle in self.path_index.paths.values() {
33                all_files.insert(handle as u32);
34            }
35            all_files
36        };
37        
38        // Process only the candidate files (much smaller set!)
39        for handle in candidate_files {
40            if hits.len() >= limit {
41                break;
42            }
43            
44            // Get path for this handle
45            let path = if let Some((path, _)) = self.path_index.paths.iter()
46                .find(|(_, &h)| h == handle as u64) {
47                path
48            } else {
49                continue;
50            };
51            
52            // Apply path glob filter if specified
53            if let Some(ref filter) = path_filter {
54                if !filter.is_match(path) {
55                    continue;
56                }
57            }
58            
59            if let Some(metadata) = self.handles_map.get_metadata(handle as u64) {
60                let store_path = self.collection_path.join("store");
61                
62                // Use cached reader or create new one
63                let reader = if let Some(reader) = self.segment_cache.get_mut(&metadata.seg_id) {
64                    reader
65                } else {
66                    let new_reader = SegmentReader::new(&store_path, metadata.seg_id)?;
67                    self.segment_cache.insert(metadata.seg_id, new_reader);
68                    self.segment_cache.get_mut(&metadata.seg_id).unwrap()
69                };
70                
71                let frame = reader.read_frame(metadata)?;
72                
73                // Convert content to string
74                if let Ok(content_str) = String::from_utf8(frame.content.clone()) {
75                    // Find all matches in this file
76                    let newline_positions = decode_line_table(&frame.line_table)?;
77                    
78                    for (byte_offset, line_content) in find_matches_in_content(&content_str, query) {
79                        let line_num = byte_to_line(byte_offset, &newline_positions);
80                        
81                        hits.push(Hit {
82                            path: path.clone(),
83                            line: line_num,
84                            text: line_content,
85                        });
86                        
87                        if hits.len() >= limit {
88                            return Ok(hits);
89                        }
90                    }
91                }
92            }
93        }
94        
95        Ok(hits)
96    }
97    
98    /// Regex search with trigram acceleration (Milestone 0.2 feature)
99    pub fn regex_find(&self, pattern: &str, path_glob: Option<&str>, limit: Option<usize>) -> Result<Vec<Hit>> {
100        let path_index = PathIndex::read_from_file(&self.collection_path.join("index/path.json"))?;
101        let handles_map = HandlesMap::read_from_file(&self.collection_path.join("index/handles.json"))?;
102        
103        // TODO: Implement trigram optimization later
104        let trigram_candidates: Option<Vec<u32>> = None;
105
106        let regex = Regex::new(pattern)?;
107        let mut hits = Vec::new();
108        let limit = limit.unwrap_or(1000);
109        
110        // Get file handles to search - convert u64 to u32 for compatibility
111        let file_handles: Vec<u32> = if let Some(candidates) = trigram_candidates {
112            candidates.into_iter().collect()
113        } else {
114            // Fall back to searching all files if no trigram index
115            path_index.paths.values().map(|&h| h as u32).collect()
116        };
117        
118        // Filter by path glob if provided
119        let path_filter = path_glob.map(|pattern| {
120            Glob::new(pattern).unwrap().compile_matcher()
121        });
122
123        for file_handle in file_handles {
124            if hits.len() >= limit {
125                break;
126            }
127            
128            if let Some(metadata) = handles_map.handles.get(&(file_handle as u64)) {
129                // Check path glob filter if provided
130                if let Some(ref filter) = path_filter {
131                    if let Some(path_entry) = path_index.paths.iter().find(|(_, &h)| h as u32 == file_handle) {
132                        if !filter.is_match(path_entry.0) {
133                            continue;
134                        }
135                    }
136                }
137
138                // Read the segment file
139                let mut reader = SegmentReader::new(&self.collection_path.join("segments"), metadata.seg_id)?;
140                
141                if let Ok(frame) = reader.read_frame(metadata) {
142                    let content_str = String::from_utf8_lossy(&frame.content);
143                    
144                    // Apply regex to each line
145                    for (line_idx, line) in content_str.lines().enumerate() {
146                        if regex.is_match(line) {
147                            let file_path = path_index.paths.iter()
148                                .find(|(_, &h)| h as u32 == file_handle)
149                                .map(|(path, _)| path.to_string())
150                                .unwrap_or_else(|| "unknown".to_string());
151                                
152                            hits.push(Hit {
153                                path: file_path,
154                                line: (line_idx + 1) as u32,
155                                text: line.to_string(),
156                            });
157                            
158                            if hits.len() >= limit {
159                                break;
160                            }
161                        }
162                    }
163                }
164            }
165        }
166
167        Ok(hits)
168    }
169    
170    /// Grep with regex patterns (for future implementation)
171    pub fn grep(&mut self, pattern: &str, path_glob: Option<&str>, limit: Option<usize>) -> Result<Vec<Hit>> {
172        // For MVP, we'll treat this as a simple substring search
173        // TODO: Implement proper regex support
174        self.find(pattern, path_glob, limit)
175    }
176    
177    /// Open a text span from a file
178    pub fn open_span(&self, path: &str, start_line: u32, end_line: u32) -> Result<TextSpan> {
179        let path_index = PathIndex::read_from_file(&self.collection_path.join("index/path.json"))?;
180        let handles_map = HandlesMap::read_from_file(&self.collection_path.join("index/handles.json"))?;
181        
182        let handle = path_index.get_handle(path)
183            .ok_or_else(|| anyhow::anyhow!("Path not found: {}", path))?;
184        
185        let metadata = handles_map.get_metadata(handle)
186            .ok_or_else(|| anyhow::anyhow!("Handle metadata not found: {}", handle))?;
187        
188        let store_path = self.collection_path.join("store");
189        let mut reader = SegmentReader::new(&store_path, metadata.seg_id)?;
190        let frame = reader.read_frame(metadata)?;
191        
192        let content_str = String::from_utf8(frame.content)
193            .map_err(|_| anyhow::anyhow!("File contains non-UTF8 content"))?;
194        
195        let newline_positions = decode_line_table(&frame.line_table)?;
196        let lines = extract_line_range(&content_str, &newline_positions, start_line, end_line)?;
197        
198        Ok(TextSpan {
199            path: path.to_string(),
200            content: lines,
201            start_line,
202            end_line,
203        })
204    }
205}
206
/// Finds every line of `content` that contains `query`.
///
/// Returns one `(byte_offset, line)` pair per matching line, in file order.
/// `byte_offset` is the offset of the first byte of that line within `content`;
/// `line` is the line text without its terminator (`\n` or `\r\n`).
///
/// Offsets advance by the full length of each raw line including its
/// terminator, so they stay correct for CRLF files as well as LF files.
fn find_matches_in_content(content: &str, query: &str) -> Vec<(usize, String)> {
    let mut matches = Vec::new();
    let mut byte_offset = 0;

    for raw_line in content.split_inclusive('\n') {
        // Strip the terminator the same way `str::lines` does: a trailing `\n`,
        // and a `\r` only when it directly precedes that `\n`.
        let line = raw_line
            .strip_suffix('\n')
            .map_or(raw_line, |body| body.strip_suffix('\r').unwrap_or(body));

        if line.contains(query) {
            matches.push((byte_offset, line.to_string()));
        }

        byte_offset += raw_line.len();
    }

    matches
}
222
223/// Extract a range of lines from content
224fn extract_line_range(
225    content: &str, 
226    _newline_positions: &[u32], 
227    start_line: u32, 
228    end_line: u32
229) -> Result<String> {
230    let lines: Vec<&str> = content.lines().collect();
231    
232    if start_line == 0 || end_line == 0 {
233        anyhow::bail!("Line numbers must be 1-based");
234    }
235    
236    let start_idx = (start_line - 1) as usize;
237    let end_idx = std::cmp::min(end_line as usize, lines.len());
238    
239    if start_idx >= lines.len() {
240        anyhow::bail!("Start line {} exceeds file length {}", start_line, lines.len());
241    }
242    
243    let selected_lines = &lines[start_idx..end_idx];
244    Ok(selected_lines.join("\n"))
245}
246
/// Minimal glob matcher covering a handful of fixed pattern shapes.
///
/// Supported shapes, checked in this order:
/// * `*` and `**/*` - match any text.
/// * `**/*.ext` - text ends with `.ext` (the dot is part of the comparison).
/// * `**/name` - text is `name` or ends with `/name`.
/// * `dir/**` - text starts with `dir/`.
/// * anything else - exact string equality.
///
/// Wildcards anywhere else in the pattern are treated as literal characters.
// Currently unused: the `Snapshot` search methods match paths with `globset`.
#[allow(dead_code)]
fn glob_match(pattern: &str, text: &str) -> bool {
    if pattern == "**/*" || pattern == "*" {
        return true;
    }

    // `**/*.ext`: keep the leading dot so `**/*.rs` does not match `src/users`.
    if let Some(ext) = pattern.strip_prefix("**/*") {
        if ext.starts_with('.') {
            return text.ends_with(ext);
        }
    }

    // `**/name`: the match must begin at a path component boundary.
    if let Some(suffix) = pattern.strip_prefix("**/") {
        return text
            .strip_suffix(suffix)
            .map_or(false, |head| head.is_empty() || head.ends_with('/'));
    }

    // `dir/**`: the text must live underneath `dir`, not merely share a prefix.
    if let Some(prefix) = pattern.strip_suffix("/**") {
        return text
            .strip_prefix(prefix)
            .map_or(false, |rest| rest.starts_with('/'));
    }

    // Exact match fallback
    pattern == text
}