Skip to main content

ix/
scanner.rs

1//! Fallback scanner (no index, competitive with ripgrep).
2//!
3//! Used when .ix index is missing or explicitly disabled.
4
5use crate::decompress::maybe_decompress;
6use crate::error::Result;
7use crate::executor::{Match, QueryOptions};
8use crate::format::is_binary;
9use ignore::WalkBuilder;
10use memmap2::Mmap;
11use rayon::prelude::*;
12use regex::Regex;
13use std::fs::File;
14use std::io::{BufRead, BufReader, Cursor, Read};
15use std::path::{Path, PathBuf};
16use std::sync::atomic::{AtomicU32, Ordering};
17
/// Fallback scanner that reads files directly (no index).
///
/// Used when the `.ix` index is missing or explicitly disabled. Walks the
/// filesystem and applies regex matching in parallel via Rayon.
///
/// The type is cheap to clone (it only owns the root path) and implements
/// `Debug` so it can be logged or embedded in other debuggable types.
#[derive(Debug, Clone)]
pub struct Scanner {
    /// Directory the walk starts from.
    root: PathBuf,
}
25
26impl Scanner {
27    /// Create a new scanner rooted at `root`.
28    #[must_use]
29    pub fn new(root: &Path) -> Self {
30        Self {
31            root: root.to_owned(),
32        }
33    }
34
35    /// Scan files in the scanner's root directory for `pattern`.
36    ///
37    /// # Errors
38    ///
39    /// Returns an error if the regex is invalid or if file I/O fails during
40    /// the walk or content reading.
41    #[allow(clippy::too_many_lines)]
42    pub fn scan(
43        &self,
44        pattern: &str,
45        is_regex: bool,
46        ignore_case: bool,
47        options: &QueryOptions,
48    ) -> Result<Vec<Match>> {
49        let raw = if is_regex {
50            pattern.to_string()
51        } else {
52            regex::escape(pattern)
53        };
54
55        // Apply word boundary wrapping for literal patterns (same as planner.rs)
56        let with_word_boundaries = if options.word_boundary && !is_regex {
57            format!("\\b{raw}\\b")
58        } else {
59            raw
60        };
61
62        // Build regex pattern with flags
63        let mut regex_pat = String::new();
64        if ignore_case {
65            regex_pat.push_str("(?i)");
66        }
67        if options.multiline {
68            regex_pat.push_str("(?s)");
69        }
70        regex_pat.push_str(&with_word_boundaries);
71
72        let regex = Regex::new(&regex_pat)?;
73
74        let walker = WalkBuilder::new(&self.root)
75            .hidden(false)
76            .git_ignore(true)
77            .require_git(false)
78            .add_custom_ignore_filename(".ixignore")
79            .filter_entry(move |entry| {
80                let path = entry.path();
81                let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
82
83                // Built-in directory defaults
84                if entry.file_type().is_some_and(|t| t.is_dir())
85                    && (name == "lost+found"
86                        || name == ".git"
87                        || name == "node_modules"
88                        || name == "target"
89                        || name == "__pycache__"
90                        || name == ".tox"
91                        || name == ".venv"
92                        || name == "venv"
93                        || name == ".ix")
94                {
95                    return false;
96                }
97
98                // Built-in file noise defaults
99                if entry.file_type().is_some_and(|t| t.is_file()) {
100                    if let Ok(metadata) = entry.metadata()
101                        && metadata.len() > 10 * 1024 * 1024
102                    {
103                        return false;
104                    }
105                    if name == "Cargo.lock"
106                        || name == "package-lock.json"
107                        || name == "pnpm-lock.yaml"
108                        || name == "shard.ix"
109                        || name == "shard.ix.tmp"
110                    {
111                        return false;
112                    }
113                }
114
115                // Built-in file extension defaults
116                if entry.file_type().is_some_and(|t| t.is_file()) {
117                    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
118                    match ext {
119                        // Binary extensions
120                        "so" | "o" | "dylib" | "a" | "dll" | "exe" | "pyc" |
121                        // Media
122                        "jpg" | "png" | "gif" | "mp4" | "mp3" | "pdf" |
123                        // Archives
124                        "zip" | "7z" | "rar" |
125                        // Data
126                        "sqlite" | "db" | "bin" => return false,
127                        _ => {}
128                    }
129                    if name.ends_with(".tar.gz") {
130                        return false;
131                    }
132                }
133                true
134            })
135            .build();
136
137        let paths: Vec<PathBuf> = walker
138            .filter_map(|result| match result {
139                Ok(entry) => Some(entry),
140                Err(e) => {
141                    eprintln!("ix: warning: scanner skipping path: {e}");
142                    None
143                }
144            })
145            .filter(|entry| entry.file_type().is_some_and(|t| t.is_file()))
146            .map(|entry| entry.path().to_owned())
147            .collect();
148
149        let matches_found = AtomicU32::new(0);
150        let mut matches: Vec<Match> = paths
151            .into_par_iter()
152            .filter_map(|path| {
153                if options.max_results > 0
154                    && matches_found.load(Ordering::Relaxed)
155                        >= u32::try_from(options.max_results).unwrap_or(0)
156                {
157                    return None;
158                }
159
160                // Filter by extension
161                if !options.type_filter.is_empty() {
162                    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
163                    if !options.type_filter.iter().any(|e: &String| e == ext) {
164                        return None;
165                    }
166                }
167
168                // Archive support
169                if options.archive {
170                    #[cfg(feature = "archive")]
171                    {
172                        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
173                        let is_tar_gz = path.to_str().is_some_and(|s| s.ends_with(".tar.gz"));
174                        if ext == "zip"
175                            && let Ok(archive_matches) =
176                                crate::archive::scan_zip(&path, &regex, options)
177                        {
178                            matches_found.fetch_add(
179                                u32::try_from(archive_matches.len()).unwrap_or(0),
180                                Ordering::Relaxed,
181                            );
182                            return Some(archive_matches);
183                        }
184                        if is_tar_gz
185                            && let Ok(archive_matches) =
186                                crate::archive::scan_tar_gz(&path, &regex, options)
187                        {
188                            matches_found.fetch_add(
189                                u32::try_from(archive_matches.len()).unwrap_or(0),
190                                Ordering::Relaxed,
191                            );
192                            return Some(archive_matches);
193                        }
194                    }
195                }
196
197                let file_matches = Self::scan_file(&path, &regex, options).ok()?;
198                matches_found.fetch_add(
199                    u32::try_from(file_matches.len()).unwrap_or(0),
200                    Ordering::Relaxed,
201                );
202                Some(file_matches)
203            })
204            .flatten()
205            .collect();
206
207        if options.max_results > 0 && matches.len() > options.max_results {
208            matches.truncate(options.max_results);
209        }
210
211        Ok(matches)
212    }
213
214    #[allow(clippy::too_many_lines)]
215    fn scan_stream<R: Read>(
216        reader: R,
217        path: &Path,
218        regex: &Regex,
219        options: &QueryOptions,
220    ) -> Result<Vec<Match>> {
221        let mut buf_reader = BufReader::new(reader);
222        let mut matches = Vec::new();
223        let mut line_number = 0u32;
224        let mut byte_offset = 0u64;
225
226        // Binary check on first 8KB
227        {
228            let buffer = buf_reader.fill_buf()?;
229            if buffer.is_empty() {
230                return Ok(vec![]);
231            }
232            let is_bin = is_binary(buffer);
233            if is_bin && !options.binary {
234                return Ok(vec![]);
235            }
236        }
237
238        let mut line = String::new();
239        let mut context_before = std::collections::VecDeque::new();
240        let mut pending_matches: Vec<Match> = Vec::new();
241
242        while buf_reader.read_line(&mut line)? > 0 {
243            line_number += 1;
244            let line_len = u64::try_from(line.len()).unwrap_or(0);
245            let trimmed_line = line.trim_end().to_string();
246
247            // Fill context_after for pending matches
248            for m in &mut pending_matches {
249                if m.context_after.len() < options.context_lines {
250                    m.context_after.push(trimmed_line.clone());
251                }
252            }
253
254            // Move completed matches to final list
255            let (completed, still_pending): (Vec<_>, Vec<_>) = pending_matches
256                .into_iter()
257                .partition(|m| m.context_after.len() >= options.context_lines);
258            matches.extend(completed);
259            pending_matches = still_pending;
260
261            if let Some(m) = regex.find(&line) {
262                let context_before_vec: Vec<String> = context_before
263                    .iter()
264                    .map(|s: &String| s.trim_end().to_string())
265                    .collect();
266
267                let new_match = Match {
268                    file_path: path.to_owned(),
269                    line_number,
270                    col: u32::try_from(m.start() + 1).unwrap_or(0),
271                    line_content: if options.count_only {
272                        String::new()
273                    } else {
274                        trimmed_line.clone()
275                    },
276                    byte_offset: byte_offset + u64::try_from(m.start()).unwrap_or(0),
277                    context_before: context_before_vec,
278                    context_after: vec![],
279                    is_binary: false,
280                };
281
282                if options.context_lines > 0 {
283                    pending_matches.push(new_match);
284                } else {
285                    matches.push(new_match);
286                }
287
288                if options.max_results > 0
289                    && (matches.len() + pending_matches.len()) >= options.max_results
290                    && (pending_matches.is_empty() || matches.len() >= options.max_results)
291                {
292                    break;
293                }
294            }
295
296            if options.context_lines > 0 {
297                context_before.push_back(line.clone());
298                if context_before.len() > options.context_lines {
299                    context_before.pop_front();
300                }
301            }
302
303            byte_offset += line_len;
304            line.clear();
305        }
306
307        matches.extend(pending_matches);
308        Ok(matches)
309    }
310
311    fn scan_file(path: &Path, regex: &Regex, options: &QueryOptions) -> Result<Vec<Match>> {
312        let file = File::open(path)?;
313        let metadata = file.metadata()?;
314        if metadata.len() > 100 * 1024 * 1024 && !options.decompress {
315            // Keep 100MB limit for raw files to avoid huge mmaps in parallel
316            return Ok(vec![]);
317        }
318
319        let mmap = unsafe { Mmap::map(&file)? };
320
321        if options.decompress
322            && let Some(reader) = maybe_decompress(path, &mmap)?
323        {
324            return Self::scan_stream(reader, path, regex, options);
325        }
326
327        // Default to streaming via Cursor for uncompressed files to ensure constant memory (R-02)
328        Self::scan_stream(Cursor::new(&mmap[..]), path, regex, options)
329    }
330}