Skip to main content

ix/
scanner.rs

1//! Fallback scanner (no index, competitive with ripgrep).
2//!
3//! Used when .ix index is missing or explicitly disabled.
4
5use crate::decompress::maybe_decompress;
6use crate::error::Result;
7use crate::executor::{Match, QueryOptions};
8use crate::format::is_binary;
9use ignore::WalkBuilder;
10use memmap2::Mmap;
11use rayon::prelude::*;
12use regex::Regex;
13use std::fs::File;
14use std::io::{BufRead, BufReader, Cursor, Read};
15use std::path::{Path, PathBuf};
16use std::sync::atomic::{AtomicU32, Ordering};
17
/// Fallback scanner that reads files directly (no index).
///
/// Used when the `.ix` index is missing or explicitly disabled. Walks the
/// filesystem and applies regex matching in parallel via Rayon.
#[derive(Debug, Clone)]
pub struct Scanner {
    /// Path the filesystem walk starts from.
    root: PathBuf,
}
25
26impl Scanner {
27    /// Create a new scanner rooted at `root`.
28    #[must_use]
29    pub fn new(root: &Path) -> Self {
30        Self {
31            root: root.to_owned(),
32        }
33    }
34
35    /// Scan files in the scanner's root directory for `pattern`.
36    ///
37    /// # Errors
38    ///
39    /// Returns an error if the regex is invalid or if file I/O fails during
40    /// the walk or content reading.
41    #[allow(clippy::too_many_lines)]
42    pub fn scan(
43        &self,
44        pattern: &str,
45        is_regex: bool,
46        ignore_case: bool,
47        options: &QueryOptions,
48    ) -> Result<Vec<Match>> {
49        let raw = if is_regex {
50            pattern.to_string()
51        } else {
52            regex::escape(pattern)
53        };
54        let regex_pat = if ignore_case {
55            format!("(?i){raw}")
56        } else {
57            raw
58        };
59        let regex = Regex::new(&regex_pat)?;
60
61        let walker = WalkBuilder::new(&self.root)
62            .hidden(false)
63            .git_ignore(true)
64            .require_git(false)
65            .add_custom_ignore_filename(".ixignore")
66            .filter_entry(move |entry| {
67                let path = entry.path();
68                let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
69
70                // Built-in directory defaults
71                if entry.file_type().is_some_and(|t| t.is_dir())
72                    && (name == "lost+found"
73                        || name == ".git"
74                        || name == "node_modules"
75                        || name == "target"
76                        || name == "__pycache__"
77                        || name == ".tox"
78                        || name == ".venv"
79                        || name == "venv"
80                        || name == ".ix")
81                {
82                    return false;
83                }
84
85                // Built-in file noise defaults
86                if entry.file_type().is_some_and(|t| t.is_file()) {
87                    if let Ok(metadata) = entry.metadata()
88                        && metadata.len() > 10 * 1024 * 1024
89                    {
90                        return false;
91                    }
92                    if name == "Cargo.lock"
93                        || name == "package-lock.json"
94                        || name == "pnpm-lock.yaml"
95                        || name == "shard.ix"
96                        || name == "shard.ix.tmp"
97                    {
98                        return false;
99                    }
100                }
101
102                // Built-in file extension defaults
103                if entry.file_type().is_some_and(|t| t.is_file()) {
104                    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
105                    match ext {
106                        // Binary extensions
107                        "so" | "o" | "dylib" | "a" | "dll" | "exe" | "pyc" |
108                        // Media
109                        "jpg" | "png" | "gif" | "mp4" | "mp3" | "pdf" |
110                        // Archives
111                        "zip" | "7z" | "rar" |
112                        // Data
113                        "sqlite" | "db" | "bin" => return false,
114                        _ => {}
115                    }
116                    if name.ends_with(".tar.gz") {
117                        return false;
118                    }
119                }
120                true
121            })
122            .build();
123
124        let paths: Vec<PathBuf> = walker
125            .filter_map(|result| match result {
126                Ok(entry) => Some(entry),
127                Err(e) => {
128                    eprintln!("ix: warning: scanner skipping path: {e}");
129                    None
130                }
131            })
132            .filter(|entry| entry.file_type().is_some_and(|t| t.is_file()))
133            .map(|entry| entry.path().to_owned())
134            .collect();
135
136        let matches_found = AtomicU32::new(0);
137        let mut matches: Vec<Match> = paths
138            .into_par_iter()
139            .filter_map(|path| {
140                if options.max_results > 0
141                    && matches_found.load(Ordering::Relaxed) >= u32::try_from(options.max_results).unwrap_or(0)
142                {
143                    return None;
144                }
145
146                // Filter by extension
147                if !options.type_filter.is_empty() {
148                    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
149                    if !options.type_filter.iter().any(|e: &String| e == ext) {
150                        return None;
151                    }
152                }
153
154                // Archive support
155                if options.archive {
156                    #[cfg(feature = "archive")]
157                    {
158                        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
159                        let is_tar_gz = path
160                            .to_str()
161                            .is_some_and(|s| s.ends_with(".tar.gz"));
162                        if ext == "zip"
163                            && let Ok(archive_matches) =
164                                crate::archive::scan_zip(&path, &regex, options)
165                        {
166                            matches_found
167                                .fetch_add(u32::try_from(archive_matches.len()).unwrap_or(0), Ordering::Relaxed);
168                            return Some(archive_matches);
169                        }
170                        if is_tar_gz
171                            && let Ok(archive_matches) =
172                                crate::archive::scan_tar_gz(&path, &regex, options)
173                        {
174                            matches_found
175                                .fetch_add(u32::try_from(archive_matches.len()).unwrap_or(0), Ordering::Relaxed);
176                            return Some(archive_matches);
177                        }
178                    }
179                }
180
181                let file_matches = Self::scan_file(&path, &regex, options).ok()?;
182                matches_found.fetch_add(u32::try_from(file_matches.len()).unwrap_or(0), Ordering::Relaxed);
183                Some(file_matches)
184            })
185            .flatten()
186            .collect();
187
188        if options.max_results > 0 && matches.len() > options.max_results {
189            matches.truncate(options.max_results);
190        }
191
192        Ok(matches)
193    }
194
195    #[allow(clippy::too_many_lines)]
196    fn scan_stream<R: Read>(
197        reader: R,
198        path: &Path,
199        regex: &Regex,
200        options: &QueryOptions,
201    ) -> Result<Vec<Match>> {
202        let mut buf_reader = BufReader::new(reader);
203        let mut matches = Vec::new();
204        let mut line_number = 0u32;
205        let mut byte_offset = 0u64;
206
207        // Binary check on first 8KB
208        {
209            let buffer = buf_reader.fill_buf()?;
210            if buffer.is_empty() {
211                return Ok(vec![]);
212            }
213            let is_bin = is_binary(buffer);
214            if is_bin && !options.binary {
215                return Ok(vec![]);
216            }
217        }
218
219        let mut line = String::new();
220        let mut context_before = std::collections::VecDeque::new();
221        let mut pending_matches: Vec<Match> = Vec::new();
222
223        while buf_reader.read_line(&mut line)? > 0 {
224            line_number += 1;
225            let line_len = u64::try_from(line.len()).unwrap_or(0);
226            let trimmed_line = line.trim_end().to_string();
227
228            // Fill context_after for pending matches
229            for m in &mut pending_matches {
230                if m.context_after.len() < options.context_lines {
231                    m.context_after.push(trimmed_line.clone());
232                }
233            }
234
235            // Move completed matches to final list
236            let (completed, still_pending): (Vec<_>, Vec<_>) = pending_matches
237                .into_iter()
238                .partition(|m| m.context_after.len() >= options.context_lines);
239            matches.extend(completed);
240            pending_matches = still_pending;
241
242            if let Some(m) = regex.find(&line) {
243                let context_before_vec: Vec<String> = context_before
244                    .iter()
245                    .map(|s: &String| s.trim_end().to_string())
246                    .collect();
247
248                let new_match = Match {
249                    file_path: path.to_owned(),
250                    line_number,
251                    col: u32::try_from(m.start() + 1).unwrap_or(0),
252                    line_content: if options.count_only {
253                        String::new()
254                    } else {
255                        trimmed_line.clone()
256                    },
257                    byte_offset: byte_offset + u64::try_from(m.start()).unwrap_or(0),
258                    context_before: context_before_vec,
259                    context_after: vec![],
260                    is_binary: false,
261                };
262
263                if options.context_lines > 0 {
264                    pending_matches.push(new_match);
265                } else {
266                    matches.push(new_match);
267                }
268
269                if options.max_results > 0
270                    && (matches.len() + pending_matches.len()) >= options.max_results
271                    && (pending_matches.is_empty() || matches.len() >= options.max_results)
272                {
273                    break;
274                }
275            }
276
277            if options.context_lines > 0 {
278                context_before.push_back(line.clone());
279                if context_before.len() > options.context_lines {
280                    context_before.pop_front();
281                }
282            }
283
284            byte_offset += line_len;
285            line.clear();
286        }
287
288        matches.extend(pending_matches);
289        Ok(matches)
290    }
291
292    fn scan_file(path: &Path, regex: &Regex, options: &QueryOptions) -> Result<Vec<Match>> {
293        let file = File::open(path)?;
294        let metadata = file.metadata()?;
295        if metadata.len() > 100 * 1024 * 1024 && !options.decompress {
296            // Keep 100MB limit for raw files to avoid huge mmaps in parallel
297            return Ok(vec![]);
298        }
299
300        let mmap = unsafe { Mmap::map(&file)? };
301
302        if options.decompress
303            && let Some(reader) = maybe_decompress(path, &mmap)?
304        {
305            return Self::scan_stream(reader, path, regex, options);
306        }
307
308        // Default to streaming via Cursor for uncompressed files to ensure constant memory (R-02)
309        Self::scan_stream(Cursor::new(&mmap[..]), path, regex, options)
310    }
311}