Skip to main content

ix/
scanner.rs

1//! Fallback scanner (no index, competitive with ripgrep).
2//!
3//! Used when .ix index is missing or explicitly disabled.
4
5use crate::decompress::maybe_decompress;
6use crate::error::Result;
7use crate::executor::{Match, QueryOptions};
8use crate::format::is_binary;
9use ignore::WalkBuilder;
10use memmap2::Mmap;
11use rayon::prelude::*;
12use regex::Regex;
13use std::fs::File;
14use std::io::{BufRead, BufReader, Cursor, Read};
15use std::path::{Path, PathBuf};
16use std::sync::atomic::{AtomicU32, Ordering};
17
/// Index-free scanner that searches files under a root directory directly.
///
/// Holds only the directory the walk starts from; all per-query settings are
/// passed to the scan call itself.
#[derive(Debug, Clone)]
pub struct Scanner {
    /// Directory the file walk starts from.
    root: PathBuf,
}
21
22impl Scanner {
23    pub fn new(root: &Path) -> Self {
24        Self {
25            root: root.to_owned(),
26        }
27    }
28
29    pub fn scan(
30        &self,
31        pattern: &str,
32        is_regex: bool,
33        ignore_case: bool,
34        options: &QueryOptions,
35    ) -> Result<Vec<Match>> {
36        let raw = if is_regex {
37            pattern.to_string()
38        } else {
39            regex::escape(pattern)
40        };
41        let regex_pat = if ignore_case {
42            format!("(?i){raw}")
43        } else {
44            raw
45        };
46        let regex = Regex::new(&regex_pat)?;
47
48        let walker = WalkBuilder::new(&self.root)
49            .hidden(false)
50            .git_ignore(true)
51            .require_git(false)
52            .add_custom_ignore_filename(".ixignore")
53            .filter_entry(move |entry| {
54                let path = entry.path();
55                let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
56
57                // Built-in directory defaults
58                if entry.file_type().map(|t| t.is_dir()).unwrap_or(false)
59                    && (name == "lost+found"
60                        || name == ".git"
61                        || name == "node_modules"
62                        || name == "target"
63                        || name == "__pycache__"
64                        || name == ".tox"
65                        || name == ".venv"
66                        || name == "venv"
67                        || name == ".ix")
68                {
69                    return false;
70                }
71
72                // Built-in file noise defaults
73                if entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
74                    if let Ok(metadata) = entry.metadata()
75                        && metadata.len() > 10 * 1024 * 1024
76                    {
77                        return false;
78                    }
79                    if name == "Cargo.lock"
80                        || name == "package-lock.json"
81                        || name == "pnpm-lock.yaml"
82                        || name == "shard.ix"
83                        || name == "shard.ix.tmp"
84                    {
85                        return false;
86                    }
87                }
88
89                // Built-in file extension defaults
90                if entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
91                    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
92                    match ext {
93                        // Binary extensions
94                        "so" | "o" | "dylib" | "a" | "dll" | "exe" | "pyc" |
95                        // Media
96                        "jpg" | "png" | "gif" | "mp4" | "mp3" | "pdf" |
97                        // Archives
98                        "zip" | "7z" | "rar" |
99                        // Data
100                        "sqlite" | "db" | "bin" => return false,
101                        _ => {}
102                    }
103                    if name.ends_with(".tar.gz") {
104                        return false;
105                    }
106                }
107                true
108            })
109            .build();
110
111        let paths: Vec<PathBuf> = walker
112            .filter_map(|result| match result {
113                Ok(entry) => Some(entry),
114                Err(e) => {
115                    eprintln!("ix: warning: scanner skipping path: {}", e);
116                    None
117                }
118            })
119            .filter(|entry| entry.file_type().map(|t| t.is_file()).unwrap_or(false))
120            .map(|entry| entry.path().to_owned())
121            .collect();
122
123        let matches_found = AtomicU32::new(0);
124        let mut matches: Vec<Match> = paths
125            .into_par_iter()
126            .filter_map(|path| {
127                if options.max_results > 0
128                    && matches_found.load(Ordering::Relaxed) >= options.max_results as u32
129                {
130                    return None;
131                }
132
133                // Filter by extension
134                if !options.type_filter.is_empty() {
135                    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
136                    if !options.type_filter.iter().any(|e: &String| e == ext) {
137                        return None;
138                    }
139                }
140
141                // Archive support
142                if options.archive {
143                    let _ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
144                    let _is_tar_gz = path
145                        .to_str()
146                        .map(|s| s.ends_with(".tar.gz"))
147                        .unwrap_or(false);
148
149                    #[cfg(feature = "archive")]
150                    {
151                        if _ext == "zip"
152                            && let Ok(archive_matches) =
153                                crate::archive::scan_zip(&path, &regex, options)
154                        {
155                            matches_found
156                                .fetch_add(archive_matches.len() as u32, Ordering::Relaxed);
157                            return Some(archive_matches);
158                        } else if _is_tar_gz
159                            && let Ok(archive_matches) =
160                                crate::archive::scan_tar_gz(&path, &regex, options)
161                        {
162                            matches_found
163                                .fetch_add(archive_matches.len() as u32, Ordering::Relaxed);
164                            return Some(archive_matches);
165                        }
166                    }
167                }
168
169                let file_matches = self.scan_file(&path, &regex, options).ok()?;
170                matches_found.fetch_add(file_matches.len() as u32, Ordering::Relaxed);
171                Some(file_matches)
172            })
173            .flatten()
174            .collect();
175
176        if options.max_results > 0 && matches.len() > options.max_results {
177            matches.truncate(options.max_results);
178        }
179
180        Ok(matches)
181    }
182
183    fn scan_stream<R: Read>(
184        &self,
185        reader: R,
186        path: &Path,
187        regex: &Regex,
188        options: &QueryOptions,
189    ) -> Result<Vec<Match>> {
190        let mut buf_reader = BufReader::new(reader);
191        let mut matches = Vec::new();
192        let mut line_number = 0u32;
193        let mut byte_offset = 0u64;
194
195        // Binary check on first 8KB
196        {
197            let buffer = buf_reader.fill_buf()?;
198            if buffer.is_empty() {
199                return Ok(vec![]);
200            }
201            let is_bin = is_binary(buffer);
202            if is_bin && !options.binary {
203                return Ok(vec![]);
204            }
205        }
206
207        let mut line = String::new();
208        let mut context_before = std::collections::VecDeque::new();
209        let mut pending_matches: Vec<Match> = Vec::new();
210
211        while buf_reader.read_line(&mut line)? > 0 {
212            line_number += 1;
213            let line_len = line.len() as u64;
214            let trimmed_line = line.trim_end().to_string();
215
216            // Fill context_after for pending matches
217            for m in &mut pending_matches {
218                if m.context_after.len() < options.context_lines {
219                    m.context_after.push(trimmed_line.clone());
220                }
221            }
222
223            // Move completed matches to final list
224            let (completed, still_pending): (Vec<_>, Vec<_>) = pending_matches
225                .into_iter()
226                .partition(|m| m.context_after.len() >= options.context_lines);
227            matches.extend(completed);
228            pending_matches = still_pending;
229
230            if let Some(m) = regex.find(&line) {
231                let context_before_vec: Vec<String> = context_before
232                    .iter()
233                    .map(|s: &String| s.trim_end().to_string())
234                    .collect();
235
236                let new_match = Match {
237                    file_path: path.to_owned(),
238                    line_number,
239                    col: (m.start() + 1) as u32,
240                    line_content: if options.count_only {
241                        String::new()
242                    } else {
243                        trimmed_line.clone()
244                    },
245                    byte_offset: byte_offset + m.start() as u64,
246                    context_before: context_before_vec,
247                    context_after: vec![],
248                    is_binary: false,
249                };
250
251                if options.context_lines > 0 {
252                    pending_matches.push(new_match);
253                } else {
254                    matches.push(new_match);
255                }
256
257                if options.max_results > 0
258                    && (matches.len() + pending_matches.len()) >= options.max_results
259                    && (pending_matches.is_empty() || matches.len() >= options.max_results)
260                {
261                    break;
262                }
263            }
264
265            if options.context_lines > 0 {
266                context_before.push_back(line.clone());
267                if context_before.len() > options.context_lines {
268                    context_before.pop_front();
269                }
270            }
271
272            byte_offset += line_len;
273            line.clear();
274        }
275
276        matches.extend(pending_matches);
277        Ok(matches)
278    }
279
280    fn scan_file(&self, path: &Path, regex: &Regex, options: &QueryOptions) -> Result<Vec<Match>> {
281        let file = File::open(path)?;
282        let metadata = file.metadata()?;
283        if metadata.len() > 100 * 1024 * 1024 && !options.decompress {
284            // Keep 100MB limit for raw files to avoid huge mmaps in parallel
285            return Ok(vec![]);
286        }
287
288        let mmap = unsafe { Mmap::map(&file)? };
289
290        if options.decompress
291            && let Some(reader) = maybe_decompress(path, &mmap)?
292        {
293            return self.scan_stream(reader, path, regex, options);
294        }
295
296        // Default to streaming via Cursor for uncompressed files to ensure constant memory (R-02)
297        self.scan_stream(Cursor::new(&mmap[..]), path, regex, options)
298    }
299}