Skip to main content

ix/
scanner.rs

1//! Fallback scanner (no index, competitive with ripgrep).
2//!
3//! Used when .ix index is missing or explicitly disabled.
4
5use crate::decompress::maybe_decompress;
6use crate::error::Result;
7use crate::format::is_binary;
8use crate::executor::{Match, QueryOptions};
9use ignore::WalkBuilder;
10use memmap2::Mmap;
11use rayon::prelude::*;
12use regex::Regex;
13use std::fs::File;
14use std::io::{BufRead, BufReader, Cursor, Read};
15use std::path::{Path, PathBuf};
16use std::sync::atomic::{AtomicU32, Ordering};
17
/// Index-free scanner that walks a directory tree and searches file
/// contents directly.
pub struct Scanner {
    /// Directory the walk starts from.
    root: PathBuf,
}
21
22impl Scanner {
23    pub fn new(root: &Path) -> Self {
24        Self {
25            root: root.to_owned(),
26        }
27    }
28
29    pub fn scan(
30        &self,
31        pattern: &str,
32        is_regex: bool,
33        ignore_case: bool,
34        options: &QueryOptions,
35    ) -> Result<Vec<Match>> {
36        let raw = if is_regex {
37            pattern.to_string()
38        } else {
39            regex::escape(pattern)
40        };
41        let regex_pat = if ignore_case { format!("(?i){raw}") } else { raw };
42        let regex = Regex::new(&regex_pat)?;
43
44        let walker = WalkBuilder::new(&self.root)
45            .hidden(false)
46            .git_ignore(true)
47            .require_git(false)
48            .add_custom_ignore_filename(".ixignore")
49            .filter_entry(move |entry| {
50                let path = entry.path();
51                let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
52                
53                // Built-in directory defaults
54                if entry.file_type().map(|t| t.is_dir()).unwrap_or(false)
55                    && (name == "lost+found" || name == ".git" || name == "node_modules" || 
56                       name == "target" || name == "__pycache__" || name == ".tox" || 
57                       name == ".venv" || name == "venv" || name == ".ix") 
58                {
59                    return false;
60                }
61
62                // Built-in file noise defaults
63                if entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
64                    if let Ok(metadata) = entry.metadata()
65                        && metadata.len() > 10 * 1024 * 1024
66                    {
67                        return false;
68                    }
69                    if name == "Cargo.lock" || name == "package-lock.json" || name == "pnpm-lock.yaml" || 
70                       name == "shard.ix" || name == "shard.ix.tmp"
71                    {
72                        return false;
73                    }
74                }
75
76                // Built-in file extension defaults
77                if entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
78                    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
79                    match ext {
80                        // Binary extensions
81                        "so" | "o" | "dylib" | "a" | "dll" | "exe" | "pyc" |
82                        // Media
83                        "jpg" | "png" | "gif" | "mp4" | "mp3" | "pdf" |
84                        // Archives
85                        "zip" | "7z" | "rar" |
86                        // Data
87                        "sqlite" | "db" | "bin" => return false,
88                        _ => {}
89                    }
90                    if name.ends_with(".tar.gz") {
91                        return false;
92                    }
93                }
94                true
95            })
96            .build();
97
98        let paths: Vec<PathBuf> = walker
99            .filter_map(|result| {
100                match result {
101                    Ok(entry) => Some(entry),
102                    Err(e) => {
103                        eprintln!("ix: warning: scanner skipping path: {}", e);
104                        None
105                    }
106                }
107            })
108            .filter(|entry| entry.file_type().map(|t| t.is_file()).unwrap_or(false))
109            .map(|entry| entry.path().to_owned())
110            .collect();
111
112        let matches_found = AtomicU32::new(0);
113        let mut matches: Vec<Match> = paths
114            .into_par_iter()
115            .filter_map(|path| {
116                if options.max_results > 0
117                    && matches_found.load(Ordering::Relaxed) >= options.max_results as u32
118                {
119                    return None;
120                }
121
122                // Filter by extension
123                if !options.type_filter.is_empty() {
124                    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
125                    if !options.type_filter.iter().any(|e: &String| e == ext) {
126                        return None;
127                    }
128                }
129
130                // Archive support
131                if options.archive {
132                    let _ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
133                    let _is_tar_gz = path.to_str().map(|s| s.ends_with(".tar.gz")).unwrap_or(false);
134
135                    #[cfg(feature = "archive")]
136                    {
137                        if _ext == "zip"
138                            && let Ok(archive_matches) = crate::archive::scan_zip(&path, &regex, options)
139                        {
140                            matches_found.fetch_add(archive_matches.len() as u32, Ordering::Relaxed);
141                            return Some(archive_matches);
142                        } else if _is_tar_gz
143                            && let Ok(archive_matches) = crate::archive::scan_tar_gz(&path, &regex, options)
144                        {
145                            matches_found.fetch_add(archive_matches.len() as u32, Ordering::Relaxed);
146                            return Some(archive_matches);
147                        }
148                    }
149                }
150
151                let file_matches = self.scan_file(&path, &regex, options).ok()?;
152                matches_found.fetch_add(file_matches.len() as u32, Ordering::Relaxed);
153                Some(file_matches)
154            })
155            .flatten()
156            .collect();
157
158        if options.max_results > 0 && matches.len() > options.max_results {
159            matches.truncate(options.max_results);
160        }
161
162        Ok(matches)
163    }
164
165    fn scan_stream<R: Read>(
166        &self,
167        reader: R,
168        path: &Path,
169        regex: &Regex,
170        options: &QueryOptions,
171    ) -> Result<Vec<Match>> {
172        let mut buf_reader = BufReader::new(reader);
173        let mut matches = Vec::new();
174        let mut line_number = 0u32;
175        let mut byte_offset = 0u64;
176
177        // Binary check on first 8KB
178        {
179            let buffer = buf_reader.fill_buf()?;
180            if buffer.is_empty() {
181                return Ok(vec![]);
182            }
183            let is_bin = is_binary(buffer);
184            if is_bin && !options.binary {
185                return Ok(vec![]);
186            }
187        }
188
189        let mut line = String::new();
190        let mut context_before = std::collections::VecDeque::new();
191        let mut pending_matches: Vec<Match> = Vec::new();
192
193        while buf_reader.read_line(&mut line)? > 0 {
194            line_number += 1;
195            let line_len = line.len() as u64;
196            let trimmed_line = line.trim_end().to_string();
197
198            // Fill context_after for pending matches
199            for m in &mut pending_matches {
200                if m.context_after.len() < options.context_lines {
201                    m.context_after.push(trimmed_line.clone());
202                }
203            }
204
205            // Move completed matches to final list
206            let (completed, still_pending): (Vec<_>, Vec<_>) = pending_matches
207                .into_iter()
208                .partition(|m| m.context_after.len() >= options.context_lines);
209            matches.extend(completed);
210            pending_matches = still_pending;
211
212            if let Some(m) = regex.find(&line) {
213                let context_before_vec: Vec<String> =
214                    context_before.iter().map(|s: &String| s.trim_end().to_string()).collect();
215
216                let new_match = Match {
217                    file_path: path.to_owned(),
218                    line_number,
219                    col: (m.start() + 1) as u32,
220                    line_content: if options.count_only {
221                        String::new()
222                    } else {
223                        trimmed_line.clone()
224                    },
225                    byte_offset: byte_offset + m.start() as u64,
226                    context_before: context_before_vec,
227                    context_after: vec![],
228                    is_binary: false,
229                };
230
231                if options.context_lines > 0 {
232                    pending_matches.push(new_match);
233                } else {
234                    matches.push(new_match);
235                }
236
237                if options.max_results > 0
238                    && (matches.len() + pending_matches.len()) >= options.max_results
239                    && (pending_matches.is_empty() || matches.len() >= options.max_results)
240                {
241                    break;
242                }
243            }
244
245            if options.context_lines > 0 {
246                context_before.push_back(line.clone());
247                if context_before.len() > options.context_lines {
248                    context_before.pop_front();
249                }
250            }
251
252            byte_offset += line_len;
253            line.clear();
254        }
255
256        matches.extend(pending_matches);
257        Ok(matches)
258    }
259
260    fn scan_file(
261        &self,
262        path: &Path,
263        regex: &Regex,
264        options: &QueryOptions,
265    ) -> Result<Vec<Match>> {
266        let file = File::open(path)?;
267        let metadata = file.metadata()?;
268        if metadata.len() > 100 * 1024 * 1024 && !options.decompress {
269            // Keep 100MB limit for raw files to avoid huge mmaps in parallel
270            return Ok(vec![]);
271        }
272
273        let mmap = unsafe { Mmap::map(&file)? };
274
275        if options.decompress
276            && let Some(reader) = maybe_decompress(path, &mmap)? {
277            return self.scan_stream(reader, path, regex, options);
278        }
279
280        // Default to streaming via Cursor for uncompressed files to ensure constant memory (R-02)
281        self.scan_stream(Cursor::new(&mmap[..]), path, regex, options)
282    }
283}