Skip to main content

ix/
scanner.rs

//! Fallback scanner: searches files directly without an index (intended to be competitive with ripgrep).
//!
//! Used when the `.ix` index is missing or explicitly disabled.
4
5use crate::decompress::maybe_decompress;
6use crate::error::Result;
7use crate::format::is_binary;
8use crate::executor::{Match, QueryOptions};
9use ignore::WalkBuilder;
10use memmap2::Mmap;
11use rayon::prelude::*;
12use regex::Regex;
13use std::fs::File;
14use std::io::{BufRead, BufReader, Cursor, Read};
15use std::path::{Path, PathBuf};
16use std::sync::atomic::{AtomicU32, Ordering};
17
/// Index-free scanner that walks a directory tree and searches each file directly.
///
/// Construct it with [`Scanner::new`] and run a search with [`Scanner::scan`].
#[derive(Debug, Clone)]
pub struct Scanner {
    /// Path the directory walk starts from.
    root: PathBuf,
}
21
22impl Scanner {
23    pub fn new(root: &Path) -> Self {
24        Self {
25            root: root.to_owned(),
26        }
27    }
28
29    pub fn scan(
30        &self,
31        pattern: &str,
32        is_regex: bool,
33        ignore_case: bool,
34        options: &QueryOptions,
35    ) -> Result<Vec<Match>> {
36        let raw = if is_regex {
37            pattern.to_string()
38        } else {
39            regex::escape(pattern)
40        };
41        let regex_pat = if ignore_case { format!("(?i){raw}") } else { raw };
42        let regex = Regex::new(&regex_pat)?;
43
44        let walker = WalkBuilder::new(&self.root)
45            .hidden(false)
46            .git_ignore(true)
47            .require_git(false)
48            .add_custom_ignore_filename(".ixignore")
49            .filter_entry(move |entry| {
50                let path = entry.path();
51                let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
52                
53                // Built-in directory defaults
54                if entry.file_type().map(|t| t.is_dir()).unwrap_or(false)
55                    && (name == "lost+found" || name == ".git" || name == "node_modules" || 
56                       name == "target" || name == "__pycache__" || name == ".tox" || 
57                       name == ".venv" || name == "venv" || name == ".ix") 
58                {
59                    return false;
60                }
61
62                // Built-in file extension defaults
63                if entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
64                    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
65                    match ext {
66                        // Binary extensions
67                        "so" | "o" | "dylib" | "a" | "dll" | "exe" | "pyc" |
68                        // Media
69                        "jpg" | "png" | "gif" | "mp4" | "mp3" | "pdf" |
70                        // Archives
71                        "zip" | "7z" | "rar" |
72                        // Data
73                        "sqlite" | "db" | "bin" => return false,
74                        _ => {}
75                    }
76                    if name.ends_with(".tar.gz") {
77                        return false;
78                    }
79                }
80                true
81            })
82            .build();
83
84        let paths: Vec<PathBuf> = walker
85            .filter_map(|result| {
86                match result {
87                    Ok(entry) => Some(entry),
88                    Err(e) => {
89                        eprintln!("ix: warning: scanner skipping path: {}", e);
90                        None
91                    }
92                }
93            })
94            .filter(|entry| entry.file_type().map(|t| t.is_file()).unwrap_or(false))
95            .map(|entry| entry.path().to_owned())
96            .collect();
97
98        let matches_found = AtomicU32::new(0);
99        let mut matches: Vec<Match> = paths
100            .into_par_iter()
101            .filter_map(|path| {
102                if options.max_results > 0
103                    && matches_found.load(Ordering::Relaxed) >= options.max_results as u32
104                {
105                    return None;
106                }
107
108                // Filter by extension
109                if !options.type_filter.is_empty() {
110                    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
111                    if !options.type_filter.iter().any(|e: &String| e == ext) {
112                        return None;
113                    }
114                }
115
116                // Archive support
117                if options.archive {
118                    let _ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
119                    let _is_tar_gz = path.to_str().map(|s| s.ends_with(".tar.gz")).unwrap_or(false);
120
121                    #[cfg(feature = "archive")]
122                    {
123                        if _ext == "zip"
124                            && let Ok(archive_matches) = crate::archive::scan_zip(&path, &regex, options)
125                        {
126                            matches_found.fetch_add(archive_matches.len() as u32, Ordering::Relaxed);
127                            return Some(archive_matches);
128                        } else if _is_tar_gz
129                            && let Ok(archive_matches) = crate::archive::scan_tar_gz(&path, &regex, options)
130                        {
131                            matches_found.fetch_add(archive_matches.len() as u32, Ordering::Relaxed);
132                            return Some(archive_matches);
133                        }
134                    }
135                }
136
137                let file_matches = self.scan_file(&path, &regex, options).ok()?;
138                matches_found.fetch_add(file_matches.len() as u32, Ordering::Relaxed);
139                Some(file_matches)
140            })
141            .flatten()
142            .collect();
143
144        if options.max_results > 0 && matches.len() > options.max_results {
145            matches.truncate(options.max_results);
146        }
147
148        Ok(matches)
149    }
150
151    fn scan_stream<R: Read>(
152        &self,
153        reader: R,
154        path: &Path,
155        regex: &Regex,
156        options: &QueryOptions,
157    ) -> Result<Vec<Match>> {
158        let mut buf_reader = BufReader::new(reader);
159        let mut matches = Vec::new();
160        let mut line_number = 0u32;
161        let mut byte_offset = 0u64;
162
163        // Binary check on first 8KB
164        {
165            let buffer = buf_reader.fill_buf()?;
166            if buffer.is_empty() {
167                return Ok(vec![]);
168            }
169            let is_bin = is_binary(buffer);
170            if is_bin && !options.binary {
171                return Ok(vec![]);
172            }
173        }
174
175        let mut line = String::new();
176        let mut context_before = std::collections::VecDeque::new();
177        let mut pending_matches: Vec<Match> = Vec::new();
178
179        while buf_reader.read_line(&mut line)? > 0 {
180            line_number += 1;
181            let line_len = line.len() as u64;
182            let trimmed_line = line.trim_end().to_string();
183
184            // Fill context_after for pending matches
185            for m in &mut pending_matches {
186                if m.context_after.len() < options.context_lines {
187                    m.context_after.push(trimmed_line.clone());
188                }
189            }
190
191            // Move completed matches to final list
192            let (completed, still_pending): (Vec<_>, Vec<_>) = pending_matches
193                .into_iter()
194                .partition(|m| m.context_after.len() >= options.context_lines);
195            matches.extend(completed);
196            pending_matches = still_pending;
197
198            if let Some(m) = regex.find(&line) {
199                let context_before_vec: Vec<String> =
200                    context_before.iter().map(|s: &String| s.trim_end().to_string()).collect();
201
202                let new_match = Match {
203                    file_path: path.to_owned(),
204                    line_number,
205                    col: (m.start() + 1) as u32,
206                    line_content: if options.count_only {
207                        String::new()
208                    } else {
209                        trimmed_line.clone()
210                    },
211                    byte_offset: byte_offset + m.start() as u64,
212                    context_before: context_before_vec,
213                    context_after: vec![],
214                    is_binary: false,
215                };
216
217                if options.context_lines > 0 {
218                    pending_matches.push(new_match);
219                } else {
220                    matches.push(new_match);
221                }
222
223                if options.max_results > 0
224                    && (matches.len() + pending_matches.len()) >= options.max_results
225                    && (pending_matches.is_empty() || matches.len() >= options.max_results)
226                {
227                    break;
228                }
229            }
230
231            if options.context_lines > 0 {
232                context_before.push_back(line.clone());
233                if context_before.len() > options.context_lines {
234                    context_before.pop_front();
235                }
236            }
237
238            byte_offset += line_len;
239            line.clear();
240        }
241
242        matches.extend(pending_matches);
243        Ok(matches)
244    }
245
246    fn scan_file(
247        &self,
248        path: &Path,
249        regex: &Regex,
250        options: &QueryOptions,
251    ) -> Result<Vec<Match>> {
252        let file = File::open(path)?;
253        let metadata = file.metadata()?;
254        if metadata.len() > 100 * 1024 * 1024 && !options.decompress {
255            // Keep 100MB limit for raw files to avoid huge mmaps in parallel
256            return Ok(vec![]);
257        }
258
259        let mmap = unsafe { Mmap::map(&file)? };
260
261        if options.decompress
262            && let Some(reader) = maybe_decompress(path, &mmap)? {
263            return self.scan_stream(reader, path, regex, options);
264        }
265
266        // Default to streaming via Cursor for uncompressed files to ensure constant memory (R-02)
267        self.scan_stream(Cursor::new(&mmap[..]), path, regex, options)
268    }
269}