1use crate::decompress::maybe_decompress;
6use crate::error::Result;
7use crate::executor::{Match, QueryOptions};
8use crate::format::is_binary;
9use ignore::WalkBuilder;
10use memmap2::Mmap;
11use rayon::prelude::*;
12use regex::Regex;
13use std::fs::File;
14use std::io::{BufRead, BufReader, Cursor, Read};
15use std::path::{Path, PathBuf};
16use std::sync::atomic::{AtomicU32, Ordering};
17
/// Searches the files below a root directory by walking the tree and
/// matching each file line by line against a compiled pattern.
pub struct Scanner {
    /// Directory the file walk starts from.
    root: PathBuf,
}
25
26impl Scanner {
27 #[must_use]
29 pub fn new(root: &Path) -> Self {
30 Self {
31 root: root.to_owned(),
32 }
33 }
34
35 #[allow(clippy::too_many_lines)]
42 pub fn scan(
43 &self,
44 pattern: &str,
45 is_regex: bool,
46 ignore_case: bool,
47 options: &QueryOptions,
48 ) -> Result<Vec<Match>> {
49 let raw = if is_regex {
50 pattern.to_string()
51 } else {
52 regex::escape(pattern)
53 };
54 let regex_pat = if ignore_case {
55 format!("(?i){raw}")
56 } else {
57 raw
58 };
59 let regex = Regex::new(®ex_pat)?;
60
61 let walker = WalkBuilder::new(&self.root)
62 .hidden(false)
63 .git_ignore(true)
64 .require_git(false)
65 .add_custom_ignore_filename(".ixignore")
66 .filter_entry(move |entry| {
67 let path = entry.path();
68 let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
69
70 if entry.file_type().is_some_and(|t| t.is_dir())
72 && (name == "lost+found"
73 || name == ".git"
74 || name == "node_modules"
75 || name == "target"
76 || name == "__pycache__"
77 || name == ".tox"
78 || name == ".venv"
79 || name == "venv"
80 || name == ".ix")
81 {
82 return false;
83 }
84
85 if entry.file_type().is_some_and(|t| t.is_file()) {
87 if let Ok(metadata) = entry.metadata()
88 && metadata.len() > 10 * 1024 * 1024
89 {
90 return false;
91 }
92 if name == "Cargo.lock"
93 || name == "package-lock.json"
94 || name == "pnpm-lock.yaml"
95 || name == "shard.ix"
96 || name == "shard.ix.tmp"
97 {
98 return false;
99 }
100 }
101
102 if entry.file_type().is_some_and(|t| t.is_file()) {
104 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
105 match ext {
106 "so" | "o" | "dylib" | "a" | "dll" | "exe" | "pyc" |
108 "jpg" | "png" | "gif" | "mp4" | "mp3" | "pdf" |
110 "zip" | "7z" | "rar" |
112 "sqlite" | "db" | "bin" => return false,
114 _ => {}
115 }
116 if name.ends_with(".tar.gz") {
117 return false;
118 }
119 }
120 true
121 })
122 .build();
123
124 let paths: Vec<PathBuf> = walker
125 .filter_map(|result| match result {
126 Ok(entry) => Some(entry),
127 Err(e) => {
128 eprintln!("ix: warning: scanner skipping path: {e}");
129 None
130 }
131 })
132 .filter(|entry| entry.file_type().is_some_and(|t| t.is_file()))
133 .map(|entry| entry.path().to_owned())
134 .collect();
135
136 let matches_found = AtomicU32::new(0);
137 let mut matches: Vec<Match> = paths
138 .into_par_iter()
139 .filter_map(|path| {
140 if options.max_results > 0
141 && matches_found.load(Ordering::Relaxed) >= u32::try_from(options.max_results).unwrap_or(0)
142 {
143 return None;
144 }
145
146 if !options.type_filter.is_empty() {
148 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
149 if !options.type_filter.iter().any(|e: &String| e == ext) {
150 return None;
151 }
152 }
153
154 if options.archive {
156 #[cfg(feature = "archive")]
157 {
158 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
159 let is_tar_gz = path
160 .to_str()
161 .is_some_and(|s| s.ends_with(".tar.gz"));
162 if ext == "zip"
163 && let Ok(archive_matches) =
164 crate::archive::scan_zip(&path, ®ex, options)
165 {
166 matches_found
167 .fetch_add(u32::try_from(archive_matches.len()).unwrap_or(0), Ordering::Relaxed);
168 return Some(archive_matches);
169 }
170 if is_tar_gz
171 && let Ok(archive_matches) =
172 crate::archive::scan_tar_gz(&path, ®ex, options)
173 {
174 matches_found
175 .fetch_add(u32::try_from(archive_matches.len()).unwrap_or(0), Ordering::Relaxed);
176 return Some(archive_matches);
177 }
178 }
179 }
180
181 let file_matches = Self::scan_file(&path, ®ex, options).ok()?;
182 matches_found.fetch_add(u32::try_from(file_matches.len()).unwrap_or(0), Ordering::Relaxed);
183 Some(file_matches)
184 })
185 .flatten()
186 .collect();
187
188 if options.max_results > 0 && matches.len() > options.max_results {
189 matches.truncate(options.max_results);
190 }
191
192 Ok(matches)
193 }
194
195 #[allow(clippy::too_many_lines)]
196 fn scan_stream<R: Read>(
197 reader: R,
198 path: &Path,
199 regex: &Regex,
200 options: &QueryOptions,
201 ) -> Result<Vec<Match>> {
202 let mut buf_reader = BufReader::new(reader);
203 let mut matches = Vec::new();
204 let mut line_number = 0u32;
205 let mut byte_offset = 0u64;
206
207 {
209 let buffer = buf_reader.fill_buf()?;
210 if buffer.is_empty() {
211 return Ok(vec![]);
212 }
213 let is_bin = is_binary(buffer);
214 if is_bin && !options.binary {
215 return Ok(vec![]);
216 }
217 }
218
219 let mut line = String::new();
220 let mut context_before = std::collections::VecDeque::new();
221 let mut pending_matches: Vec<Match> = Vec::new();
222
223 while buf_reader.read_line(&mut line)? > 0 {
224 line_number += 1;
225 let line_len = u64::try_from(line.len()).unwrap_or(0);
226 let trimmed_line = line.trim_end().to_string();
227
228 for m in &mut pending_matches {
230 if m.context_after.len() < options.context_lines {
231 m.context_after.push(trimmed_line.clone());
232 }
233 }
234
235 let (completed, still_pending): (Vec<_>, Vec<_>) = pending_matches
237 .into_iter()
238 .partition(|m| m.context_after.len() >= options.context_lines);
239 matches.extend(completed);
240 pending_matches = still_pending;
241
242 if let Some(m) = regex.find(&line) {
243 let context_before_vec: Vec<String> = context_before
244 .iter()
245 .map(|s: &String| s.trim_end().to_string())
246 .collect();
247
248 let new_match = Match {
249 file_path: path.to_owned(),
250 line_number,
251 col: u32::try_from(m.start() + 1).unwrap_or(0),
252 line_content: if options.count_only {
253 String::new()
254 } else {
255 trimmed_line.clone()
256 },
257 byte_offset: byte_offset + u64::try_from(m.start()).unwrap_or(0),
258 context_before: context_before_vec,
259 context_after: vec![],
260 is_binary: false,
261 };
262
263 if options.context_lines > 0 {
264 pending_matches.push(new_match);
265 } else {
266 matches.push(new_match);
267 }
268
269 if options.max_results > 0
270 && (matches.len() + pending_matches.len()) >= options.max_results
271 && (pending_matches.is_empty() || matches.len() >= options.max_results)
272 {
273 break;
274 }
275 }
276
277 if options.context_lines > 0 {
278 context_before.push_back(line.clone());
279 if context_before.len() > options.context_lines {
280 context_before.pop_front();
281 }
282 }
283
284 byte_offset += line_len;
285 line.clear();
286 }
287
288 matches.extend(pending_matches);
289 Ok(matches)
290 }
291
292 fn scan_file(path: &Path, regex: &Regex, options: &QueryOptions) -> Result<Vec<Match>> {
293 let file = File::open(path)?;
294 let metadata = file.metadata()?;
295 if metadata.len() > 100 * 1024 * 1024 && !options.decompress {
296 return Ok(vec![]);
298 }
299
300 let mmap = unsafe { Mmap::map(&file)? };
301
302 if options.decompress
303 && let Some(reader) = maybe_decompress(path, &mmap)?
304 {
305 return Self::scan_stream(reader, path, regex, options);
306 }
307
308 Self::scan_stream(Cursor::new(&mmap[..]), path, regex, options)
310 }
311}