1use rustc_hash::FxHashSet;
2use std::fmt::Debug;
3use std::fs::File;
4use std::io::BufRead;
5use std::io::BufReader;
6use std::io::Read;
7use std::path::Path;
8use std::path::PathBuf;
9
10use ignore::{overrides::Override, types::TypesBuilder, WalkBuilder};
11use lazy_static::lazy_static;
12use tracing::{debug, warn};
13
14use crate::strings::{
15 proportion_of_printable_ascii_characters, PRINTABLE_ASCII_THRESHOLD,
16};
17use crate::threads::default_num_threads;
18
/// Lines collected by [`read_into_lines_capped`] when the byte cap was exceeded.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PartialReadResult {
    /// The lines that were kept, each with trailing whitespace removed.
    pub lines: Vec<String>,
    /// Total size in bytes of the kept lines, measured before trimming.
    pub bytes_read: usize,
}

/// Outcome of [`read_into_lines_capped`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ReadResult {
    /// The number of bytes read went over the requested cap.
    Partial(PartialReadResult),
    /// The input was read to its end without going over the cap.
    Full(Vec<String>),
    /// Reading failed; the payload is the `Debug` rendering of the I/O error.
    Error(String),
}
29
30pub fn read_into_lines_capped<R>(r: R, max_bytes: usize) -> ReadResult
31where
32 R: Read,
33{
34 let mut buf_reader = BufReader::new(r);
35 let mut line = String::new();
36 let mut lines = Vec::new();
37 let mut bytes_read = 0;
38
39 loop {
40 line.clear();
41 match buf_reader.read_line(&mut line) {
42 Ok(0) => break,
43 Ok(_) => {
44 if bytes_read > max_bytes {
45 break;
46 }
47 lines.push(line.trim_end().to_string());
48 bytes_read += line.len();
49 }
50 Err(e) => {
51 warn!("Error reading file: {:?}", e);
52 return ReadResult::Error(format!("{e:?}"));
53 }
54 }
55 }
56
57 if bytes_read > max_bytes {
58 ReadResult::Partial(PartialReadResult { lines, bytes_read })
59 } else {
60 ReadResult::Full(lines)
61 }
62}
63
64lazy_static::lazy_static! {
65 pub static ref DEFAULT_NUM_THREADS: usize = default_num_threads().into();
66}
67
68pub fn walk_builder(
69 path: &Path,
70 n_threads: usize,
71 overrides: Option<Override>,
72 ignore_paths: Option<Vec<PathBuf>>,
73) -> WalkBuilder {
74 let mut builder = WalkBuilder::new(path);
75
76 let mut types_builder = TypesBuilder::new();
78 types_builder.add_defaults();
79 builder.types(types_builder.build().unwrap());
80
81 if let Some(paths) = ignore_paths {
83 builder.filter_entry(move |e| {
84 let path = e.path();
85 if paths.iter().any(|p| path.starts_with(p)) {
86 debug!("Ignoring path: {:?}", path);
87 return false;
88 }
89 true
90 });
91 }
92
93 builder.threads(n_threads);
94 if let Some(ov) = overrides {
95 builder.overrides(ov);
96 }
97 builder
98}
99
/// Returns the length in bytes reported by the metadata of `path`, or `None`
/// when the metadata cannot be read (for instance if the path does not exist).
pub fn get_file_size(path: &Path) -> Option<u64> {
    match std::fs::metadata(path) {
        Ok(meta) => Some(meta.len()),
        Err(_) => None,
    }
}
103
/// Coarse classification of a file's content.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FileType {
    /// The file is treated as text: either its extension is a known text
    /// extension or its leading bytes are mostly printable ASCII.
    Text,
    /// The file did not qualify as text; this is also the result when the
    /// file could not be opened or read.
    Other,
    /// Never produced by the `From<P>` conversion in this file.
    /// NOTE(review): presumably a placeholder for "not determined yet" used
    /// by callers; confirm against usage elsewhere.
    Unknown,
}
110
111impl<P> From<P> for FileType
112where
113 P: AsRef<Path> + Debug,
114{
115 fn from(path: P) -> Self {
116 debug!("Getting file type for {:?}", path);
117 let p = path.as_ref();
118 if is_known_text_extension(p) {
119 return FileType::Text;
120 }
121 if let Ok(mut f) = File::open(p) {
122 let mut buffer = [0u8; 256];
123 if let Ok(bytes_read) = f.read(&mut buffer) {
124 if bytes_read > 0
125 && proportion_of_printable_ascii_characters(
126 &buffer[..bytes_read],
127 ) > PRINTABLE_ASCII_THRESHOLD
128 {
129 return FileType::Text;
130 }
131 }
132 } else {
133 warn!("Error opening file: {:?}", path);
134 }
135 FileType::Other
136 }
137}
138
139pub fn is_known_text_extension<P>(path: P) -> bool
140where
141 P: AsRef<Path>,
142{
143 path.as_ref()
144 .extension()
145 .and_then(|ext| ext.to_str())
146 .is_some_and(|ext| KNOWN_TEXT_FILE_EXTENSIONS.contains(ext))
147}
148
149lazy_static! {
150 static ref KNOWN_TEXT_FILE_EXTENSIONS: FxHashSet<&'static str> = [
151 "ada",
152 "adb",
153 "ads",
154 "applescript",
155 "as",
156 "asc",
157 "ascii",
158 "ascx",
159 "asm",
160 "asmx",
161 "asp",
162 "aspx",
163 "atom",
164 "au3",
165 "awk",
166 "bas",
167 "bash",
168 "bashrc",
169 "bat",
170 "bbcolors",
171 "bcp",
172 "bdsgroup",
173 "bdsproj",
174 "bib",
175 "bowerrc",
176 "c",
177 "cbl",
178 "cc",
179 "cfc",
180 "cfg",
181 "cfm",
182 "cfml",
183 "cgi",
184 "cjs",
185 "clj",
186 "cljs",
187 "cls",
188 "cmake",
189 "cmd",
190 "cnf",
191 "cob",
192 "code-snippets",
193 "coffee",
194 "coffeekup",
195 "conf",
196 "cp",
197 "cpp",
198 "cpt",
199 "cpy",
200 "crt",
201 "cs",
202 "csh",
203 "cson",
204 "csproj",
205 "csr",
206 "css",
207 "csslintrc",
208 "csv",
209 "ctl",
210 "curlrc",
211 "cxx",
212 "d",
213 "dart",
214 "dfm",
215 "diff",
216 "dof",
217 "dpk",
218 "dpr",
219 "dproj",
220 "dtd",
221 "eco",
222 "editorconfig",
223 "ejs",
224 "el",
225 "elm",
226 "emacs",
227 "eml",
228 "ent",
229 "erb",
230 "erl",
231 "eslintignore",
232 "eslintrc",
233 "ex",
234 "exs",
235 "f",
236 "f03",
237 "f77",
238 "f90",
239 "f95",
240 "fish",
241 "for",
242 "fpp",
243 "frm",
244 "fs",
245 "fsproj",
246 "fsx",
247 "ftn",
248 "gemrc",
249 "gemspec",
250 "gitattributes",
251 "gitconfig",
252 "gitignore",
253 "gitkeep",
254 "gitmodules",
255 "go",
256 "gpp",
257 "gradle",
258 "graphql",
259 "groovy",
260 "groupproj",
261 "grunit",
262 "gtmpl",
263 "gvimrc",
264 "h",
265 "haml",
266 "hbs",
267 "hgignore",
268 "hh",
269 "hpp",
270 "hrl",
271 "hs",
272 "hta",
273 "htaccess",
274 "htc",
275 "htm",
276 "html",
277 "htpasswd",
278 "hxx",
279 "iced",
280 "iml",
281 "inc",
282 "inf",
283 "info",
284 "ini",
285 "ino",
286 "int",
287 "irbrc",
288 "itcl",
289 "itermcolors",
290 "itk",
291 "jade",
292 "java",
293 "jhtm",
294 "jhtml",
295 "js",
296 "jscsrc",
297 "jshintignore",
298 "jshintrc",
299 "json",
300 "json5",
301 "jsonld",
302 "jsp",
303 "jspx",
304 "jsx",
305 "ksh",
306 "less",
307 "lhs",
308 "lisp",
309 "log",
310 "ls",
311 "lsp",
312 "lua",
313 "m",
314 "m4",
315 "mak",
316 "map",
317 "markdown",
318 "master",
319 "md",
320 "mdown",
321 "mdwn",
322 "mdx",
323 "metadata",
324 "mht",
325 "mhtml",
326 "mjs",
327 "mk",
328 "mkd",
329 "mkdn",
330 "mkdown",
331 "ml",
332 "mli",
333 "mm",
334 "mxml",
335 "nfm",
336 "nfo",
337 "noon",
338 "npmignore",
339 "npmrc",
340 "nuspec",
341 "nvmrc",
342 "ops",
343 "pas",
344 "pasm",
345 "patch",
346 "pbxproj",
347 "pch",
348 "pem",
349 "pg",
350 "php",
351 "php3",
352 "php4",
353 "php5",
354 "phpt",
355 "phtml",
356 "pir",
357 "pl",
358 "pm",
359 "pmc",
360 "pod",
361 "pot",
362 "prettierrc",
363 "properties",
364 "props",
365 "pt",
366 "pug",
367 "purs",
368 "py",
369 "pyx",
370 "r",
371 "rake",
372 "rb",
373 "rbw",
374 "rc",
375 "rdoc",
376 "rdoc_options",
377 "resx",
378 "rexx",
379 "rhtml",
380 "rjs",
381 "rlib",
382 "ron",
383 "rs",
384 "rss",
385 "rst",
386 "rtf",
387 "rvmrc",
388 "rxml",
389 "s",
390 "sass",
391 "scala",
392 "scm",
393 "scss",
394 "seestyle",
395 "sh",
396 "shtml",
397 "sln",
398 "sls",
399 "spec",
400 "sql",
401 "sqlite",
402 "sqlproj",
403 "srt",
404 "ss",
405 "sss",
406 "st",
407 "strings",
408 "sty",
409 "styl",
410 "stylus",
411 "sub",
412 "sublime-build",
413 "sublime-commands",
414 "sublime-completions",
415 "sublime-keymap",
416 "sublime-macro",
417 "sublime-menu",
418 "sublime-project",
419 "sublime-settings",
420 "sublime-workspace",
421 "sv",
422 "svc",
423 "svg",
424 "swift",
425 "t",
426 "tcl",
427 "tcsh",
428 "terminal",
429 "tex",
430 "text",
431 "textile",
432 "tg",
433 "tk",
434 "tmLanguage",
435 "tmpl",
436 "tmTheme",
437 "toml",
438 "tpl",
439 "ts",
440 "tsv",
441 "tsx",
442 "tt",
443 "tt2",
444 "ttml",
445 "twig",
446 "txt",
447 "v",
448 "vb",
449 "vbproj",
450 "vbs",
451 "vcproj",
452 "vcxproj",
453 "vh",
454 "vhd",
455 "vhdl",
456 "vim",
457 "viminfo",
458 "vimrc",
459 "vm",
460 "vue",
461 "webapp",
462 "webmanifest",
463 "wsc",
464 "x-php",
465 "xaml",
466 "xht",
467 "xhtml",
468 "xml",
469 "xs",
470 "xsd",
471 "xsl",
472 "xslt",
473 "y",
474 "yaml",
475 "yml",
476 "zsh",
477 "zshrc",
478 ]
479 .iter()
480 .copied()
481 .collect();
482}