television_utils/
files.rs

1use rustc_hash::FxHashSet;
2use std::fmt::Debug;
3use std::fs::File;
4use std::io::BufRead;
5use std::io::BufReader;
6use std::io::Read;
7use std::path::Path;
8use std::path::PathBuf;
9
10use ignore::{overrides::Override, types::TypesBuilder, WalkBuilder};
11use lazy_static::lazy_static;
12use tracing::{debug, warn};
13
14use crate::strings::{
15    proportion_of_printable_ascii_characters, PRINTABLE_ASCII_THRESHOLD,
16};
17use crate::threads::default_num_threads;
18
/// Payload of [`ReadResult::Partial`]: what `read_into_lines_capped` collected
/// when more than `max_bytes` bytes were read.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PartialReadResult {
    /// Collected lines, each with trailing whitespace (including the line
    /// terminator) removed.
    pub lines: Vec<String>,
    /// Total number of bytes consumed for the collected lines.
    pub bytes_read: usize,
}

/// Outcome of `read_into_lines_capped`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ReadResult {
    /// The byte budget was exceeded; carries the lines gathered so far.
    Partial(PartialReadResult),
    /// Every line of the input, read within the byte budget rules.
    Full(Vec<String>),
    /// Reading failed; carries the `Debug` rendering of the I/O error.
    Error(String),
}
29
30pub fn read_into_lines_capped<R>(r: R, max_bytes: usize) -> ReadResult
31where
32    R: Read,
33{
34    let mut buf_reader = BufReader::new(r);
35    let mut line = String::new();
36    let mut lines = Vec::new();
37    let mut bytes_read = 0;
38
39    loop {
40        line.clear();
41        match buf_reader.read_line(&mut line) {
42            Ok(0) => break,
43            Ok(_) => {
44                if bytes_read > max_bytes {
45                    break;
46                }
47                lines.push(line.trim_end().to_string());
48                bytes_read += line.len();
49            }
50            Err(e) => {
51                warn!("Error reading file: {:?}", e);
52                return ReadResult::Error(format!("{e:?}"));
53            }
54        }
55    }
56
57    if bytes_read > max_bytes {
58        ReadResult::Partial(PartialReadResult { lines, bytes_read })
59    } else {
60        ReadResult::Full(lines)
61    }
62}
63
64lazy_static::lazy_static! {
65    pub static ref DEFAULT_NUM_THREADS: usize = default_num_threads().into();
66}
67
68pub fn walk_builder(
69    path: &Path,
70    n_threads: usize,
71    overrides: Option<Override>,
72    ignore_paths: Option<Vec<PathBuf>>,
73) -> WalkBuilder {
74    let mut builder = WalkBuilder::new(path);
75
76    // ft-based filtering
77    let mut types_builder = TypesBuilder::new();
78    types_builder.add_defaults();
79    builder.types(types_builder.build().unwrap());
80
81    // ignore paths
82    if let Some(paths) = ignore_paths {
83        builder.filter_entry(move |e| {
84            let path = e.path();
85            if paths.iter().any(|p| path.starts_with(p)) {
86                debug!("Ignoring path: {:?}", path);
87                return false;
88            }
89            true
90        });
91    }
92
93    builder.threads(n_threads);
94    if let Some(ov) = overrides {
95        builder.overrides(ov);
96    }
97    builder
98}
99
/// Returns the length in bytes reported by the filesystem metadata of `path`,
/// or `None` when the metadata cannot be queried (e.g. the path does not
/// exist).
pub fn get_file_size(path: &Path) -> Option<u64> {
    match std::fs::metadata(path) {
        Ok(metadata) => Some(metadata.len()),
        Err(_) => None,
    }
}
103
/// Coarse classification of a file's content.
///
/// Being a fieldless enum, it is cheap to copy and can be compared directly
/// instead of being pattern-matched at every use site.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FileType {
    /// The file is considered to contain text.
    Text,
    /// The file was not recognised as text.
    Other,
    /// Not produced by the `From<P>` conversion in this file; presumably
    /// reserved for files that have not been classified — TODO confirm.
    Unknown,
}
110
111impl<P> From<P> for FileType
112where
113    P: AsRef<Path> + Debug,
114{
115    fn from(path: P) -> Self {
116        debug!("Getting file type for {:?}", path);
117        let p = path.as_ref();
118        if is_known_text_extension(p) {
119            return FileType::Text;
120        }
121        if let Ok(mut f) = File::open(p) {
122            let mut buffer = [0u8; 256];
123            if let Ok(bytes_read) = f.read(&mut buffer) {
124                if bytes_read > 0
125                    && proportion_of_printable_ascii_characters(
126                        &buffer[..bytes_read],
127                    ) > PRINTABLE_ASCII_THRESHOLD
128                {
129                    return FileType::Text;
130                }
131            }
132        } else {
133            warn!("Error opening file: {:?}", path);
134        }
135        FileType::Other
136    }
137}
138
139pub fn is_known_text_extension<P>(path: P) -> bool
140where
141    P: AsRef<Path>,
142{
143    path.as_ref()
144        .extension()
145        .and_then(|ext| ext.to_str())
146        .is_some_and(|ext| KNOWN_TEXT_FILE_EXTENSIONS.contains(ext))
147}
148
149lazy_static! {
150    static ref KNOWN_TEXT_FILE_EXTENSIONS: FxHashSet<&'static str> = [
151        "ada",
152        "adb",
153        "ads",
154        "applescript",
155        "as",
156        "asc",
157        "ascii",
158        "ascx",
159        "asm",
160        "asmx",
161        "asp",
162        "aspx",
163        "atom",
164        "au3",
165        "awk",
166        "bas",
167        "bash",
168        "bashrc",
169        "bat",
170        "bbcolors",
171        "bcp",
172        "bdsgroup",
173        "bdsproj",
174        "bib",
175        "bowerrc",
176        "c",
177        "cbl",
178        "cc",
179        "cfc",
180        "cfg",
181        "cfm",
182        "cfml",
183        "cgi",
184        "cjs",
185        "clj",
186        "cljs",
187        "cls",
188        "cmake",
189        "cmd",
190        "cnf",
191        "cob",
192        "code-snippets",
193        "coffee",
194        "coffeekup",
195        "conf",
196        "cp",
197        "cpp",
198        "cpt",
199        "cpy",
200        "crt",
201        "cs",
202        "csh",
203        "cson",
204        "csproj",
205        "csr",
206        "css",
207        "csslintrc",
208        "csv",
209        "ctl",
210        "curlrc",
211        "cxx",
212        "d",
213        "dart",
214        "dfm",
215        "diff",
216        "dof",
217        "dpk",
218        "dpr",
219        "dproj",
220        "dtd",
221        "eco",
222        "editorconfig",
223        "ejs",
224        "el",
225        "elm",
226        "emacs",
227        "eml",
228        "ent",
229        "erb",
230        "erl",
231        "eslintignore",
232        "eslintrc",
233        "ex",
234        "exs",
235        "f",
236        "f03",
237        "f77",
238        "f90",
239        "f95",
240        "fish",
241        "for",
242        "fpp",
243        "frm",
244        "fs",
245        "fsproj",
246        "fsx",
247        "ftn",
248        "gemrc",
249        "gemspec",
250        "gitattributes",
251        "gitconfig",
252        "gitignore",
253        "gitkeep",
254        "gitmodules",
255        "go",
256        "gpp",
257        "gradle",
258        "graphql",
259        "groovy",
260        "groupproj",
261        "grunit",
262        "gtmpl",
263        "gvimrc",
264        "h",
265        "haml",
266        "hbs",
267        "hgignore",
268        "hh",
269        "hpp",
270        "hrl",
271        "hs",
272        "hta",
273        "htaccess",
274        "htc",
275        "htm",
276        "html",
277        "htpasswd",
278        "hxx",
279        "iced",
280        "iml",
281        "inc",
282        "inf",
283        "info",
284        "ini",
285        "ino",
286        "int",
287        "irbrc",
288        "itcl",
289        "itermcolors",
290        "itk",
291        "jade",
292        "java",
293        "jhtm",
294        "jhtml",
295        "js",
296        "jscsrc",
297        "jshintignore",
298        "jshintrc",
299        "json",
300        "json5",
301        "jsonld",
302        "jsp",
303        "jspx",
304        "jsx",
305        "ksh",
306        "less",
307        "lhs",
308        "lisp",
309        "log",
310        "ls",
311        "lsp",
312        "lua",
313        "m",
314        "m4",
315        "mak",
316        "map",
317        "markdown",
318        "master",
319        "md",
320        "mdown",
321        "mdwn",
322        "mdx",
323        "metadata",
324        "mht",
325        "mhtml",
326        "mjs",
327        "mk",
328        "mkd",
329        "mkdn",
330        "mkdown",
331        "ml",
332        "mli",
333        "mm",
334        "mxml",
335        "nfm",
336        "nfo",
337        "noon",
338        "npmignore",
339        "npmrc",
340        "nuspec",
341        "nvmrc",
342        "ops",
343        "pas",
344        "pasm",
345        "patch",
346        "pbxproj",
347        "pch",
348        "pem",
349        "pg",
350        "php",
351        "php3",
352        "php4",
353        "php5",
354        "phpt",
355        "phtml",
356        "pir",
357        "pl",
358        "pm",
359        "pmc",
360        "pod",
361        "pot",
362        "prettierrc",
363        "properties",
364        "props",
365        "pt",
366        "pug",
367        "purs",
368        "py",
369        "pyx",
370        "r",
371        "rake",
372        "rb",
373        "rbw",
374        "rc",
375        "rdoc",
376        "rdoc_options",
377        "resx",
378        "rexx",
379        "rhtml",
380        "rjs",
381        "rlib",
382        "ron",
383        "rs",
384        "rss",
385        "rst",
386        "rtf",
387        "rvmrc",
388        "rxml",
389        "s",
390        "sass",
391        "scala",
392        "scm",
393        "scss",
394        "seestyle",
395        "sh",
396        "shtml",
397        "sln",
398        "sls",
399        "spec",
400        "sql",
401        "sqlite",
402        "sqlproj",
403        "srt",
404        "ss",
405        "sss",
406        "st",
407        "strings",
408        "sty",
409        "styl",
410        "stylus",
411        "sub",
412        "sublime-build",
413        "sublime-commands",
414        "sublime-completions",
415        "sublime-keymap",
416        "sublime-macro",
417        "sublime-menu",
418        "sublime-project",
419        "sublime-settings",
420        "sublime-workspace",
421        "sv",
422        "svc",
423        "svg",
424        "swift",
425        "t",
426        "tcl",
427        "tcsh",
428        "terminal",
429        "tex",
430        "text",
431        "textile",
432        "tg",
433        "tk",
434        "tmLanguage",
435        "tmpl",
436        "tmTheme",
437        "toml",
438        "tpl",
439        "ts",
440        "tsv",
441        "tsx",
442        "tt",
443        "tt2",
444        "ttml",
445        "twig",
446        "txt",
447        "v",
448        "vb",
449        "vbproj",
450        "vbs",
451        "vcproj",
452        "vcxproj",
453        "vh",
454        "vhd",
455        "vhdl",
456        "vim",
457        "viminfo",
458        "vimrc",
459        "vm",
460        "vue",
461        "webapp",
462        "webmanifest",
463        "wsc",
464        "x-php",
465        "xaml",
466        "xht",
467        "xhtml",
468        "xml",
469        "xs",
470        "xsd",
471        "xsl",
472        "xslt",
473        "y",
474        "yaml",
475        "yml",
476        "zsh",
477        "zshrc",
478    ]
479    .iter()
480    .copied()
481    .collect();
482}