big_code_analysis/
tools.rs

1// Per-language metric and AST modules deliberately consume the macro-
2// generated tree-sitter token enums via `use crate::*` and `use Foo::*`
3// inside match expressions — explicit imports would list dozens of
4// variants per arm and obscure the per-language token sets that are the
5// point of these files. Allowed at the module level rather than per
6// function so the per-language impl blocks stay readable.
7#![allow(clippy::wildcard_imports, clippy::enum_glob_use)]
8// Metric counts (token, function, branch, argument, etc.) are stored as
9// `usize` and crossed with `f64` averages, ratios, and Halstead scores
10// across the cyclomatic / MI / Halstead computations. The `usize as f64`
11// and `f64 as usize` casts are intentional and snapshot-anchored — every
12// site is bounded by the count it came from. Allowing the lints at the
13// module level keeps the metric arithmetic legible.
14#![allow(
15    clippy::cast_precision_loss,
16    clippy::cast_possible_truncation,
17    clippy::cast_sign_loss
18)]
19
20use std::cmp::Ordering;
21use std::collections::HashMap;
22use std::fs::{self, File};
23use std::io::{Read, Write};
24use std::path::{Component, Path, PathBuf};
25use std::sync::OnceLock;
26
27use regex::bytes::Regex;
28use termcolor::{Color, ColorSpec, StandardStreamLock, WriteColor};
29
30use crate::langs::fake;
31use crate::langs::*;
32
33/// Reads a file, normalising all CR-only and CRLF line endings to LF.
34///
35/// **Note for downstream consumers**: the returned buffer never contains `\r`
36/// bytes. Callers that previously observed raw `\r\n` sequences will see plain
37/// `\n` after this call. This is intentional — the metric engine requires LF-
38/// only input — but it is a behavioural difference from a plain `fs::read`.
39///
40/// # Errors
41///
42/// Returns any [`std::io::Error`] surfaced by [`File::open`] (the
43/// path is missing, lacks read permission, is a directory, …) or by
44/// [`File::read_to_end`] while reading the file contents.
45///
46/// # Examples
47///
48/// ```
49/// use std::path::Path;
50///
51/// use big_code_analysis::read_file;
52///
53/// let path = Path::new("Cargo.toml");
54/// read_file(&path).unwrap();
55/// ```
56pub fn read_file(path: &Path) -> std::io::Result<Vec<u8>> {
57    let mut file = File::open(path)?;
58    let mut data = Vec::new();
59    file.read_to_end(&mut data)?;
60
61    normalize_line_endings(&mut data);
62
63    Ok(data)
64}
65
66/// Reads a file, normalising all CR-only and CRLF line endings to LF, and ensures
67/// the buffer ends with exactly one `\n`. Returns `None` for files ≤ 3 bytes or
68/// files that appear to be non-UTF-8.
69///
70/// # Errors
71///
72/// Returns any [`std::io::Error`] surfaced by [`File::open`] (the
73/// path is missing, lacks read permission, is a directory, …) or by
74/// the subsequent reads from the open file handle. A non-UTF-8 head
75/// or a too-small file is reported via `Ok(None)`, not an error.
76///
77/// # Examples
78///
79/// ```
80/// use std::path::Path;
81///
82/// use big_code_analysis::read_file_with_eol;
83///
84/// let path = Path::new("Cargo.toml");
85/// read_file_with_eol(&path).unwrap();
86/// ```
87pub fn read_file_with_eol(path: &Path) -> std::io::Result<Option<Vec<u8>>> {
88    let file_size = fs::metadata(path).map_or(1024 * 1024, |m| m.len() as usize);
89    if file_size <= 3 {
90        // this file is very likely almost empty... so nothing to do on it
91        return Ok(None);
92    }
93
94    let mut file = File::open(path)?;
95
96    let mut start = vec![0; 64.min(file_size)];
97    let start = if file.read_exact(&mut start).is_ok() {
98        // Skip the bom if one
99        if start[..2] == [b'\xFE', b'\xFF'] || start[..2] == [b'\xFF', b'\xFE'] {
100            &start[2..]
101        } else if start[..3] == [b'\xEF', b'\xBB', b'\xBF'] {
102            &start[3..]
103        } else {
104            &start
105        }
106    } else {
107        return Ok(None);
108    };
109
110    // so start contains more or less 64 chars
111    let mut head = String::from_utf8_lossy(start).into_owned();
112    // The last char could be wrong because we were in the middle of an utf-8 sequence
113    head.pop();
114    // now check if there is an invalid char
115    if head.contains('\u{FFFD}') {
116        return Ok(None);
117    }
118
119    let mut data = Vec::with_capacity(file_size + 2);
120    data.extend_from_slice(start);
121
122    file.read_to_end(&mut data)?;
123
124    normalize_line_endings(&mut data);
125
126    Ok(Some(data))
127}
128
/// Creates (or truncates) the file at `path` and writes `data` to it.
///
/// # Errors
///
/// Returns any [`std::io::Error`] surfaced by [`File::create`]
/// (parent directory missing, lacks write permission, target is a
/// directory, …) or by [`File::write_all`] while writing the buffer.
///
/// # Examples
///
/// ```no_run
/// use std::path::Path;
///
/// use big_code_analysis::write_file;
///
/// let path = Path::new("foo.txt");
/// let data: [u8; 4] = [0; 4];
/// write_file(&path, &data).unwrap();
/// ```
pub fn write_file(path: &Path, data: &[u8]) -> std::io::Result<()> {
    File::create(path).and_then(|mut out| out.write_all(data))
}
154
155/// Detects the language of a code using
156/// the extension of a file.
157///
158/// # Examples
159///
160/// ```
161/// use std::path::Path;
162///
163/// use big_code_analysis::get_language_for_file;
164///
165/// let path = Path::new("build.rs");
166/// get_language_for_file(&path).unwrap();
167/// ```
168#[must_use]
169pub fn get_language_for_file(path: &Path) -> Option<LANG> {
170    if let Some(ext) = path.extension() {
171        let ext = ext.to_str()?.to_lowercase();
172        get_from_ext(&ext)
173    } else {
174        None
175    }
176}
177
/// Decodes a captured mode / filetype name as UTF-8 and lowercases it.
/// Returns `None` when the bytes are not valid UTF-8.
fn mode_to_str(mode: &[u8]) -> Option<String> {
    let text = std::str::from_utf8(mode).ok()?;
    Some(text.to_lowercase())
}
181
// Editor modeline comments (Emacs `-*- … -*-`, Vim `vim: … ft=…`) carry
// language hints that `get_emacs_mode` extracts. Each regex is compiled
// lazily on first use and cached in its `OnceLock`.
static RE1_EMACS: OnceLock<Regex> = OnceLock::new();
static RE2_EMACS: OnceLock<Regex> = OnceLock::new();
static RE1_VIM: OnceLock<Regex> = OnceLock::new();
static RE_GENERATED: OnceLock<Regex> = OnceLock::new();

// Regular expressions. In each modeline pattern, capture group 1 is the
// mode / filetype name:
//
// - FIRST_EMACS_EXPRESSION  — `-*- … mode: NAME`, case-insensitive; `mode`
//                             must follow a character that is neither `-`
//                             nor a word character.
// - SECOND_EMACS_EXPRESSION — the short form `-*- NAME -*-`.
// - VIM_EXPRESSION          — `vim: … ft=NAME`, case-insensitive; `ft` must
//                             follow a non-word character.
const FIRST_EMACS_EXPRESSION: &str = r"(?i)-\*-.*[^-\w]mode\s*:\s*([^:;\s]+)";
const SECOND_EMACS_EXPRESSION: &str = r"-\*-\s*([^:;\s]+)\s*-\*-";
const VIM_EXPRESSION: &str = r"(?i)vim\s*:.*[^\w]ft\s*=\s*([^:\s]+)";

// Generated-code marker patterns. Matched against the leading window of the
// file (see `is_generated`) so a marker phrase deep in the body does not
// trigger a skip. Each alternative covers a widely-used convention:
//
// - `@generated`      — Facebook / Meta convention, also used by buck2,
//                       rustfmt, prettier, and many code generators.
// - `DO NOT EDIT`     — Go's `Code generated ... DO NOT EDIT.` line is
//                       canonical, but the bare phrase appears in Bazel,
//                       protoc, OpenAPI clients, etc. — match either.
// - `GENERATED CODE`  — Lizard's marker; preserved for compatibility with
//                       projects that already tag generated files this way.
const GENERATED_EXPRESSION: &str = r"(?i)@generated\b|DO NOT EDIT|GENERATED CODE";

/// Bytes from the start of the file scanned for a generated-code marker.
/// 5 KiB is enough to cover any reasonable file header (license + autogen
/// preamble) without paying a meaningful read cost.
const GENERATED_SCAN_BYTES: usize = 5 * 1024;
/// Maximum lines scanned for a generated-code marker. Caps the work on a
/// pathological "all-on-one-line" file.
const GENERATED_SCAN_LINES: usize = 50;
213
214/// Returns `true` when `buf` looks like generated code: its leading window
215/// (first ~50 lines or first 5 KiB, whichever is smaller) contains a known
216/// marker phrase. Matching is case-insensitive for the marker and never
217/// allocates on the negative path.
218///
219/// Recognized markers:
220///
221/// - `@generated` — Facebook / Meta convention, also used by buck2,
222///   rustfmt, and prettier.
223/// - `DO NOT EDIT` — Go's `Code generated by ... DO NOT EDIT.` is the
224///   canonical form; the bare phrase is also widely copied.
225/// - `GENERATED CODE` — Lizard's marker, preserved for compatibility.
226///
227/// Detection runs against raw bytes before parsing, so callers can discard
228/// generated files without paying tree-sitter parse cost. Non-UTF-8 input
229/// will not panic — `regex::bytes::Regex` operates on the raw byte slice.
230///
231/// # Examples
232///
233/// ```
234/// use big_code_analysis::is_generated;
235///
236/// assert!(is_generated(b"// @generated\nfn x() {}\n"));
237/// assert!(is_generated(
238///     b"// Code generated by protoc. DO NOT EDIT.\npackage x\n",
239/// ));
240/// assert!(!is_generated(b"fn main() { /* not generated */ }\n"));
241/// ```
242///
243/// # Panics
244///
245/// Panics if the embedded marker regex set fails to build; the marker
246/// list is a static literal so this represents a compile-time bug, not
247/// a runtime input that can be handled.
248pub fn is_generated(buf: &[u8]) -> bool {
249    // Strip a leading UTF-8 BOM so a marker on the first line of a
250    // BOM-prefixed file still matches against the line start. UTF-16 BOMs
251    // are not handled: the byte-pattern regex cannot match the
252    // interleaved-zero encoding (`@\x00g\x00...`) that follows a UTF-16
253    // BOM, so a strip would not enable detection — it would only obscure
254    // the fact that UTF-16 source files are unsupported here.
255    let buf = buf.strip_prefix(b"\xEF\xBB\xBF").unwrap_or(buf);
256
257    // Bound the search window: at most GENERATED_SCAN_BYTES bytes, and
258    // among those, stop after GENERATED_SCAN_LINES newlines. Scanning fewer
259    // lines avoids matching a marker phrase deep in the file body (the
260    // negative case in the issue's acceptance criteria).
261    let cap = buf.len().min(GENERATED_SCAN_BYTES);
262    let end = buf[..cap]
263        .iter()
264        .enumerate()
265        .filter_map(|(i, &b)| (b == b'\n').then_some(i + 1))
266        .nth(GENERATED_SCAN_LINES - 1)
267        .unwrap_or(cap);
268    let window = &buf[..end];
269
270    RE_GENERATED
271        .get_or_init(|| {
272            Regex::new(GENERATED_EXPRESSION).expect("GENERATED_EXPRESSION is a constant regex")
273        })
274        .is_match(window)
275}
276
277#[inline]
278fn get_regex<'a>(
279    once_lock: &OnceLock<Regex>,
280    line: &'a [u8],
281    regex: &'a str,
282) -> Option<regex::bytes::Captures<'a>> {
283    once_lock
284        .get_or_init(|| Regex::new(regex).unwrap())
285        .captures_iter(line)
286        .next()
287}
288
289/// Resolves a language from a script's shebang line.
290///
291/// Returns `None` unless `buf` starts with `#!`. Reads up to the first `\n`,
292/// strips an optional trailing `\r`, splits on whitespace, and takes the
293/// basename of either the first token or — when that basename is `env` — the
294/// next non-flag token. Trailing version digits and dots (`python3`,
295/// `lua5.1`, `perl5.36`) are stripped before lookup. Non-UTF-8 bytes on the
296/// shebang line yield `None` (no panic).
297fn get_shebang_lang(buf: &[u8]) -> Option<LANG> {
298    // Early-out for the common case (any non-shebang buffer): no allocation,
299    // no UTF-8 decoding.
300    let rest = buf.strip_prefix(b"#!")?;
301    let line_end = rest.iter().position(|&b| b == b'\n').unwrap_or(rest.len());
302    let line = &rest[..line_end];
303    // Trim a trailing CR even though normalize_line_endings should have removed
304    // it — guess_language is on the public API and may be called with raw input.
305    let line = line.strip_suffix(b"\r").unwrap_or(line);
306    let line = std::str::from_utf8(line).ok()?;
307
308    let mut tokens = line.split_ascii_whitespace();
309    let first_base = basename(tokens.next()?);
310
311    let interpreter = if first_base == "env" {
312        skip_env_args(&mut tokens)?
313    } else {
314        first_base
315    };
316
317    get_from_interpreter(strip_version_suffix(interpreter))
318}
319
// Walk past leading `env` arguments (`-FLAG`, `-u VAR`, `NAME=value`) and
// return the basename of the actual interpreter token, or `None` when the
// tokens run out first.
//
// Options that take their argument as a separate token must swallow it,
// otherwise that argument is mistaken for the interpreter
// (`env -C /tmp python` would yield `tmp`). Those are `-u` / `--unset`,
// `-C` / `--chdir` (GNU, FreeBSD) and `-P` (BSD / macOS alternate search
// path). Every other dash-prefixed token stands alone or carries its value
// inline (`-uVAR`, `--unset=VAR`). `-S` must not swallow anything: after
// whitespace splitting, the token that follows it is the command itself
// (`-S node --foo`).
fn skip_env_args<'a>(tokens: &mut std::str::SplitAsciiWhitespace<'a>) -> Option<&'a str> {
    loop {
        let tok = tokens.next()?;
        if let Some(flag) = tok.strip_prefix('-') {
            // `flag` still carries the second dash of long options.
            if matches!(flag, "u" | "C" | "P" | "-unset" | "-chdir") {
                tokens.next()?;
            }
            continue;
        }
        // `NAME=value` environment assignments precede the command.
        if tok.contains('=') {
            continue;
        }
        return Some(basename(tok));
    }
}

/// Returns the part of `path` after its last `/`, or all of `path` when it
/// contains no `/`.
fn basename(path: &str) -> &str {
    path.rsplit_once('/').map_or(path, |(_, name)| name)
}
343
/// Removes the trailing run of ASCII digits and dots that encodes an
/// interpreter version (`python3` → `python`, `lua5.1` → `lua`,
/// `perl5.36` → `perl`). A name made up only of such characters is
/// returned unchanged rather than reduced to an empty string.
fn strip_version_suffix(name: &str) -> &str {
    let is_version_char = |c: char| c == '.' || c.is_ascii_digit();
    match name.trim_end_matches(is_version_char) {
        "" => name,
        stem => stem,
    }
}
350
351fn get_from_interpreter(name: &str) -> Option<LANG> {
352    match name {
353        "sh" | "bash" | "dash" | "ksh" | "zsh" => Some(LANG::Bash),
354        "python" => Some(LANG::Python),
355        "perl" => Some(LANG::Perl),
356        "lua" | "luajit" => Some(LANG::Lua),
357        "php" | "php-cgi" => Some(LANG::Php),
358        "node" | "nodejs" => Some(LANG::Javascript),
359        "tclsh" | "wish" => Some(LANG::Tcl),
360        "ruby" => Some(LANG::Ruby),
361        "elixir" | "iex" => Some(LANG::Elixir),
362        _ => None,
363    }
364}
365
366fn get_emacs_mode(buf: &[u8]) -> Option<String> {
367    // we just try to use the emacs info (if there)
368    for (i, line) in buf.splitn(5, |c| *c == b'\n').enumerate() {
369        if let Some(cap) = get_regex(&RE1_EMACS, line, FIRST_EMACS_EXPRESSION) {
370            return mode_to_str(&cap[1]);
371        } else if let Some(cap) = get_regex(&RE2_EMACS, line, SECOND_EMACS_EXPRESSION) {
372            return mode_to_str(&cap[1]);
373        } else if let Some(cap) = get_regex(&RE1_VIM, line, VIM_EXPRESSION) {
374            return mode_to_str(&cap[1]);
375        }
376        if i == 3 {
377            break;
378        }
379    }
380
381    for (i, line) in buf.rsplitn(5, |c| *c == b'\n').enumerate() {
382        if let Some(cap) = get_regex(&RE1_VIM, line, VIM_EXPRESSION) {
383            return mode_to_str(&cap[1]);
384        }
385        if i == 3 {
386            break;
387        }
388    }
389
390    None
391}
392
393/// Guesses the language of a code.
394///
395/// Returns a tuple containing a [`LANG`] as first argument
396/// and the language name as a second one.
397///
398/// # Examples
399///
400/// ```
401/// use std::path::PathBuf;
402///
403/// use big_code_analysis::guess_language;
404///
405/// let source_code = "int a = 42;";
406///
407/// // The path to a dummy file used to contain the source code
408/// let path = PathBuf::from("foo.c");
409/// let source_slice = source_code.as_bytes();
410///
411/// // Guess the language of a code
412/// guess_language(&source_slice, &path);
413/// ```
414///
415/// [`LANG`]: enum.LANG.html
416pub fn guess_language<'a, P: AsRef<Path>>(buf: &[u8], path: P) -> (Option<LANG>, &'a str) {
417    let ext = path
418        .as_ref()
419        .extension()
420        .and_then(|e| e.to_str())
421        .map(str::to_lowercase)
422        .unwrap_or_default();
423    let from_ext = get_from_ext(&ext);
424
425    let mode = get_emacs_mode(buf).unwrap_or_default();
426
427    let from_mode = get_from_emacs_mode(&mode);
428
429    if let Some(lang_ext) = from_ext {
430        if let Some(lang_mode) = from_mode {
431            if lang_ext == lang_mode {
432                (
433                    Some(lang_mode),
434                    fake::get_true(&ext, &mode).unwrap_or_else(|| lang_mode.get_name()),
435                )
436            } else {
437                // we should probably rely on extension here
438                (Some(lang_ext), lang_ext.get_name())
439            }
440        } else {
441            (
442                Some(lang_ext),
443                fake::get_true(&ext, &mode).unwrap_or_else(|| lang_ext.get_name()),
444            )
445        }
446    } else if let Some(lang_mode) = from_mode {
447        (
448            Some(lang_mode),
449            fake::get_true(&ext, &mode).unwrap_or_else(|| lang_mode.get_name()),
450        )
451    } else if let Some(lang_shebang) = get_shebang_lang(buf) {
452        (
453            Some(lang_shebang),
454            fake::get_true(&ext, &mode).unwrap_or_else(|| lang_shebang.get_name()),
455        )
456    } else {
457        (None, fake::get_true(&ext, &mode).unwrap_or_default())
458    }
459}
460
/// Rewrites every CRLF pair and every lone CR in `data` to a single LF, then
/// strips all trailing LFs and appends exactly one. An empty buffer
/// therefore becomes `"\n"`.
pub(crate) fn normalize_line_endings(data: &mut Vec<u8>) {
    // Single in-place pass: a CR is turned into LF, and an LF that directly
    // follows a CR is dropped because the pair has already produced its LF.
    let mut prev_was_cr = false;
    data.retain_mut(|byte| {
        let after_cr = std::mem::replace(&mut prev_was_cr, *byte == b'\r');
        match *byte {
            b'\r' => {
                *byte = b'\n';
                true
            }
            b'\n' => !after_cr,
            _ => true,
        }
    });

    while data.last() == Some(&b'\n') {
        data.pop();
    }
    data.push(b'\n');
}
487
/// Lexically normalises `path` without touching the filesystem: `.`
/// components are dropped and each `..` removes the component before it.
/// A `..` with nothing left to remove (at the start of a relative path, or
/// directly under the root) is silently discarded.
pub(crate) fn normalize_path<P: AsRef<Path>>(path: P) -> PathBuf {
    // Copied from Cargo sources: https://github.com/rust-lang/cargo/blob/master/src/cargo/util/paths.rs#L65
    let mut parts = path.as_ref().components().peekable();

    // A prefix (e.g. a Windows drive) can only be the first component; seed
    // the result with it so later pushes stay relative to it.
    let mut normalized = match parts.peek().copied() {
        Some(prefix @ Component::Prefix(..)) => {
            parts.next();
            PathBuf::from(prefix.as_os_str())
        }
        _ => PathBuf::new(),
    };

    for part in parts {
        match part {
            Component::Prefix(..) => unreachable!(),
            Component::CurDir => {}
            Component::ParentDir => {
                normalized.pop();
            }
            Component::RootDir | Component::Normal(_) => normalized.push(part.as_os_str()),
        }
    }
    normalized
}
515
/// Measures how far apart two paths are: the number of components each has
/// below their deepest shared, non-empty ancestor, added together. Returns
/// `None` when they share no such ancestor (e.g. two unrelated relative
/// paths).
pub(crate) fn get_paths_dist(path1: &Path, path2: &Path) -> Option<usize> {
    // `ancestors` runs from `path1` itself outwards, so the first hit is the
    // deepest shared directory.
    let shared = path1
        .ancestors()
        .find(|dir| path2.starts_with(dir) && !dir.as_os_str().is_empty())?;
    let depth_below = |full: &Path| full.strip_prefix(shared).unwrap().components().count();
    Some(depth_below(path1) + depth_below(path2))
}
526
527pub(crate) fn guess_file<S: ::std::hash::BuildHasher>(
528    current_path: &Path,
529    include_path: &str,
530    all_files: &HashMap<String, Vec<PathBuf>, S>,
531) -> Vec<PathBuf> {
532    let include_path = include_path
533        .strip_prefix("mozilla/")
534        .unwrap_or(include_path);
535
536    // Resolve the include relative to the including file's parent
537    // before normalizing. This preserves leading `..` traversal so
538    // `#include "../foo.h"` from `src/lib/file.c` targets
539    // `src/foo.h`, not the lexically-popped `foo.h` (issue #297).
540    // Lexical-only normalization is required because `current_path`
541    // and the entries in `all_files` are typically not canonicalized
542    // and the included header need not exist on disk yet.
543    let resolved_path = current_path
544        .parent()
545        .map(|parent| normalize_path(parent.join(include_path)));
546
547    let include_path = normalize_path(include_path);
548    let Some(file_name) = include_path.file_name() else {
549        return vec![];
550    };
551    let Some(file_name) = file_name.to_str() else {
552        return vec![];
553    };
554    if let Some(possibilities) = all_files.get(file_name) {
555        if possibilities.len() == 1 {
556            // Only one file with this name
557            return possibilities.clone();
558        }
559
560        // Strongest signal: a candidate matches the fully resolved
561        // relative target. Prefer exact equality, then suffix match
562        // (so absolute `all_files` entries still match a relative
563        // resolved target like `src/foo.h`).
564        if let Some(resolved) = resolved_path.as_ref() {
565            fn unique_match<F: Fn(&PathBuf) -> bool>(
566                possibilities: &[PathBuf],
567                current_path: &Path,
568                pred: F,
569            ) -> Option<Vec<PathBuf>> {
570                let matched: Vec<PathBuf> = possibilities
571                    .iter()
572                    .filter(|p| current_path != p.as_path() && pred(p))
573                    .cloned()
574                    .collect();
575                (matched.len() == 1).then_some(matched)
576            }
577            if let Some(hit) = unique_match(possibilities, current_path, |p| p == resolved) {
578                return hit;
579            }
580            if let Some(hit) = unique_match(possibilities, current_path, |p| p.ends_with(resolved))
581            {
582                return hit;
583            }
584        }
585
586        let mut new_possibilities = Vec::new();
587        for p in possibilities {
588            if p.ends_with(&include_path) && current_path != p {
589                new_possibilities.push(p.clone());
590            }
591        }
592        if new_possibilities.len() == 1 {
593            // Only one path is finishing with "foo/Bar.h"
594            return new_possibilities;
595        }
596        new_possibilities.clear();
597
598        if let Some(parent) = current_path.parent() {
599            for p in possibilities {
600                if p.starts_with(parent) && current_path != p {
601                    new_possibilities.push(p.clone());
602                }
603            }
604            if new_possibilities.len() == 1 {
605                // Only one path in the current working directory (current_path)
606                return new_possibilities;
607            }
608            new_possibilities.clear();
609        }
610
611        let mut dist_min = usize::MAX;
612        let mut path_min = Vec::new();
613        for p in possibilities {
614            if current_path == p {
615                continue;
616            }
617            if let Some(dist) = get_paths_dist(current_path, p) {
618                match dist.cmp(&dist_min) {
619                    Ordering::Less => {
620                        dist_min = dist;
621                        path_min.clear();
622                        path_min.push(p);
623                    }
624                    Ordering::Equal => {
625                        path_min.push(p);
626                    }
627                    Ordering::Greater => {}
628                }
629            }
630        }
631
632        let path_min: Vec<_> = path_min.drain(..).cloned().collect();
633        return path_min;
634    }
635
636    vec![]
637}
638
639#[inline]
640pub(crate) fn color(stdout: &mut StandardStreamLock, color: Color) -> std::io::Result<()> {
641    stdout.set_color(ColorSpec::new().set_fg(Some(color)))
642}
643
644#[inline]
645pub(crate) fn intense_color(stdout: &mut StandardStreamLock, color: Color) -> std::io::Result<()> {
646    stdout.set_color(ColorSpec::new().set_fg(Some(color)).set_intense(true))
647}
648
/// Test helper: parses `source` as if it were the file `filename` with
/// parser `T`, computes its metrics and hands the resulting top-level
/// `FuncSpace` to `check`.
///
/// Before parsing, CRLF and lone CR are rewritten to LF, trailing whitespace
/// and leading / trailing newlines are trimmed, and a single `\n` is
/// appended. Panics if the metrics computation fails.
#[cfg(test)]
pub(crate) fn check_func_space<T: crate::ParserTrait, F: Fn(crate::FuncSpace)>(
    source: &str,
    filename: &str,
    check: F,
) {
    let path = std::path::PathBuf::from(filename);

    // Same CRLF/CR handling that read_file_with_eol gets from
    // normalize_line_endings.
    let unix_source = source.replace("\r\n", "\n").replace('\r', "\n");
    let body = unix_source.trim_end().trim_matches('\n');
    let mut code = Vec::from(body.as_bytes());
    code.push(b'\n');

    let parser = T::new(code, &path, None);
    #[allow(deprecated)]
    let func_space = crate::metrics(&parser, &path).unwrap();

    check(func_space);
}
666
/// Test helper: like `check_func_space`, but passes only the top-level
/// space's `metrics` to `check`.
#[cfg(test)]
pub(crate) fn check_metrics<T: crate::ParserTrait>(
    source: &str,
    filename: &str,
    check: fn(crate::CodeMetrics) -> (),
) {
    let on_space = |space: crate::FuncSpace| check(space.metrics);
    check_func_space::<T, _>(source, filename, on_space);
}
675
/// Asserts that `func_space` has a direct child space named `name` whose
/// `kind` equals `expected`; panics otherwise.
///
/// Intended for annotation-type / class / interface tests that must verify
/// the structure of the FuncSpace tree itself: metric-only assertions can
/// pass vacuously even when `is_func_space` no longer recognises the node
/// kind under test.
#[cfg(test)]
pub(crate) fn assert_child_space_kind(
    func_space: &crate::FuncSpace,
    name: &str,
    expected: crate::SpaceKind,
) {
    let wanted = Some(name);
    let Some(child) = func_space
        .spaces
        .iter()
        .find(|space| space.name.as_deref() == wanted)
    else {
        panic!("expected a child FuncSpace named {name:?}");
    };
    assert_eq!(
        child.kind, expected,
        "child FuncSpace {name:?} kind: got {:?}, expected {:?}",
        child.kind, expected,
    );
}
700
701#[cfg(test)]
702#[allow(
703    clippy::float_cmp,
704    clippy::cast_precision_loss,
705    clippy::cast_possible_truncation,
706    clippy::cast_sign_loss,
707    clippy::similar_names,
708    clippy::doc_markdown,
709    clippy::needless_raw_string_hashes,
710    clippy::too_many_lines
711)]
712mod tests {
713    use pretty_assertions::assert_eq;
714
715    use super::*;
716
717    #[test]
718    fn test_read() {
719        let tmp_dir = std::env::temp_dir();
720        let tmp_path = tmp_dir.join("test_read");
721        let data = vec![
722            (b"\xFF\xFEabc".to_vec(), Some(b"abc\n".to_vec())),
723            (b"\xFE\xFFabc".to_vec(), Some(b"abc\n".to_vec())),
724            (b"\xEF\xBB\xBFabc".to_vec(), Some(b"abc\n".to_vec())),
725            (b"\xEF\xBB\xBFabc\n".to_vec(), Some(b"abc\n".to_vec())),
726            (b"\xEF\xBBabc\n".to_vec(), None),
727            (b"abcdef\n".to_vec(), Some(b"abcdef\n".to_vec())),
728            (b"abcdef".to_vec(), Some(b"abcdef\n".to_vec())),
729            // CRLF throughout should be normalised to LF
730            (b"abc\r\ndef\r\n".to_vec(), Some(b"abc\ndef\n".to_vec())),
731            // UTF-8 BOM + CRLF
732            (
733                b"\xEF\xBB\xBFabc\r\ndef\r\n".to_vec(),
734                Some(b"abc\ndef\n".to_vec()),
735            ),
736        ];
737        for (d, expected) in data {
738            write_file(&tmp_path, &d).unwrap();
739            let res = read_file_with_eol(&tmp_path).unwrap();
740            assert_eq!(res, expected);
741        }
742    }
743
744    #[cfg(unix)]
745    #[test]
746    fn test_get_language_for_file_non_utf8() {
747        use std::ffi::OsStr;
748        use std::os::unix::ffi::OsStrExt;
749
750        let path = Path::new(OsStr::from_bytes(b"foo.\xff"));
751        assert_eq!(get_language_for_file(path), None);
752    }
753
754    #[cfg(unix)]
755    #[test]
756    fn test_guess_language_non_utf8() {
757        use std::ffi::OsStr;
758        use std::os::unix::ffi::OsStrExt;
759        use std::path::PathBuf;
760
761        let path = PathBuf::from(OsStr::from_bytes(b"foo.\xff"));
762        let (lang, _name) = guess_language(b"int a = 42;", &path);
763        assert_eq!(lang, None);
764    }
765
766    #[test]
767    fn test_guess_file_no_file_name() {
768        let all_files: HashMap<String, Vec<PathBuf>> = HashMap::new();
769        let current = Path::new("/some/file.c");
770        let result = guess_file(current, "..", &all_files);
771        assert!(result.is_empty());
772    }
773
774    /// Regression for issue #297: `#include "../foo.h"` from
775    /// `src/lib/file.c` must resolve to `src/foo.h`, not the
776    /// same-directory `src/lib/foo.h` that the prior lexical
777    /// `normalize_path` collapse left as the closest match.
778    #[test]
779    fn guess_file_parent_dir_include_resolves_to_sibling() {
780        let mut all_files: HashMap<String, Vec<PathBuf>> = HashMap::new();
781        all_files.insert(
782            "foo.h".to_string(),
783            vec![
784                PathBuf::from("/proj/src/foo.h"),
785                PathBuf::from("/proj/src/lib/foo.h"),
786            ],
787        );
788        let current = Path::new("/proj/src/lib/file.c");
789        let result = guess_file(current, "../foo.h", &all_files);
790        assert_eq!(result, vec![PathBuf::from("/proj/src/foo.h")]);
791    }
792
793    /// `../inc/foo.h` from `src/lib/file.c` must resolve to
794    /// `src/inc/foo.h`, not some other `inc/foo.h` deeper in the
795    /// tree.
796    #[test]
797    fn guess_file_parent_subdir_include_resolves_to_correct_inc() {
798        let mut all_files: HashMap<String, Vec<PathBuf>> = HashMap::new();
799        all_files.insert(
800            "foo.h".to_string(),
801            vec![
802                PathBuf::from("/proj/src/inc/foo.h"),
803                PathBuf::from("/proj/src/lib/inc/foo.h"),
804                PathBuf::from("/proj/other/inc/foo.h"),
805            ],
806        );
807        let current = Path::new("/proj/src/lib/file.c");
808        let result = guess_file(current, "../inc/foo.h", &all_files);
809        assert_eq!(result, vec![PathBuf::from("/proj/src/inc/foo.h")]);
810    }
811
812    /// A plain `foo.h` include from `src/lib/file.c` must keep the
813    /// existing same-directory preference and resolve to
814    /// `src/lib/foo.h`.
815    #[test]
816    fn guess_file_plain_include_keeps_same_directory_preference() {
817        let mut all_files: HashMap<String, Vec<PathBuf>> = HashMap::new();
818        all_files.insert(
819            "foo.h".to_string(),
820            vec![
821                PathBuf::from("/proj/src/foo.h"),
822                PathBuf::from("/proj/src/lib/foo.h"),
823            ],
824        );
825        let current = Path::new("/proj/src/lib/file.c");
826        let result = guess_file(current, "foo.h", &all_files);
827        assert_eq!(result, vec![PathBuf::from("/proj/src/lib/foo.h")]);
828    }
829
830    /// A `./foo.h` include from `src/lib/file.c` must still resolve
831    /// to the same-directory `src/lib/foo.h` (CurDir segments are
832    /// collapsed before joining).
833    #[test]
834    fn guess_file_curdir_include_resolves_to_same_directory() {
835        let mut all_files: HashMap<String, Vec<PathBuf>> = HashMap::new();
836        all_files.insert(
837            "foo.h".to_string(),
838            vec![
839                PathBuf::from("/proj/src/foo.h"),
840                PathBuf::from("/proj/src/lib/foo.h"),
841            ],
842        );
843        let current = Path::new("/proj/src/lib/file.c");
844        let result = guess_file(current, "./foo.h", &all_files);
845        assert_eq!(result, vec![PathBuf::from("/proj/src/lib/foo.h")]);
846    }
847
848    /// `../../foo.h` from `src/a/b/file.c` must resolve up two
849    /// levels to `src/foo.h`, not be lexically collapsed.
850    #[test]
851    fn guess_file_double_parent_include_resolves_two_levels_up() {
852        let mut all_files: HashMap<String, Vec<PathBuf>> = HashMap::new();
853        all_files.insert(
854            "foo.h".to_string(),
855            vec![
856                PathBuf::from("/proj/src/foo.h"),
857                PathBuf::from("/proj/src/a/foo.h"),
858                PathBuf::from("/proj/src/a/b/foo.h"),
859            ],
860        );
861        let current = Path::new("/proj/src/a/b/file.c");
862        let result = guess_file(current, "../../foo.h", &all_files);
863        assert_eq!(result, vec![PathBuf::from("/proj/src/foo.h")]);
864    }
865
866    /// When the relative target does not match any candidate
867    /// exactly, the existing basename / same-directory / distance
868    /// fallback chain still applies. With a single candidate, that
869    /// candidate is returned even if its path differs from the
870    /// resolved target.
871    #[test]
872    fn guess_file_unique_basename_returns_only_candidate() {
873        let mut all_files: HashMap<String, Vec<PathBuf>> = HashMap::new();
874        all_files.insert(
875            "foo.h".to_string(),
876            vec![PathBuf::from("/proj/src/lib/foo.h")],
877        );
878        let current = Path::new("/proj/src/lib/file.c");
879        // Resolved target would be `/proj/foo.h`, which does not
880        // exist; the unique-basename short-circuit still wins.
881        let result = guess_file(current, "../../foo.h", &all_files);
882        assert_eq!(result, vec![PathBuf::from("/proj/src/lib/foo.h")]);
883    }
884
885    /// The `mozilla/` prefix strip must still apply, so
886    /// `#include "mozilla/foo.h"` from `src/lib/file.c` resolves
887    /// the same way a bare `foo.h` would.
888    #[test]
889    fn guess_file_mozilla_prefix_is_stripped_before_resolution() {
890        let mut all_files: HashMap<String, Vec<PathBuf>> = HashMap::new();
891        all_files.insert(
892            "foo.h".to_string(),
893            vec![
894                PathBuf::from("/proj/src/foo.h"),
895                PathBuf::from("/proj/src/lib/foo.h"),
896            ],
897        );
898        let current = Path::new("/proj/src/lib/file.c");
899        let result = guess_file(current, "mozilla/foo.h", &all_files);
900        assert_eq!(result, vec![PathBuf::from("/proj/src/lib/foo.h")]);
901    }
902
903    #[test]
904    fn test_guess_language() {
905        let buf = b"// -*- foo: bar; mode: c++; hello: world\n";
906        assert_eq!(guess_language(buf, "foo.cpp"), (Some(LANG::Cpp), "c/c++"));
907
908        let buf = b"// -*- c++ -*-\n";
909        assert_eq!(guess_language(buf, "foo.cpp"), (Some(LANG::Cpp), "c/c++"));
910
911        let buf = b"// -*- foo: bar; bar-mode: c++; hello: world\n";
912        assert_eq!(
913            guess_language(buf, "foo.py"),
914            (Some(LANG::Python), "python")
915        );
916
917        let buf = b"/* hello world */\n";
918        assert_eq!(guess_language(buf, "foo.cpp"), (Some(LANG::Cpp), "c/c++"));
919
920        let buf = b"\n\n\n\n\n\n\n\n\n// vim: set ts=4 ft=c++\n\n\n";
921        assert_eq!(guess_language(buf, "foo.c"), (Some(LANG::Cpp), "c/c++"));
922
923        let buf = b"\n\n\n\n\n\n\n\n\n\n\n\n";
924        assert_eq!(guess_language(buf, "foo.txt"), (None, ""));
925
926        let buf = b"// -*- foo: bar; mode: Objective-C++; hello: world\n";
927        assert_eq!(
928            guess_language(buf, "foo.mm"),
929            (Some(LANG::Cpp), "obj-c/c++")
930        );
931    }
932
933    #[test]
934    fn shebang_bare_bash() {
935        assert_eq!(get_shebang_lang(b"#!/bin/bash\n"), Some(LANG::Bash));
936    }
937
938    #[test]
939    fn shebang_env_python3() {
940        assert_eq!(
941            get_shebang_lang(b"#!/usr/bin/env python3\n"),
942            Some(LANG::Python),
943        );
944    }
945
946    #[test]
947    fn shebang_versioned_perl_with_flag() {
948        assert_eq!(
949            get_shebang_lang(b"#!/usr/bin/perl5.36 -w\n"),
950            Some(LANG::Perl),
951        );
952    }
953
954    #[test]
955    fn shebang_env_dash_s_node() {
956        assert_eq!(
957            get_shebang_lang(b"#!/usr/bin/env -S node --experimental\n"),
958            Some(LANG::Javascript),
959        );
960    }
961
962    #[test]
963    fn shebang_env_with_var_assignment() {
964        // `env FOO=bar python3` — skip the assignment, find the interpreter.
965        assert_eq!(
966            get_shebang_lang(b"#!/usr/bin/env FOO=bar python3\n"),
967            Some(LANG::Python),
968        );
969    }
970
971    #[test]
972    fn shebang_env_dash_u_consumes_next_token() {
973        // `env -u VAR python3` — `-u` is the only `env` short flag that
974        // consumes a following argument (the variable name to unset). Without
975        // the special case, `VAR` would be misidentified as the interpreter.
976        assert_eq!(
977            get_shebang_lang(b"#!/usr/bin/env -u VAR python3\n"),
978            Some(LANG::Python),
979        );
980    }
981
982    #[test]
983    fn shebang_versioned_lua() {
984        assert_eq!(get_shebang_lang(b"#!/usr/bin/lua5.1\n"), Some(LANG::Lua));
985    }
986
987    #[test]
988    fn shebang_node() {
989        assert_eq!(
990            get_shebang_lang(b"#!/usr/local/bin/node\n"),
991            Some(LANG::Javascript),
992        );
993    }
994
995    #[test]
996    fn shebang_tclsh() {
997        assert_eq!(get_shebang_lang(b"#!/usr/bin/tclsh\n"), Some(LANG::Tcl));
998    }
999
1000    #[test]
1001    fn shebang_no_trailing_newline() {
1002        assert_eq!(get_shebang_lang(b"#!/bin/sh"), Some(LANG::Bash));
1003    }
1004
1005    #[test]
1006    fn shebang_crlf_line_ending() {
1007        // guess_language usually receives LF-normalised input, but be defensive.
1008        assert_eq!(get_shebang_lang(b"#!/bin/bash\r\n"), Some(LANG::Bash));
1009    }
1010
1011    #[test]
1012    fn shebang_empty_buffer() {
1013        assert_eq!(get_shebang_lang(b""), None);
1014    }
1015
1016    #[test]
1017    fn shebang_single_byte() {
1018        assert_eq!(get_shebang_lang(b"#"), None);
1019    }
1020
1021    #[test]
1022    fn shebang_no_shebang_prefix() {
1023        assert_eq!(get_shebang_lang(b"// not a shebang\n"), None);
1024    }
1025
1026    #[test]
1027    fn shebang_unknown_interpreter() {
1028        // `ocaml` is a real interpreter the project does not target —
1029        // a stable sentinel for the "shebang names an interpreter
1030        // outside the supported set" case (independent of which
1031        // languages the workspace happens to recognise today).
1032        assert_eq!(get_shebang_lang(b"#!/usr/bin/ocaml\n"), None);
1033    }
1034
1035    #[test]
1036    fn shebang_env_only_no_interpreter() {
1037        assert_eq!(get_shebang_lang(b"#!/usr/bin/env\n"), None);
1038    }
1039
1040    #[test]
1041    fn shebang_non_utf8_returns_none() {
1042        // Invalid UTF-8 on the shebang line must not panic.
1043        assert_eq!(get_shebang_lang(b"#!/usr/bin/\xff\xfe\n"), None);
1044    }
1045
1046    #[test]
1047    fn guess_language_extension_wins_over_shebang() {
1048        // The .py extension must outrank a `#!/bin/sh` shebang.
1049        let buf = b"#!/bin/sh\nprint('hi')\n";
1050        assert_eq!(
1051            guess_language(buf, "foo.py"),
1052            (Some(LANG::Python), "python")
1053        );
1054    }
1055
1056    #[test]
1057    fn guess_language_shebang_falls_through_when_no_extension() {
1058        let buf = b"#!/usr/bin/env python3\nprint('hi')\n";
1059        assert_eq!(guess_language(buf, "run"), (Some(LANG::Python), "python"));
1060    }
1061
1062    #[test]
1063    fn guess_language_shebang_detects_ruby_without_extension() {
1064        // Gem executables under `bin/` are extensionless Ruby scripts
1065        // identified solely by their `#!/usr/bin/env ruby` shebang.
1066        let buf = b"#!/usr/bin/env ruby\nputs 'hi'\n";
1067        assert_eq!(guess_language(buf, "run"), (Some(LANG::Ruby), "ruby"));
1068    }
1069
1070    #[test]
1071    fn guess_language_shebang_detects_elixir_without_extension() {
1072        // Extensionless Elixir scripts (`#!/usr/bin/env elixir`) must be
1073        // identified by their shebang alone — regression for #186.
1074        let buf = b"#!/usr/bin/env elixir\nIO.puts(\"hi\")\n";
1075        assert_eq!(guess_language(buf, "run"), (Some(LANG::Elixir), "elixir"));
1076    }
1077
1078    #[test]
1079    fn guess_language_shebang_detects_iex_without_extension() {
1080        // `iex` is Elixir's interactive shell; scripts that drive it via
1081        // `#!/usr/bin/env iex` should also map to Elixir.
1082        let buf = b"#!/usr/bin/env iex\nIO.puts(\"hi\")\n";
1083        assert_eq!(guess_language(buf, "run"), (Some(LANG::Elixir), "elixir"));
1084    }
1085
1086    #[test]
1087    fn guess_language_shebang_loses_to_mode_line() {
1088        // Mode line outranks the shebang.
1089        let buf = b"#!/usr/bin/env node\n# -*- mode: python -*-\n";
1090        assert_eq!(guess_language(buf, "run"), (Some(LANG::Python), "python"));
1091    }
1092
1093    #[test]
1094    fn normalize_line_endings_normalizes_crlf() {
1095        let mut d = b"code\r\n# comment\r\n".to_vec();
1096        normalize_line_endings(&mut d);
1097        assert_eq!(d, b"code\n# comment\n");
1098    }
1099
1100    #[test]
1101    fn normalize_line_endings_normalizes_lone_cr() {
1102        let mut d = b"code\r# comment\r".to_vec();
1103        normalize_line_endings(&mut d);
1104        assert_eq!(d, b"code\n# comment\n");
1105    }
1106
1107    #[test]
1108    fn normalize_line_endings_normalizes_cr_before_crlf() {
1109        // lone CR followed immediately by CRLF → two separate line breaks
1110        let mut d = b"a\r\r\nb".to_vec();
1111        normalize_line_endings(&mut d);
1112        assert_eq!(d, b"a\n\nb\n");
1113    }
1114
1115    #[test]
1116    fn normalize_line_endings_normalizes_crlf_blank_line() {
1117        let mut d = b"a\r\n\r\nb\r\n".to_vec();
1118        normalize_line_endings(&mut d);
1119        assert_eq!(d, b"a\n\nb\n");
1120    }
1121
1122    #[test]
1123    fn normalize_line_endings_empty_buffer() {
1124        let mut d = b"".to_vec();
1125        normalize_line_endings(&mut d);
1126        assert_eq!(d, b"\n");
1127    }
1128
1129    #[test]
1130    fn is_generated_at_generated_top() {
1131        assert!(is_generated(b"// @generated\nfn x() {}\n"));
1132    }
1133
1134    #[test]
1135    fn is_generated_go_do_not_edit() {
1136        assert!(is_generated(
1137            b"// Code generated by protoc. DO NOT EDIT.\npackage x\n",
1138        ));
1139    }
1140
1141    #[test]
1142    fn is_generated_lizard_marker() {
1143        assert!(is_generated(b"# GENERATED CODE\nprint('x')\n"));
1144    }
1145
1146    #[test]
1147    fn is_generated_python_do_not_edit() {
1148        assert!(is_generated(b"# DO NOT EDIT\nprint('x')\n"));
1149    }
1150
1151    #[test]
1152    fn is_generated_case_insensitive_marker() {
1153        assert!(is_generated(b"// @GENERATED\nfn x() {}\n"));
1154    }
1155
1156    #[test]
1157    fn is_generated_marker_only_in_body_is_false() {
1158        // Marker phrase appearing well past the scan window must not trigger.
1159        let mut buf = Vec::with_capacity(8 * 1024);
1160        for i in 0..200 {
1161            buf.extend_from_slice(format!("// line {i}\n").as_bytes());
1162        }
1163        buf.extend_from_slice(b"// @generated  -- but this is line 200+\n");
1164        assert!(!is_generated(&buf));
1165    }
1166
1167    #[test]
1168    fn is_generated_empty_file_is_false() {
1169        assert!(!is_generated(b""));
1170    }
1171
1172    #[test]
1173    fn is_generated_non_utf8_does_not_panic() {
1174        // Non-UTF-8 garbage with no ASCII-marker substring: every byte is
1175        // 0x80..=0xFF (continuation / invalid in UTF-8 lead positions), so
1176        // it cannot contain `@generated`, `DO NOT EDIT`, or `GENERATED CODE`
1177        // as a byte sequence. Verifies both no-panic and the negative
1178        // result.
1179        let buf: Vec<u8> = (0x80u8..=0xFFu8).cycle().take(2048).collect();
1180        assert!(!is_generated(&buf));
1181    }
1182
1183    #[test]
1184    fn is_generated_short_file_with_marker() {
1185        // File smaller than the scan window with a marker on the first line.
1186        assert!(is_generated(b"# @generated"));
1187    }
1188
1189    #[test]
1190    fn is_generated_utf8_bom_then_marker() {
1191        let mut buf = Vec::new();
1192        buf.extend_from_slice(b"\xEF\xBB\xBF");
1193        buf.extend_from_slice(b"// @generated\nfn x() {}\n");
1194        assert!(is_generated(&buf));
1195    }
1196
1197    #[test]
1198    fn is_generated_no_marker_returns_false() {
1199        assert!(!is_generated(
1200            b"// Hand-written file.\nfn main() { println!(\"hi\"); }\n"
1201        ));
1202    }
1203
1204    #[test]
1205    fn normalize_line_endings_mixed_endings() {
1206        // LF + lone-CR + CRLF in one buffer — each is converted independently.
1207        let mut d = b"a\nb\rc\r\nd".to_vec();
1208        normalize_line_endings(&mut d);
1209        assert_eq!(d, b"a\nb\nc\nd\n");
1210    }
1211}
big_code_analysis/tools.rs

big_code_analysis/
tools.rs