big_code_analysis/
tools.rs

1// Per-language metric and AST modules deliberately consume the macro-
2// generated tree-sitter token enums via `use crate::*` and `use Foo::*`
3// inside match expressions — explicit imports would list dozens of
4// variants per arm and obscure the per-language token sets that are the
5// point of these files. Allowed at the module level rather than per
6// function so the per-language impl blocks stay readable.
7#![allow(clippy::wildcard_imports, clippy::enum_glob_use)]
8// Metric counts (token, function, branch, argument, etc.) are stored as
9// `usize` and crossed with `f64` averages, ratios, and Halstead scores
10// across the cyclomatic / MI / Halstead computations. The `usize as f64`
11// and `f64 as usize` casts are intentional and snapshot-anchored — every
12// site is bounded by the count it came from. Allowing the lints at the
13// module level keeps the metric arithmetic legible.
14#![allow(
15    clippy::cast_precision_loss,
16    clippy::cast_possible_truncation,
17    clippy::cast_sign_loss
18)]
19
20use std::cmp::Ordering;
21use std::collections::HashMap;
22use std::fs::{self, File};
23use std::io::{Read, Write};
24use std::path::{Component, Path, PathBuf};
25use std::sync::OnceLock;
26
27use regex::bytes::Regex;
28use termcolor::{Color, ColorSpec, WriteColor};
29
30use crate::langs::*;
31
32/// Reads a file, normalising all CR-only and CRLF line endings to LF.
33///
34/// **Note for downstream consumers**: the returned buffer never contains `\r`
35/// bytes. Callers that previously observed raw `\r\n` sequences will see plain
36/// `\n` after this call. This is intentional — the metric engine requires LF-
37/// only input — but it is a behavioural difference from a plain `fs::read`.
38///
39/// # Errors
40///
41/// Returns any [`std::io::Error`] surfaced by [`File::open`] (the
42/// path is missing, lacks read permission, is a directory, …) or by
43/// [`File::read_to_end`] while reading the file contents.
44///
45/// # Examples
46///
47/// ```
48/// use std::path::Path;
49///
50/// use big_code_analysis::read_file;
51///
52/// let path = Path::new("Cargo.toml");
53/// read_file(&path).unwrap();
54/// ```
55pub fn read_file(path: &Path) -> std::io::Result<Vec<u8>> {
56    let mut file = File::open(path)?;
57    let mut data = Vec::new();
58    file.read_to_end(&mut data)?;
59
60    normalize_line_endings(&mut data);
61
62    Ok(data)
63}
64
/// Number of bytes read from the start of a file to decide whether its
/// contents look like UTF-8 before the whole file is read. A small fixed
/// window keeps the rejection of obviously-binary files cheap. The last
/// character of the window may be a multibyte sequence split by this
/// boundary; the classifier tolerates that only when more file follows the
/// window (see `read_file_with_eol` and `probe_decodable_prefix`).
const UTF8_PROBE_BYTES: usize = 64;
71
/// Classifies the probe read from the head of a file: is it decodable UTF-8,
/// and where does the real content begin?
///
/// Returns the probe with any UTF-8 BOM removed when the file should be
/// processed, or `None` when it should be skipped.
///
/// * A UTF-16 BE/LE BOM means the body is interleaved-NUL UTF-16, which the
///   metric engine cannot parse. Stripping that BOM and carrying on would let
///   an ASCII-dominant body pass the UTF-8 check (each NUL is a valid
///   single-byte UTF-8 scalar) and reach the parser as garbage (issue #803),
///   so such files are rejected — the same stance `is_generated` documents.
/// * A UTF-8 BOM prefixes genuine UTF-8 and is simply dropped.
///   `starts_with` / `strip_prefix` are bounds-safe on a probe shorter than
///   the BOM.
///
/// The check works on bytes, not on a lossy string round-trip. Because the
/// probe is only a prefix, a longer file may have its last multibyte
/// character cut at the window boundary. `String::from_utf8_lossy` could not
/// tell that benign truncation (issue #746) from a real encoding error, and
/// its `U+FFFD` replacement collided with the same scalar appearing
/// legitimately in the source (issue #758). A trailing incomplete sequence is
/// therefore accepted only when `file_size > probe_len`, i.e. when later
/// bytes can still complete it; if the probe is the whole file, the partial
/// sequence is genuine corruption.
fn probe_decodable_prefix(start: &[u8], file_size: usize, probe_len: usize) -> Option<&[u8]> {
    const UTF16_BE_BOM: &[u8] = b"\xFE\xFF";
    const UTF16_LE_BOM: &[u8] = b"\xFF\xFE";
    const UTF8_BOM: &[u8] = b"\xEF\xBB\xBF";

    if start.starts_with(UTF16_BE_BOM) || start.starts_with(UTF16_LE_BOM) {
        return None;
    }
    let content = start.strip_prefix(UTF8_BOM).unwrap_or(start);

    let decodable = match std::str::from_utf8(content) {
        Ok(_) => true,
        // `error_len() == None` means the input merely ended in the middle of
        // a multibyte sequence; everything before it is valid UTF-8. That is
        // acceptable only if the rest of the file can supply the missing
        // continuation bytes.
        Err(err) => err.error_len().is_none() && file_size > probe_len,
    };
    decodable.then_some(content)
}
115
116/// Reads a file, normalising all CR-only and CRLF line endings to LF, and ensures
117/// the buffer ends with exactly one `\n`. Returns `None` for files ≤ 3 bytes or
118/// files that appear to be non-UTF-8.
119///
120/// # Errors
121///
122/// Returns any [`std::io::Error`] surfaced by [`File::open`] (the
123/// path is missing, lacks read permission, is a directory, …) or by
124/// the subsequent reads from the open file handle. A clean short read
125/// during the probe (`UnexpectedEof`) yields `Ok(None)`; any other
126/// `read_exact` error kind propagates as `Err`. A non-UTF-8 head, a
127/// too-small file, or a UTF-16 BE/LE BOM is reported via `Ok(None)`,
128/// not an error.
129///
130/// # Examples
131///
132/// ```
133/// use std::path::Path;
134///
135/// use big_code_analysis::read_file_with_eol;
136///
137/// let path = Path::new("Cargo.toml");
138/// read_file_with_eol(&path).unwrap();
139/// ```
140pub fn read_file_with_eol(path: &Path) -> std::io::Result<Option<Vec<u8>>> {
141    let file_size = fs::metadata(path).map_or(1024 * 1024, |m| m.len() as usize);
142    if file_size <= 3 {
143        // this file is very likely almost empty... so nothing to do on it
144        return Ok(None);
145    }
146
147    let mut file = File::open(path)?;
148
149    let probe_len = UTF8_PROBE_BYTES.min(file_size);
150    let mut start = vec![0; probe_len];
151    // A clean short read (the file shrank below the probe between the
152    // `metadata` call and here) is reported as `Ok(None)`, matching the
153    // too-small-file case. Any other `read_exact` failure — a real I/O
154    // fault such as a permission or hardware error — must propagate as
155    // `Err` per the documented contract (issue #804); collapsing every
156    // error to `Ok(None)` would silently swallow genuine read failures.
157    match file.read_exact(&mut start) {
158        Ok(()) => {}
159        Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => return Ok(None),
160        Err(e) => return Err(e),
161    }
162
163    // Sniff the probe: reject UTF-16 / corrupt files, strip a UTF-8 BOM,
164    // and anchor the buffer at the post-BOM content. `None` means skip the
165    // file (see `probe_decodable_prefix`).
166    let Some(start) = probe_decodable_prefix(&start, file_size, probe_len) else {
167        return Ok(None);
168    };
169
170    let mut data = Vec::with_capacity(file_size + 2);
171    data.extend_from_slice(start);
172
173    file.read_to_end(&mut data)?;
174
175    normalize_line_endings(&mut data);
176
177    Ok(Some(data))
178}
179
180/// Normalises an in-memory source buffer to match the [`read_file_with_eol`]
181/// on-disk path: all CR-only and CRLF line endings become LF, and the buffer is
182/// guaranteed to end with exactly one `\n`.
183///
184/// In-memory entry points (the web server's JSON/octet-stream payloads, the
185/// Python `analyze_source` bindings) feed caller-supplied bytes straight to the
186/// parser, whereas the CLI reads files through [`read_file_with_eol`]. Without
187/// this step, identical content yields different metrics across surfaces — an
188/// editor buffer with no trailing newline reports `sloc: 0` over the wire but
189/// `sloc: 1` from the CLI on the same bytes (issue #640). Run the buffer through
190/// this helper before parsing so every surface computes the canonical numbers.
191///
192/// Returns a fresh owned buffer; the input is consumed so the common case
193/// (already-owned request body) reuses its allocation.
194///
195/// # Examples
196///
197/// ```
198/// use big_code_analysis::normalize_eol;
199///
200/// // CRLF endings collapse to LF and a missing final newline is added.
201/// assert_eq!(normalize_eol(b"a\r\nb".to_vec()), b"a\nb\n");
202/// ```
203#[must_use]
204pub fn normalize_eol(mut data: Vec<u8>) -> Vec<u8> {
205    normalize_line_endings(&mut data);
206    data
207}
208
/// Creates (or truncates) the file at `path` and writes `data` into it.
///
/// # Errors
///
/// Returns any [`std::io::Error`] surfaced by [`File::create`]
/// (parent directory missing, lacks write permission, target is a
/// directory, …) or by [`File::write_all`] while writing the buffer.
///
/// # Examples
///
/// ```no_run
/// use std::path::Path;
///
/// use big_code_analysis::write_file;
///
/// let path = Path::new("foo.txt");
/// let data: [u8; 4] = [0; 4];
/// write_file(&path, &data).unwrap();
/// ```
pub fn write_file(path: &Path, data: &[u8]) -> std::io::Result<()> {
    File::create(path).and_then(|mut file| file.write_all(data))
}
234
235/// Detects the language of a code using
236/// the extension of a file.
237///
238/// # Examples
239///
240/// ```
241/// use std::path::Path;
242///
243/// use big_code_analysis::get_language_for_file;
244///
245/// let path = Path::new("build.rs");
246/// get_language_for_file(&path).unwrap();
247/// ```
248#[must_use]
249pub fn get_language_for_file(path: &Path) -> Option<LANG> {
250    if let Some(ext) = path.extension() {
251        let ext = ext.to_str()?.to_lowercase();
252        get_from_ext(&ext)
253    } else {
254        None
255    }
256}
257
/// Decodes a captured mode name as UTF-8 and lowercases it; `None` when the
/// bytes are not valid UTF-8.
fn mode_to_str(mode: &[u8]) -> Option<String> {
    let text = std::str::from_utf8(mode).ok()?;
    Some(text.to_lowercase())
}
261
// Editor comments that carry language information (Emacs mode lines, Vim
// modelines) are a useful language hint. Each cell below caches one compiled
// pattern; it is filled on first use (see `get_regex` / `is_generated`).
static RE1_EMACS: OnceLock<Regex> = OnceLock::new();
static RE2_EMACS: OnceLock<Regex> = OnceLock::new();
static RE1_VIM: OnceLock<Regex> = OnceLock::new();
static RE_GENERATED: OnceLock<Regex> = OnceLock::new();

// Regular expressions. In each one, capture group 1 is the mode / filetype
// name.
// Emacs long form: `-*- ...; mode: NAME; ... -*-` (case-insensitive).
const FIRST_EMACS_EXPRESSION: &str = r"(?i)-\*-.*[^-\w]mode\s*:\s*([^:;\s]+)";
// Emacs short form: `-*- NAME -*-`.
const SECOND_EMACS_EXPRESSION: &str = r"-\*-\s*([^:;\s]+)\s*-\*-";
// Vim modeline: `vim: ... ft=NAME` (case-insensitive).
const VIM_EXPRESSION: &str = r"(?i)vim\s*:.*[^\w]ft\s*=\s*([^:\s]+)";
272
// Generated-code marker patterns. Matched against the leading window of the
// file (see `is_generated`) so a marker phrase deep in the body does not
// trigger a skip. The leading `(?i)` makes every alternative
// case-insensitive. Each alternative covers a widely-used convention:
//
// - `@generated`      — Facebook / Meta convention, also used by buck2,
//                       rustfmt, prettier, and many code generators.
// - `DO NOT EDIT`     — Go's `Code generated ... DO NOT EDIT.` line is
//                       canonical, but the bare phrase appears in Bazel,
//                       protoc, OpenAPI clients, etc. — match either.
// - `GENERATED CODE`  — Lizard's marker; preserved for compatibility with
//                       projects that already tag generated files this way.
const GENERATED_EXPRESSION: &str = r"(?i)@generated\b|DO NOT EDIT|GENERATED CODE";

/// Bytes from the start of the file scanned for a generated-code marker.
/// 5 KiB is enough to cover any reasonable file header (license + autogen
/// preamble) without paying a meaningful read cost.
const GENERATED_SCAN_BYTES: usize = 5 * 1024;
/// Maximum number of lines scanned for a generated-code marker, applied
/// within the `GENERATED_SCAN_BYTES` byte window so that a header of many
/// short lines is not scanned past its first lines.
const GENERATED_SCAN_LINES: usize = 50;
293
294/// Returns `true` when `buf` looks like generated code: its leading window
295/// (first ~50 lines or first 5 KiB, whichever is smaller) contains a known
296/// marker phrase. Matching is case-insensitive for the marker and never
297/// allocates on the negative path.
298///
299/// Recognized markers:
300///
301/// - `@generated` — Facebook / Meta convention, also used by buck2,
302///   rustfmt, and prettier.
303/// - `DO NOT EDIT` — Go's `Code generated by ... DO NOT EDIT.` is the
304///   canonical form; the bare phrase is also widely copied.
305/// - `GENERATED CODE` — Lizard's marker, preserved for compatibility.
306///
307/// Detection runs against raw bytes before parsing, so callers can discard
308/// generated files without paying tree-sitter parse cost. Non-UTF-8 input
309/// will not panic — `regex::bytes::Regex` operates on the raw byte slice.
310///
311/// # Examples
312///
313/// ```
314/// use big_code_analysis::is_generated;
315///
316/// assert!(is_generated(b"// @generated\nfn x() {}\n"));
317/// assert!(is_generated(
318///     b"// Code generated by protoc. DO NOT EDIT.\npackage x\n",
319/// ));
320/// assert!(!is_generated(b"fn main() { /* not generated */ }\n"));
321/// ```
322///
323/// # Panics
324///
325/// Panics if the embedded marker regex set fails to build; the marker
326/// list is a static literal so this represents a compile-time bug, not
327/// a runtime input that can be handled.
328pub fn is_generated(buf: &[u8]) -> bool {
329    // Strip a leading UTF-8 BOM so a marker on the first line of a
330    // BOM-prefixed file still matches against the line start. UTF-16 BOMs
331    // are not handled: the byte-pattern regex cannot match the
332    // interleaved-zero encoding (`@\x00g\x00...`) that follows a UTF-16
333    // BOM, so a strip would not enable detection — it would only obscure
334    // the fact that UTF-16 source files are unsupported here.
335    let buf = buf.strip_prefix(b"\xEF\xBB\xBF").unwrap_or(buf);
336
337    // Bound the search window: at most GENERATED_SCAN_BYTES bytes, and
338    // among those, stop after GENERATED_SCAN_LINES newlines. Scanning fewer
339    // lines avoids matching a marker phrase deep in the file body (the
340    // negative case in the issue's acceptance criteria).
341    let cap = buf.len().min(GENERATED_SCAN_BYTES);
342    let end = buf[..cap]
343        .iter()
344        .enumerate()
345        .filter_map(|(i, &b)| (b == b'\n').then_some(i + 1))
346        .nth(GENERATED_SCAN_LINES - 1)
347        .unwrap_or(cap);
348    let window = &buf[..end];
349
350    RE_GENERATED
351        .get_or_init(|| {
352            Regex::new(GENERATED_EXPRESSION).expect("GENERATED_EXPRESSION is a constant regex")
353        })
354        .is_match(window)
355}
356
357#[inline]
358fn get_regex<'a>(
359    once_lock: &OnceLock<Regex>,
360    line: &'a [u8],
361    regex: &'a str,
362) -> Option<regex::bytes::Captures<'a>> {
363    once_lock
364        .get_or_init(|| Regex::new(regex).expect("constant regex pattern must compile"))
365        .captures_iter(line)
366        .next()
367}
368
369/// Resolves a language from a script's shebang line.
370///
371/// Returns `None` unless `buf` starts with `#!`. Reads up to the first `\n`,
372/// strips an optional trailing `\r`, splits on whitespace, and takes the
373/// basename of either the first token or — when that basename is `env` — the
374/// next non-flag token. Trailing version digits and dots (`python3`,
375/// `lua5.1`, `perl5.36`) are stripped before lookup. Non-UTF-8 bytes on the
376/// shebang line yield `None` (no panic).
377fn get_shebang_lang(buf: &[u8]) -> Option<LANG> {
378    // Early-out for the common case (any non-shebang buffer): no allocation,
379    // no UTF-8 decoding.
380    let rest = buf.strip_prefix(b"#!")?;
381    let line_end = rest.iter().position(|&b| b == b'\n').unwrap_or(rest.len());
382    let line = &rest[..line_end];
383    // Trim a trailing CR even though normalize_line_endings should have removed
384    // it — guess_language is on the public API and may be called with raw input.
385    let line = line.strip_suffix(b"\r").unwrap_or(line);
386    let line = std::str::from_utf8(line).ok()?;
387
388    let mut tokens = line.split_ascii_whitespace();
389    let first_base = basename(tokens.next()?);
390
391    let interpreter = if first_base == "env" {
392        skip_env_args(&mut tokens)?
393    } else {
394        first_base
395    };
396
397    get_from_interpreter(strip_version_suffix(interpreter))
398}
399
400// Walk past leading `env` arguments (`-FLAG`, `-u VAR`, `NAME=value`) and
401// return the basename of the actual interpreter token. Per `env(1)`, only
402// `-u` consumes a following argument; other short flags (`-i`, `-S`, …)
403// stand alone or carry their argument inline (e.g. `-S "node --foo"`).
404fn skip_env_args<'a>(tokens: &mut std::str::SplitAsciiWhitespace<'a>) -> Option<&'a str> {
405    loop {
406        let tok = tokens.next()?;
407        if let Some(flag) = tok.strip_prefix('-') {
408            if flag == "u" {
409                tokens.next()?;
410            }
411            continue;
412        }
413        if tok.contains('=') {
414            continue;
415        }
416        return Some(basename(tok));
417    }
418}
419
/// Returns the part of `path` after its last `/`, or all of `path` when it
/// contains no `/`.
fn basename(path: &str) -> &str {
    path.rsplit('/').next().unwrap_or(path)
}
423
/// Removes the trailing run of ASCII digits and dots that encodes an
/// interpreter version (`python3` → `python`, `lua5.1` → `lua`,
/// `perl5.36` → `perl`). A name made up solely of digits and dots is returned
/// unchanged rather than being reduced to an empty string.
fn strip_version_suffix(name: &str) -> &str {
    match name.trim_end_matches(|c: char| c == '.' || c.is_ascii_digit()) {
        "" => name,
        stem => stem,
    }
}
430
431fn get_from_interpreter(name: &str) -> Option<LANG> {
432    match name {
433        "sh" | "bash" | "dash" | "ksh" | "zsh" => Some(LANG::Bash),
434        "python" => Some(LANG::Python),
435        "perl" => Some(LANG::Perl),
436        "lua" | "luajit" => Some(LANG::Lua),
437        "php" | "php-cgi" => Some(LANG::Php),
438        "node" | "nodejs" => Some(LANG::Javascript),
439        "tclsh" | "wish" => Some(LANG::Tcl),
440        "ruby" => Some(LANG::Ruby),
441        "elixir" | "iex" => Some(LANG::Elixir),
442        _ => None,
443    }
444}
445
// Number of lines inspected at each end of a file when looking for an
// editor mode line. Editors place mode/file-local-variable lines near the
// very top or very bottom of a file. Emacs honours the first non-shebang
// line and a trailing "Local Variables:" block; Vim honours modelines in the
// first or last few lines (`modelines` defaults to 5). Scanning this many
// real lines at each end covers both conventions without trawling the body.
const MODE_LINE_SCAN_WINDOW: usize = 5;
452
453fn get_emacs_mode(buf: &[u8]) -> Option<String> {
454    // Forward scan: the first `MODE_LINE_SCAN_WINDOW` real lines may carry
455    // an emacs `-*- … -*-` header or a Vim modeline. `split` yields one
456    // element per line (no unbounded remainder), and `take` bounds the
457    // window precisely — the former `splitn(5)` + `i == 3` break inspected
458    // only 4 lines yet split off a 5th unbounded remainder (issue #709).
459    for line in buf.split(|c| *c == b'\n').take(MODE_LINE_SCAN_WINDOW) {
460        if let Some(cap) = get_regex(&RE1_EMACS, line, FIRST_EMACS_EXPRESSION) {
461            return mode_to_str(&cap[1]);
462        } else if let Some(cap) = get_regex(&RE2_EMACS, line, SECOND_EMACS_EXPRESSION) {
463            return mode_to_str(&cap[1]);
464        } else if let Some(cap) = get_regex(&RE1_VIM, line, VIM_EXPRESSION) {
465            return mode_to_str(&cap[1]);
466        }
467    }
468
469    // Backward scan for a trailing Vim modeline. Skip empty pieces so a
470    // trailing newline (the common case after `read_file_with_eol`) and
471    // any trailing blank lines do not consume the window before a real
472    // modeline is reached — the former `rsplitn(5)` spent its first slot
473    // on that empty piece, covering fewer than the intended real lines.
474    for line in buf
475        .rsplit(|c| *c == b'\n')
476        .filter(|line| !line.is_empty())
477        .take(MODE_LINE_SCAN_WINDOW)
478    {
479        if let Some(cap) = get_regex(&RE1_VIM, line, VIM_EXPRESSION) {
480            return mode_to_str(&cap[1]);
481        }
482    }
483
484    None
485}
486
487/// Guesses the language of a code.
488///
489/// Returns a tuple containing a [`LANG`] as first argument
490/// and the language name as a second one.
491///
492/// # Examples
493///
494/// ```
495/// use std::path::PathBuf;
496///
497/// use big_code_analysis::guess_language;
498///
499/// let source_code = "int a = 42;";
500///
501/// // The path to a dummy file used to contain the source code
502/// let path = PathBuf::from("foo.c");
503/// let source_slice = source_code.as_bytes();
504///
505/// // Guess the language of a code
506/// guess_language(&source_slice, &path);
507/// ```
508///
509/// [`LANG`]: enum.LANG.html
510pub fn guess_language<P: AsRef<Path>>(buf: &[u8], path: P) -> (Option<LANG>, &'static str) {
511    let ext = path
512        .as_ref()
513        .extension()
514        .and_then(|e| e.to_str())
515        .map(str::to_lowercase)
516        .unwrap_or_default();
517    let from_ext = get_from_ext(&ext);
518
519    let mode = get_emacs_mode(buf).unwrap_or_default();
520
521    let from_mode = get_from_emacs_mode(&mode);
522
523    if let Some(lang_ext) = from_ext {
524        if let Some(lang_mode) = from_mode {
525            if lang_ext == lang_mode {
526                (Some(lang_mode), lang_mode.name())
527            } else {
528                // we should probably rely on extension here
529                (Some(lang_ext), lang_ext.name())
530            }
531        } else {
532            (Some(lang_ext), lang_ext.name())
533        }
534    } else if let Some(lang_mode) = from_mode {
535        (Some(lang_mode), lang_mode.name())
536    } else if let Some(lang_shebang) = get_shebang_lang(buf) {
537        (Some(lang_shebang), lang_shebang.name())
538    } else {
539        (None, "")
540    }
541}
542
/// Rewrites `data` in place so that every CR-only and CRLF line ending is a
/// single LF, then makes the buffer end with exactly one `\n` (all trailing
/// newlines are removed and one is appended, so an empty buffer becomes
/// `b"\n"`).
pub(crate) fn normalize_line_endings(data: &mut Vec<u8>) {
    // Single-pass compaction inside the existing allocation: the write
    // cursor never overtakes the read cursor, so nothing unread is clobbered.
    let len = data.len();
    let mut write = 0;
    let mut read = 0;
    while read < len {
        let byte = data[read];
        read += 1;
        if byte == b'\r' {
            // A CRLF pair collapses to one LF: swallow the LF half here.
            if read < len && data[read] == b'\n' {
                read += 1;
            }
            data[write] = b'\n';
        } else {
            data[write] = byte;
        }
        write += 1;
    }
    data.truncate(write);

    while data.last() == Some(&b'\n') {
        data.pop();
    }
    data.push(b'\n');
}
569
/// Lexically normalises a path without touching the filesystem: `.`
/// components are dropped and each `..` removes the component before it (a
/// `..` with nothing left to remove is discarded).
pub(crate) fn normalize_path<P: AsRef<Path>>(path: P) -> PathBuf {
    // Copied from Cargo sources: https://github.com/rust-lang/cargo/blob/master/src/cargo/util/paths.rs#L65
    let mut parts = path.as_ref().components().peekable();

    // A Windows drive / UNC prefix, if present, seeds the result.
    let mut normalized = match parts.peek().copied() {
        Some(prefix @ Component::Prefix(..)) => {
            parts.next();
            PathBuf::from(prefix.as_os_str())
        }
        _ => PathBuf::new(),
    };

    for part in parts {
        match part {
            // A `Prefix` component can only be the first one, and that one
            // was consumed above, so it cannot show up again here.
            Component::Prefix(..) => unreachable!(),
            Component::CurDir => {}
            Component::ParentDir => {
                normalized.pop();
            }
            Component::RootDir => normalized.push(part.as_os_str()),
            Component::Normal(name) => normalized.push(name),
        }
    }
    normalized
}
600
/// Distance between two paths: the number of components that remain in
/// `path1` plus the number that remain in `path2` once their deepest shared,
/// non-empty ancestor is removed. `None` when they share no such ancestor.
pub(crate) fn get_paths_dist(path1: &Path, path2: &Path) -> Option<usize> {
    // `ancestors()` walks from `path1` itself upwards, so the first hit is
    // the deepest common ancestor.
    let shared = path1
        .ancestors()
        .find(|ancestor| !ancestor.as_os_str().is_empty() && path2.starts_with(ancestor))?;

    // Both strips are infallible: `shared` came out of `path1.ancestors()`,
    // and `path2.starts_with(shared)` was just checked.
    let rest1 = path1
        .strip_prefix(shared)
        .expect("ancestor is by construction a prefix of path1");
    let rest2 = path2
        .strip_prefix(shared)
        .expect("ancestor verified by starts_with above");

    Some(rest1.components().count() + rest2.components().count())
}
619
620pub(crate) fn guess_file<S: ::std::hash::BuildHasher>(
621    current_path: &Path,
622    include_path: &str,
623    all_files: &HashMap<String, Vec<PathBuf>, S>,
624) -> Vec<PathBuf> {
625    let include_path = include_path
626        .strip_prefix("mozilla/")
627        .unwrap_or(include_path);
628
629    // Resolve the include relative to the including file's parent
630    // before normalizing. This preserves leading `..` traversal so
631    // `#include "../foo.h"` from `src/lib/file.c` targets
632    // `src/foo.h`, not the lexically-popped `foo.h` (issue #297).
633    // Lexical-only normalization is required because `current_path`
634    // and the entries in `all_files` are typically not canonicalized
635    // and the included header need not exist on disk yet.
636    let resolved_path = current_path
637        .parent()
638        .map(|parent| normalize_path(parent.join(include_path)));
639
640    let include_path = normalize_path(include_path);
641    let Some(file_name) = include_path.file_name().and_then(|n| n.to_str()) else {
642        return vec![];
643    };
644    let Some(possibilities) = all_files.get(file_name) else {
645        return vec![];
646    };
647    if possibilities.len() == 1 {
648        return possibilities.clone();
649    }
650
651    // Strategy chain: each step looks for a UNIQUE candidate that
652    // matches a progressively weaker signal (full resolved target →
653    // suffix on the normalized include → siblings of the including
654    // file). When no step yields a unique match, fall back to the
655    // closest by path distance, which may return zero or many.
656    resolve_against_resolved(possibilities, current_path, resolved_path.as_deref())
657        .or_else(|| unique_filter(possibilities, current_path, |p| p.ends_with(&include_path)))
658        .or_else(|| resolve_against_parent(possibilities, current_path))
659        .unwrap_or_else(|| min_distance_candidates(possibilities, current_path))
660}
661
/// Keeps the entries of `possibilities` that differ from `current_path` and
/// satisfy `pred`, and returns `Some` holding the single survivor only when
/// there is exactly one. `None` tells the cascading caller "this strategy
/// did not yield a unique resolution — try the next one."
fn unique_filter<F>(possibilities: &[PathBuf], current_path: &Path, pred: F) -> Option<Vec<PathBuf>>
where
    F: Fn(&PathBuf) -> bool,
{
    let survivors: Vec<&PathBuf> = possibilities
        .iter()
        .filter(|candidate| candidate.as_path() != current_path && pred(candidate))
        .collect();
    match survivors.as_slice() {
        [only] => Some(vec![(*only).clone()]),
        _ => None,
    }
}
677
678/// Strongest signal: a candidate matches the fully resolved relative
679/// target. Prefer exact equality, then suffix match (so absolute
680/// `all_files` entries still match a relative resolved target like
681/// `src/foo.h`).
682fn resolve_against_resolved(
683    possibilities: &[PathBuf],
684    current_path: &Path,
685    resolved: Option<&Path>,
686) -> Option<Vec<PathBuf>> {
687    let resolved = resolved?;
688    unique_filter(possibilities, current_path, |p| p == resolved)
689        .or_else(|| unique_filter(possibilities, current_path, |p| p.ends_with(resolved)))
690}
691
692/// Candidate-in-same-directory heuristic: keep entries whose path
693/// starts with the including file's parent directory.
694fn resolve_against_parent(possibilities: &[PathBuf], current_path: &Path) -> Option<Vec<PathBuf>> {
695    let parent = current_path.parent()?;
696    unique_filter(possibilities, current_path, |p| p.starts_with(parent))
697}
698
699/// Last-chance fallback in the `guess_file` strategy chain: returns
700/// every candidate whose `get_paths_dist` from `current_path` ties
701/// the minimum, or an empty `Vec` when no candidate has a defined
702/// distance. Unlike the unique-match strategies, this may
703/// legitimately return zero or many entries — its result is the
704/// function's final answer, not a "try the next strategy" signal.
705fn min_distance_candidates(possibilities: &[PathBuf], current_path: &Path) -> Vec<PathBuf> {
706    // Hold survivors as borrows during the walk: `Less` arms clear the
707    // prior set without dropping owned `PathBuf`s, and the trailing
708    // `cloned()` runs exactly once per final survivor — never on
709    // entries that were tentatively kept and later evicted.
710    let mut dist_min = usize::MAX;
711    let mut path_min: Vec<&PathBuf> = Vec::new();
712    for p in possibilities {
713        if current_path == p {
714            continue;
715        }
716        let Some(dist) = get_paths_dist(current_path, p) else {
717            continue;
718        };
719        match dist.cmp(&dist_min) {
720            Ordering::Less => {
721                dist_min = dist;
722                path_min.clear();
723                path_min.push(p);
724            }
725            Ordering::Equal => path_min.push(p),
726            Ordering::Greater => {}
727        }
728    }
729    path_min.into_iter().cloned().collect()
730}
731
732// Accept `&mut dyn WriteColor` rather than `&mut StandardStreamLock` so
733// tests (e.g. `function::dump_spans`) can substitute `termcolor::NoColor`
734// over a `Vec<u8>` to capture the rendered bytes. Production callers
735// continue to pass `&mut StandardStreamLock`, which unsized-coerces to
736// the trait object at the call site.
737#[inline]
738pub(crate) fn color(stdout: &mut dyn WriteColor, color: Color) -> std::io::Result<()> {
739    stdout.set_color(ColorSpec::new().set_fg(Some(color)))
740}
741
742#[inline]
743pub(crate) fn intense_color(stdout: &mut dyn WriteColor, color: Color) -> std::io::Result<()> {
744    stdout.set_color(ColorSpec::new().set_fg(Some(color)).set_intense(true))
745}
746
/// Test helper: parses `source` as if it were the file `filename`, computes
/// its metrics with default options and hands the resulting `FuncSpace` to
/// `check`.
///
/// The source is first brought to LF line endings like
/// `normalize_line_endings` does. Note that this helper additionally strips
/// trailing whitespace and leading newlines before appending the single
/// final `\n`.
///
/// # Panics
///
/// Panics if metric computation fails.
#[cfg(test)]
pub(crate) fn check_func_space<T: crate::ParserTrait, F: Fn(crate::FuncSpace)>(
    source: &str,
    filename: &str,
    check: F,
) {
    let path = std::path::PathBuf::from(filename);

    // Same CRLF/CR → LF mapping that read_file_with_eol gets from
    // normalize_line_endings.
    let unix_eol = source.replace("\r\n", "\n").replace('\r', "\n");
    let body = unix_eol.trim_end().trim_matches('\n');
    let mut code = Vec::with_capacity(body.len() + 1);
    code.extend_from_slice(body.as_bytes());
    code.push(b'\n');

    let parser = T::new(code, &path, None);
    let name = path.to_str().map(str::to_owned);
    let func_space =
        crate::spaces::metrics_inner(&parser, name, crate::MetricsOptions::default()).unwrap();

    check(func_space);
}
768
/// Test helper: like `check_func_space`, but passes only the top-level
/// space's `metrics` to `check`.
#[cfg(test)]
pub(crate) fn check_metrics<T: crate::ParserTrait>(
    source: &str,
    filename: &str,
    check: fn(crate::CodeMetrics) -> (),
) {
    let on_space = move |space: crate::FuncSpace| check(space.metrics);
    check_func_space::<T, _>(source, filename, on_space);
}
777
/// Asserts that `func_space` has a direct child space whose name is `name`
/// and whose `kind` equals `expected`.
///
/// Used by annotation-type / class / interface tests that need to verify
/// the structural FuncSpace tree (not just metric values), since vacuous
/// metric assertions can pass even when `is_func_space` has been reverted
/// for the node kind under test.
///
/// # Panics
///
/// Panics when no direct child carries that name, or when its kind differs.
#[cfg(test)]
pub(crate) fn assert_child_space_kind(
    func_space: &crate::FuncSpace,
    name: &str,
    expected: crate::SpaceKind,
) {
    let named_child = func_space
        .spaces
        .iter()
        .find(|space| space.name.as_deref() == Some(name));
    let Some(child) = named_child else {
        panic!("expected a child FuncSpace named {name:?}");
    };
    assert_eq!(
        child.kind, expected,
        "child FuncSpace {name:?} kind: got {:?}, expected {:?}",
        child.kind, expected,
    );
}
802
// Unit tests for this module are kept in the sibling file `tools_tests.rs`
// and are only compiled for test builds.
#[cfg(test)]
#[path = "tools_tests.rs"]
mod tests;
big_code_analysis/tools.rs

big_code_analysis/
tools.rs