Skip to main content

sloc_core/
lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright (C) 2026 Nima Shafie <nimzshafie@gmail.com>
3#![allow(clippy::multiple_crate_versions)]
4
5pub mod baseline;
6pub mod coverage;
7pub mod delta;
8pub mod history;
9pub use baseline::{check_against_baseline, resolve_baselines_path, BaselineEntry, BaselineStore};
10pub use coverage::{aggregate_line_coverage, lookup_coverage, parse_lcov, FileCoverage};
11pub use delta::{compute_delta, FileChangeStatus, FileDelta, ScanComparison, SummaryDelta};
12pub use history::{
13    CleanupPolicy, CleanupPolicyStore, RegistryEntry, ScanRegistry, ScanSummarySnapshot,
14    WatchedDirsStore,
15};
16
17use std::collections::{BTreeMap, BTreeSet, HashSet};
18use std::fs;
19use std::path::{Path, PathBuf};
20use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
21use std::sync::Arc;
22
23use anyhow::{Context, Result};
24use chrono::{DateTime, Utc};
25use encoding_rs::{UTF_16BE, UTF_16LE, WINDOWS_1252};
26use globset::{Glob, GlobSet, GlobSetBuilder};
27use ignore::WalkBuilder;
28use serde::{Deserialize, Serialize};
29use uuid::Uuid;
30
31use sloc_config::{
32    AppConfig, BinaryFileBehavior, BlankInBlockCommentPolicy, ContinuationLinePolicy,
33    FailureBehavior, MixedLinePolicy,
34};
35use sloc_languages::style::IndentStyle;
36use sloc_languages::{
37    analyze_text, detect_language, supported_languages, AnalysisOptions, Language, ParseMode,
38    RawLineCounts, StyleAnalysis, StyleLangScope,
39};
40
// ── Analysis parallelism, detection sample sizes and thresholds ──────────────

/// Maximum number of worker threads used for parallel file analysis.
///
/// The thread count reported by `available_parallelism` is capped at this value.
const MAX_ANALYSIS_THREADS: usize = 16;
/// Fallback thread count when `available_parallelism` is unavailable
/// (i.e. when the call returns an error).
const DEFAULT_ANALYSIS_THREADS: usize = 4;
/// Byte sample used to detect `@generated` markers.
const GENERATED_SAMPLE_BYTES: usize = 1024;
/// Byte sample used to detect minified files via line-length heuristic.
const MINIFIED_SAMPLE_BYTES: usize = 4096;
/// Longest line length above which a file is considered minified.
// NOTE(review): unit (bytes vs. characters) is decided by the detection code, which
// is not part of this section — confirm there.
const MINIFIED_LINE_THRESHOLD: usize = 2000;
/// Byte sample used to detect binary files via null-byte scan.
const BINARY_SAMPLE_BYTES: usize = 8192;
55
/// Atomics shared between `analyze()` and the caller so the caller can poll scan progress.
///
/// Both counters are `Arc`-wrapped so the caller can keep its own handle to each one.
/// The walk and worker code in this file updates them with `Ordering::Relaxed`.
pub struct ProgressCounters {
    /// Number of candidate files processed so far (incremented per file, across all threads).
    pub files_done: Arc<AtomicUsize>,
    /// Total candidate files discovered. Each walked root adds its own discovered path
    /// count before that root's parallel analysis begins, so the value grows per root.
    pub files_total: Arc<AtomicUsize>,
}
63
/// Three-way outcome for metadata-level policy checks.
///
/// Private to this module; it tells the per-file analysis whether to stop early
/// (with or without a record) or to go on to the content-level checks.
enum MetadataPolicyOutcome {
    /// Skip this file — include the record in output.
    // NOTE(review): the record is boxed, presumably to keep this enum small — confirm.
    Skip(Box<FileRecord>),
    /// Exclude this file entirely — no record in output (include-glob miss).
    Exclude,
    /// Continue to content checks.
    Continue,
}
73
/// Final processing status recorded for a file (see `FileRecord::status`).
///
/// Serialized in `snake_case`, e.g. `AnalyzedExact` becomes `"analyzed_exact"`.
// NOTE(review): the precise conditions that produce each variant are decided by the
// analysis code outside this section; the names suggest two "analyzed" outcomes,
// four "skipped" outcomes and one internal-error outcome — confirm before relying on it.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FileStatus {
    AnalyzedExact,
    AnalyzedBestEffort,
    SkippedBinary,
    SkippedDecodeError,
    SkippedUnsupported,
    SkippedByPolicy,
    ErrorInternal,
}
85
/// Per-file line counts by category; all fields default to zero.
// NOTE(review): presumably these are the counts after the configured line policies
// (mixed-line, continuation-line, …) have been applied to `RawLineCounts` — confirm
// against the code that builds this struct.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct EffectiveCounts {
    /// Lines counted as code.
    pub code_lines: u64,
    /// Lines counted as comments.
    pub comment_lines: u64,
    /// Lines counted as blank.
    pub blank_lines: u64,
    /// Mixed lines that are reported separately rather than folded into another bucket.
    pub mixed_lines_separate: u64,
}
93
/// Identity of the tool invocation that produced an `AnalysisRun`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolMetadata {
    /// Tool name.
    pub name: String,
    /// Tool version string.
    pub version: String,
    /// Identifier of this run.
    // NOTE(review): presumably a UUID string (this file imports `uuid::Uuid`) — confirm.
    pub run_id: String,
    /// UTC timestamp associated with the run.
    pub timestamp_utc: DateTime<Utc>,
}
101
/// Description of the machine and context in which a scan was run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnvironmentMetadata {
    /// Operating system name.
    pub operating_system: String,
    /// CPU architecture name.
    pub architecture: String,
    /// How the tool was run.
    // NOTE(review): the set of possible values is defined by the callers — confirm.
    pub runtime_mode: String,
    /// Name of the user who started the scan.
    pub initiator_username: String,
    /// Name of the host (or CI agent/runner) that ran the scan.
    pub initiator_hostname: String,
    /// CI system name when the scan runs inside a known CI environment (Jenkins,
    /// GitHub Actions, GitLab CI, …). `None` for interactive / local runs.
    /// Omitted from serialized output when `None`; absent input deserializes to `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ci_name: Option<String>,
}
114
/// Run-wide totals across all files.
///
/// Fields marked `#[serde(default)]` deserialize to `0` when they are missing from
/// the input, so data written without those fields can still be loaded.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SummaryTotals {
    /// Number of files considered by the scan.
    pub files_considered: u64,
    /// Number of files that were analyzed.
    pub files_analyzed: u64,
    /// Number of files that were skipped.
    pub files_skipped: u64,
    /// Total physical lines.
    pub total_physical_lines: u64,
    /// Total code lines.
    pub code_lines: u64,
    /// Total comment lines.
    pub comment_lines: u64,
    /// Total blank lines.
    pub blank_lines: u64,
    /// Total mixed lines reported separately.
    pub mixed_lines_separate: u64,
    /// Count of functions.
    #[serde(default)]
    pub functions: u64,
    /// Count of classes.
    #[serde(default)]
    pub classes: u64,
    /// Count of variables.
    #[serde(default)]
    pub variables: u64,
    /// Count of imports.
    #[serde(default)]
    pub imports: u64,
    /// Count of tests.
    #[serde(default)]
    pub test_count: u64,
    /// Lexically detected test assertion call lines across all analyzed files.
    #[serde(default)]
    pub test_assertion_count: u64,
    /// Lexically detected test suite / fixture / group declaration lines across all analyzed files.
    #[serde(default)]
    pub test_suite_count: u64,
    /// Aggregated from LCOV data when provided.
    #[serde(default)]
    pub coverage_lines_found: u64,
    /// Coverage: lines hit.
    #[serde(default)]
    pub coverage_lines_hit: u64,
    /// Coverage: functions found.
    #[serde(default)]
    pub coverage_functions_found: u64,
    /// Coverage: functions hit.
    #[serde(default)]
    pub coverage_functions_hit: u64,
    /// Coverage: branches found.
    #[serde(default)]
    pub coverage_branches_found: u64,
    /// Coverage: branches hit.
    #[serde(default)]
    pub coverage_branches_hit: u64,
}
155
/// Totals for a single language.
///
/// Apart from `language` and `files`, the field names match the same-named counters
/// in `SummaryTotals`. Fields marked `#[serde(default)]` deserialize to `0` when
/// they are missing from the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageSummary {
    /// Language these totals belong to.
    pub language: Language,
    /// Number of files counted for this language.
    pub files: u64,
    /// Total physical lines.
    pub total_physical_lines: u64,
    /// Total code lines.
    pub code_lines: u64,
    /// Total comment lines.
    pub comment_lines: u64,
    /// Total blank lines.
    pub blank_lines: u64,
    /// Total mixed lines reported separately.
    pub mixed_lines_separate: u64,
    /// Count of functions.
    #[serde(default)]
    pub functions: u64,
    /// Count of classes.
    #[serde(default)]
    pub classes: u64,
    /// Count of variables.
    #[serde(default)]
    pub variables: u64,
    /// Count of imports.
    #[serde(default)]
    pub imports: u64,
    /// Count of tests.
    #[serde(default)]
    pub test_count: u64,
    /// Count of test assertion lines.
    #[serde(default)]
    pub test_assertion_count: u64,
    /// Count of test suite / fixture / group declaration lines.
    #[serde(default)]
    pub test_suite_count: u64,
    /// Coverage: lines found.
    #[serde(default)]
    pub coverage_lines_found: u64,
    /// Coverage: lines hit.
    #[serde(default)]
    pub coverage_lines_hit: u64,
    /// Coverage: functions found.
    #[serde(default)]
    pub coverage_functions_found: u64,
    /// Coverage: functions hit.
    #[serde(default)]
    pub coverage_functions_hit: u64,
    /// Coverage: branches found.
    #[serde(default)]
    pub coverage_branches_found: u64,
    /// Coverage: branches hit.
    #[serde(default)]
    pub coverage_branches_hit: u64,
}
192
/// Everything recorded about one candidate file, whether it was analyzed or skipped.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileRecord {
    /// Path of the file.
    pub path: String,
    /// Path of the file relative to a scan root.
    // NOTE(review): which root, and the separator convention, are set by the code that
    // builds the record — confirm there.
    pub relative_path: String,
    /// Detected language, if any.
    pub language: Option<Language>,
    /// File size in bytes.
    pub size_bytes: u64,
    /// Name of the detected text encoding, if one was determined.
    pub detected_encoding: Option<String>,
    /// Raw per-category line counts as produced by the language analyzer.
    pub raw_line_categories: RawLineCounts,
    /// Line counts reported for this file.
    pub effective_counts: EffectiveCounts,
    /// Final processing status of the file.
    pub status: FileStatus,
    /// Warnings attached to this file.
    pub warnings: Vec<String>,
    /// Whether the file was flagged as generated.
    pub generated: bool,
    /// Whether the file was flagged as minified.
    pub minified: bool,
    /// Whether the file was flagged as vendored.
    pub vendor: bool,
    /// Parse mode used by the analyzer, if the file was analyzed.
    pub parse_mode: Option<ParseMode>,
    /// Submodule label for this file, if any; omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub submodule: Option<String>,
    /// Line/function/branch coverage from an external LCOV file, when provided.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub coverage: Option<FileCoverage>,
    /// Lexical style-guide adherence analysis; `None` for unsupported languages.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub style_analysis: Option<StyleAnalysis>,
}
217
/// Per-language-family style aggregation within a `StyleSummary`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageStyleGroup {
    /// Display label, e.g. `"C / C++"`, `"Python"`, `"JavaScript"`.
    pub language_family: String,
    /// Number of files in this group.
    pub files_count: u32,
    /// Name of the guide with the highest average adherence.
    pub dominant_guide: String,
    /// Average adherence of the dominant guide (0–100).
    pub dominant_score_pct: u8,
    /// Most common indent style across the group.
    pub common_indent_style: String,
    /// Average guide adherence scores (guide name, 0–100) sorted descending.
    pub guide_avg_scores: Vec<(String, u8)>,
    /// Percentage of files (0–100) where ≤ 5 % of lines exceed the configured column threshold.
    // NOTE(review): `StyleSummary::line80_compliant_pct` is documented as "legacy, always 80",
    // which disagrees with "configured column threshold" here — confirm which threshold
    // this field actually uses and align the two descriptions.
    pub line80_compliant_pct: u8,
    /// Same as `line80_compliant_pct` but named for the actual configured threshold.
    pub line_col_compliant_pct: u8,
}
238
/// Aggregate multi-language style-guide adherence across all analysed files.
///
/// Also reachable through the `CppStyleSummary` type alias.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StyleSummary {
    /// Total files for which style data was produced.
    pub files_analyzed: u32,
    /// Most common indent style across *all* analysed files.
    pub common_indent_style: String,
    /// Percentage of all analysed files (0–100) with ≤ 5 % of lines over 80 chars (legacy, always 80).
    pub line80_compliant_pct: u8,
    /// Percentage of all analysed files (0–100) with ≤ 5 % of lines over `col_threshold` chars.
    pub line_col_compliant_pct: u8,
    /// Column-width threshold used for `line_col_compliant_pct` (from `analysis.style_col_threshold`).
    pub col_threshold: u16,
    /// Per-language-family breakdown, sorted by `files_count` descending.
    pub by_language: Vec<LanguageStyleGroup>,
}
255
/// Backward-compatible alias kept so that `sloc-report` and `sloc-web` can migrate
/// incrementally without a breaking change on the same release.
///
/// This is the same type as [`StyleSummary`]; the alias adds no fields or behavior.
pub type CppStyleSummary = StyleSummary;
259
/// Per-submodule aggregated stats produced when `submodule_breakdown` is enabled.
///
/// All `git_*` fields are optional: they are omitted from serialized output when
/// `None` and deserialize to `None` when absent from the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SubmoduleSummary {
    /// Submodule name.
    pub name: String,
    /// Path of the submodule relative to the scanned root.
    pub relative_path: String,
    /// Number of analyzed files belonging to this submodule.
    pub files_analyzed: u64,
    /// Total physical lines in this submodule.
    pub total_physical_lines: u64,
    /// Total code lines in this submodule.
    pub code_lines: u64,
    /// Total comment lines in this submodule.
    pub comment_lines: u64,
    /// Total blank lines in this submodule.
    pub blank_lines: u64,
    /// Per-language totals restricted to this submodule.
    pub language_summaries: Vec<LanguageSummary>,
    /// Short commit SHA (7 chars) of the submodule's own HEAD at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_short: Option<String>,
    /// Full commit SHA of the submodule's own HEAD at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_long: Option<String>,
    /// Branch name active in the submodule at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_branch: Option<String>,
    /// Author of the submodule's most recent commit at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_author: Option<String>,
    /// ISO 8601 author-date of the submodule's most recent commit.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_date: Option<String>,
    /// URL of the submodule's `origin` remote as recorded in its `.git/config`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_remote_url: Option<String>,
}
290
/// Complete serializable result of one scan: tool and environment metadata, the
/// configuration in effect, totals, per-file records and optional git/style data.
///
/// Optional fields carrying `#[serde(default, skip_serializing_if = …)]` are left out
/// of serialized output when empty/`None` and tolerate being absent on input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnalysisRun {
    /// Tool identity and run id/timestamp.
    pub tool: ToolMetadata,
    /// Machine/CI context of the run.
    pub environment: EnvironmentMetadata,
    /// Configuration that was in effect for the run.
    pub effective_configuration: AppConfig,
    /// Scan roots, as strings.
    pub input_roots: Vec<String>,
    /// Run-wide totals.
    pub summary_totals: SummaryTotals,
    /// Totals broken down by language.
    pub totals_by_language: Vec<LanguageSummary>,
    /// Records for files that were analyzed.
    // NOTE(review): the split between this list and `skipped_file_records` is made by
    // `push_record`, which is outside this section — confirm the exact rule there.
    pub per_file_records: Vec<FileRecord>,
    /// Records for files that were skipped.
    pub skipped_file_records: Vec<FileRecord>,
    /// Run-level warnings (e.g. discovery warnings).
    pub warnings: Vec<String>,
    /// Non-empty only when `discovery.submodule_breakdown` is enabled.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub submodule_summaries: Vec<SubmoduleSummary>,
    /// Short git commit SHA (7 chars) at scan time, if the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_short: Option<String>,
    /// Full git commit SHA at scan time, if the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_long: Option<String>,
    /// Git branch active at scan time, if the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_branch: Option<String>,
    /// Author of the last git commit at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_author: Option<String>,
    /// Comma-separated git tags pointing at HEAD at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_tags: Option<String>,
    /// Nearest ancestor release tag (output of `git describe --tags --abbrev=0`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_nearest_tag: Option<String>,
    /// ISO 8601 author-date of the last git commit at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_date: Option<String>,
    /// URL of the `origin` remote as recorded in `.git/config` at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_remote_url: Option<String>,
    /// Multi-language style-guide adherence; `None` when no supported files were analysed.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub style_summary: Option<StyleSummary>,
}
333
/// Git metadata gathered by `detect_git_for_run`.
///
/// Every field is `None` when the corresponding piece of information could not be
/// determined; `GitInfo::default()` is the "nothing known" value.
#[derive(Default)]
struct GitInfo {
    /// First 7 characters of `commit_long`.
    commit_short: Option<String>,
    /// Commit id HEAD points at: taken from the HEAD file itself when it holds a hex id
    /// (detached HEAD), otherwise resolved from the ref that HEAD names.
    commit_long: Option<String>,
    /// Branch named by a `refs/heads/…` HEAD, else the branch reported by CI env vars.
    branch: Option<String>,
    /// Output of `git log -1 --format=%an HEAD`.
    author: Option<String>,
    /// Non-empty lines of `git tag --points-at HEAD`, joined with `", "`.
    tags: Option<String>,
    /// Output of `git describe --tags --abbrev=0 HEAD`.
    nearest_tag: Option<String>,
    /// Output of `git log -1 --format=%aI HEAD`.
    commit_date: Option<String>,
    /// `url` value of the `[remote "origin"]` section in the git directory's `config`.
    remote_url: Option<String>,
}
345
346/// Locate the `.git` directory by walking up from `start`.
347/// Handles plain repos, worktrees (`.git` is a file with `gitdir:` pointer), and
348/// submodules. Returns `None` if no git repo is found.
349fn find_git_dir(start: &Path) -> Option<PathBuf> {
350    let mut current = Some(start);
351    while let Some(dir) = current {
352        let candidate = dir.join(".git");
353        if candidate.is_dir() {
354            return Some(candidate);
355        }
356        if candidate.is_file() {
357            if let Some(resolved) = resolve_git_file_pointer(&candidate, dir) {
358                return Some(resolved);
359            }
360        }
361        current = dir.parent();
362    }
363    None
364}
365
/// Follow a `.git` *file* (as used by worktrees and submodules) to the directory it
/// names.
///
/// `file` must contain `gitdir: <path>`; a relative `<path>` is interpreted against
/// `base_dir`. Returns `None` when the file cannot be read, does not start with the
/// `gitdir: ` prefix, or the target is not an existing directory.
fn resolve_git_file_pointer(file: &Path, base_dir: &Path) -> Option<PathBuf> {
    let raw = fs::read_to_string(file).ok()?;
    let target = raw.trim().strip_prefix("gitdir: ")?;

    // Git writes the pointer with forward slashes; convert to the platform separator
    // so that the `Path` operations below behave on Windows as well.
    let native = target.replace('/', std::path::MAIN_SEPARATOR_STR);
    let native_path = Path::new(&native);
    let joined = if native_path.is_absolute() {
        native_path.to_path_buf()
    } else {
        base_dir.join(native_path)
    };

    // Prefer the canonical form (".." and symlinks resolved); when canonicalization
    // fails, keep the joined path and let the directory check below decide.
    let git_dir = match joined.canonicalize() {
        Ok(canonical) => canonical,
        Err(_) => joined,
    };
    git_dir.is_dir().then_some(git_dir)
}
390
/// Resolve a git ref name (e.g. `refs/heads/main`) to the commit id it points at.
///
/// Lookup order:
/// 1. the loose ref file `<git_dir>/<refname>`;
/// 2. the `packed-refs` file in `git_dir`.
///
/// In both places a value is only accepted when it is at least 40 characters long and
/// consists solely of ASCII hex digits. Returns `None` when the ref is found in
/// neither place (or `packed-refs` cannot be read).
///
/// Lines in `packed-refs` that are comments (`#`), peeled-tag lines (`^`), blank, or
/// otherwise not of the form `<sha> <refname>` are skipped; they never abort the
/// search for later lines.
fn resolve_ref(git_dir: &Path, refname: &str) -> Option<String> {
    let looks_like_sha = |s: &str| s.len() >= 40 && s.chars().all(|c| c.is_ascii_hexdigit());

    // Join each forward-slash component individually so the loose-ref path uses the
    // platform's separator without any manual replacement.
    let loose_path = refname
        .split('/')
        .fold(git_dir.to_path_buf(), |path, part| path.join(part));
    // A missing or unreadable loose ref simply falls through to packed-refs.
    let loose = fs::read_to_string(&loose_path)
        .ok()
        .map(|s| s.trim().to_string())
        .filter(|s| looks_like_sha(s));
    if loose.is_some() {
        return loose;
    }

    // `str::lines()` handles both `\n` and `\r\n`, so Windows line endings are fine.
    let packed = fs::read_to_string(git_dir.join("packed-refs")).ok()?;
    packed
        .lines()
        .filter(|line| !line.starts_with('#') && !line.starts_with('^'))
        // `split_once` yields `None` for lines without a space (e.g. blank lines);
        // those are skipped instead of ending the whole lookup.
        .filter_map(|line| line.split_once(' '))
        .find(|(sha, name)| name.trim() == refname && looks_like_sha(sha))
        .map(|(sha, _)| sha.to_string())
}
426
/// Pull the value out of a git-config `url = <value>` line.
///
/// The line must start with `url`, optionally followed by spaces/tabs, then `=`.
/// The value is returned with surrounding whitespace trimmed; `None` is returned
/// when the line has a different shape or the value is empty.
fn parse_url_line(line: &str) -> Option<&str> {
    let after_key = line.strip_prefix("url")?;
    let after_eq = after_key
        .trim_start_matches(|c: char| c == ' ' || c == '\t')
        .strip_prefix('=')?;
    let value = after_eq.trim();
    (!value.is_empty()).then_some(value)
}
438
439/// Parse `.git/config` and return the URL of the `origin` remote, if present.
440fn read_git_remote_url(git_dir: &Path) -> Option<String> {
441    let config = fs::read_to_string(git_dir.join("config")).ok()?;
442    let mut in_origin = false;
443    for line in config.lines() {
444        let trimmed = line.trim();
445        if trimmed.starts_with('[') {
446            in_origin = trimmed == r#"[remote "origin"]"#;
447        } else if in_origin {
448            if let Some(url) = parse_url_line(trimmed) {
449                return Some(url.to_owned());
450            }
451        }
452    }
453    None
454}
455
456/// Detect git metadata by reading `.git/` files directly — no `git` executable
457/// needed. Falls back gracefully for detached HEADs, shallow clones, and missing
458/// reflogs.
459fn detect_git_for_run(project_path: &Path) -> GitInfo {
460    // Resolve the CI branch early so it can fill in any gap in git metadata.
461    let ci_branch = ci_branch_from_env();
462
463    let Some(git_dir) = find_git_dir(project_path) else {
464        // No .git directory (e.g. scanning a non-repo path in CI). Use whatever
465        // the CI system tells us about the branch.
466        return GitInfo {
467            branch: ci_branch,
468            ..GitInfo::default()
469        };
470    };
471
472    let head_raw = match fs::read_to_string(git_dir.join("HEAD")) {
473        Ok(s) => s.trim().to_string(),
474        Err(_) => {
475            return GitInfo {
476                branch: ci_branch,
477                ..GitInfo::default()
478            }
479        }
480    };
481
482    let (branch_from_head, commit_long) = head_raw.strip_prefix("ref: ").map_or_else(
483        || {
484            if head_raw.len() >= 40 && head_raw.chars().all(|c| c.is_ascii_hexdigit()) {
485                // Detached HEAD — HEAD file is the commit SHA (common in CI checkouts).
486                (None, Some(head_raw[..40].to_string()))
487            } else {
488                (None, None)
489            }
490        },
491        |refname| {
492            let branch = refname
493                .strip_prefix("refs/heads/")
494                .map(|b| b.trim().to_string());
495            let sha = resolve_ref(&git_dir, refname.trim());
496            (branch, sha)
497        },
498    );
499    // Prefer the branch name derived from the HEAD ref; fall back to the CI
500    // env var (covers detached-HEAD checkouts done by Jenkins, GitHub Actions, etc.).
501    let branch = branch_from_head.or(ci_branch);
502
503    let commit_short = commit_long
504        .as_deref()
505        .map(|s| s.chars().take(7).collect::<String>());
506
507    let author = run_git_cmd(project_path, &["log", "-1", "--format=%an", "HEAD"]);
508    let commit_date = run_git_cmd(project_path, &["log", "-1", "--format=%aI", "HEAD"]);
509    let remote_url = read_git_remote_url(&git_dir);
510
511    // Tags and nearest-tag still require git CLI — try it as a best-effort bonus
512    // but don't block on it. If git isn't available these will simply be None.
513    let tags = run_git_cmd(project_path, &["tag", "--points-at", "HEAD"]).map(|t| {
514        t.lines()
515            .filter(|l| !l.is_empty())
516            .collect::<Vec<_>>()
517            .join(", ")
518    });
519    let nearest_tag = run_git_cmd(project_path, &["describe", "--tags", "--abbrev=0", "HEAD"]);
520
521    GitInfo {
522        commit_short,
523        commit_long,
524        branch,
525        author,
526        tags,
527        nearest_tag,
528        commit_date,
529        remote_url,
530    }
531}
532
/// Run `git <args>` in `dir` and return its trimmed standard output.
///
/// This is a best-effort helper: `None` is returned when no candidate executable can
/// be started, the command exits unsuccessfully, the output is not valid UTF-8, or
/// the trimmed output is empty. Candidates are tried in order until one produces a
/// value.
fn run_git_cmd(dir: &Path, args: &[&str]) -> Option<String> {
    // The bare name works whenever git is on PATH; the absolute paths cover service
    // accounts with a stripped PATH. Paths for the "other" platform just fail to spawn.
    const GIT_CANDIDATES: [&str; 7] = [
        "git",
        // Common Linux / macOS install locations
        "/usr/bin/git",
        "/usr/local/bin/git",
        "/opt/homebrew/bin/git",
        // Git for Windows default installation paths
        r"C:\Program Files\Git\cmd\git.exe",
        r"C:\Program Files\Git\bin\git.exe",
        r"C:\Program Files (x86)\Git\cmd\git.exe",
    ];

    GIT_CANDIDATES.iter().find_map(|exe| {
        let output = std::process::Command::new(exe)
            // `-c safe.directory=*` is passed as a per-invocation config override.
            .args(["-c", "safe.directory=*"])
            .args(args)
            .current_dir(dir)
            .output()
            .ok()?;
        if !output.status.success() {
            return None;
        }
        let text = String::from_utf8(output.stdout).ok()?;
        let trimmed = text.trim();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed.to_string())
        }
    })
}
567
/// Return the name of the CI system if the process is running inside one.
///
/// Detection is based on well-known environment variables, checked in a fixed order
/// (first match wins):
/// * presence-only variables (`JENKINS_URL`, `JENKINS_HOME`, `BUILD_URL`,
///   `TEAMCITY_VERSION`) match as soon as they are set to any Unicode value;
/// * flag variables (`GITHUB_ACTIONS`, `GITLAB_CI`, `CIRCLECI`, `TRAVIS`, `TF_BUILD`)
///   match when their value equals `true`, compared ASCII-case-insensitively. The
///   case-insensitive comparison matters for Azure Pipelines, which sets
///   `TF_BUILD=True`.
///
/// Returns `None` when no variable matches.
fn detect_ci_system() -> Option<&'static str> {
    // (environment variable, value must be "true", CI system name)
    const PROBES: &[(&str, bool, &str)] = &[
        ("JENKINS_URL", false, "Jenkins"),
        ("JENKINS_HOME", false, "Jenkins"),
        ("BUILD_URL", false, "Jenkins"),
        ("GITHUB_ACTIONS", true, "GitHub Actions"),
        ("GITLAB_CI", true, "GitLab CI"),
        ("CIRCLECI", true, "CircleCI"),
        ("TRAVIS", true, "Travis CI"),
        ("TF_BUILD", true, "Azure DevOps"),
        ("TEAMCITY_VERSION", false, "TeamCity"),
    ];

    PROBES.iter().find_map(|&(var, must_be_true, name)| {
        let matched = match std::env::var(var) {
            Ok(value) => !must_be_true || value.eq_ignore_ascii_case("true"),
            // Unset or non-Unicode values never match.
            Err(_) => false,
        };
        matched.then_some(name)
    })
}
595
/// Look up the branch name advertised by the CI system through environment variables.
///
/// Used as a fallback when the branch cannot be taken from git's HEAD (detached HEAD
/// is common in CI checkouts). Variables are consulted in the order listed; for each
/// one the value is trimmed and a leading `refs/heads/` (or, failing that, `origin/`)
/// is removed. A candidate that ends up empty or equal to `HEAD` is ignored and the
/// next variable is tried. Returns `None` when no variable provides a usable name.
fn ci_branch_from_env() -> Option<String> {
    const VARS: &[&str] = &[
        "BRANCH_NAME",        // Jenkins Pipeline
        "GIT_BRANCH",         // Jenkins Freestyle (may carry "origin/<branch>")
        "GITHUB_REF_NAME",    // GitHub Actions
        "CI_COMMIT_BRANCH",   // GitLab CI
        "CIRCLE_BRANCH",      // CircleCI
        "TRAVIS_BRANCH",      // Travis CI
        "BUILD_SOURCEBRANCH", // Azure DevOps (may carry "refs/heads/<branch>")
    ];

    VARS.iter().find_map(|var| {
        let raw = std::env::var(var).ok()?;
        let trimmed = raw.trim();
        let branch = match trimmed.strip_prefix("refs/heads/") {
            Some(rest) => rest,
            None => trimmed.strip_prefix("origin/").unwrap_or(trimmed),
        };
        if branch.is_empty() || branch == "HEAD" {
            None
        } else {
            Some(branch.to_string())
        }
    })
}
622
/// Best-effort name of the user who started the process.
///
/// Checks the `USERNAME` environment variable and then `USER`. A variable that is
/// unset, not valid Unicode, or set to an empty string is skipped, so an empty
/// `USERNAME` no longer hides a usable `USER`. Returns `"unknown"` when neither
/// variable provides a value.
fn get_current_username() -> String {
    ["USERNAME", "USER"]
        .iter()
        .find_map(|var| std::env::var(var).ok().filter(|name| !name.is_empty()))
        .unwrap_or_else(|| "unknown".to_string())
}
628
/// Read environment variable `var`, treating an empty value like an unset one.
///
/// Returns `None` when the variable is unset, not valid Unicode, or the empty string;
/// otherwise returns the value unchanged (no trimming).
fn non_empty_env(var: &str) -> Option<String> {
    std::env::var(var).ok().filter(|value| !value.is_empty())
}
637
/// Whether any of the Jenkins marker variables (`JENKINS_URL`, `JENKINS_HOME`,
/// `BUILD_URL`) is set to a Unicode value. The value itself is not inspected.
fn is_jenkins_env() -> bool {
    ["JENKINS_URL", "JENKINS_HOME", "BUILD_URL"]
        .iter()
        .any(|marker| std::env::var(marker).is_ok())
}
643
644fn get_hostname() -> String {
645    // In CI environments prefer a human-readable agent/runner identifier over
646    // whatever hostname the container was assigned.
647    if is_jenkins_env() {
648        if let Some(n) = non_empty_env("NODE_NAME") {
649            return n;
650        }
651    }
652    if std::env::var("GITHUB_ACTIONS").as_deref() == Ok("true") {
653        if let Some(r) = non_empty_env("RUNNER_NAME") {
654            return r;
655        }
656    }
657    if std::env::var("GITLAB_CI").as_deref() == Ok("true") {
658        if let Some(r) = non_empty_env("CI_RUNNER_DESCRIPTION") {
659            return r;
660        }
661    }
662    std::env::var("COMPUTERNAME")
663        .or_else(|_| std::env::var("HOSTNAME"))
664        .or_else(|_| std::fs::read_to_string("/etc/hostname").map(|s| s.trim().to_string()))
665        .unwrap_or_else(|_| "unknown".to_string())
666}
667
668/// Walk a single directory root and collect file records into the output vectors.
669#[allow(clippy::too_many_arguments)]
670fn walk_root(
671    root: &Path,
672    config: &AppConfig,
673    include_globs: Option<&GlobSet>,
674    exclude_globs: Option<&GlobSet>,
675    enabled_languages: Option<&BTreeSet<Language>>,
676    seen_paths: &mut HashSet<PathBuf>,
677    analyzed: &mut Vec<FileRecord>,
678    skipped: &mut Vec<FileRecord>,
679    warnings: &mut Vec<String>,
680    cancel: Option<&AtomicBool>,
681    progress: Option<&ProgressCounters>,
682) -> Result<()> {
683    let mut builder = WalkBuilder::new(root);
684    builder
685        .follow_links(config.discovery.follow_symlinks)
686        .hidden(config.discovery.ignore_hidden_files)
687        .ignore(config.discovery.honor_ignore_files)
688        .parents(config.discovery.honor_ignore_files)
689        .git_ignore(config.discovery.honor_ignore_files)
690        .git_global(config.discovery.honor_ignore_files)
691        .git_exclude(config.discovery.honor_ignore_files);
692
693    let paths = collect_walk_paths(&builder, seen_paths, warnings);
694    if paths.is_empty() {
695        return Ok(());
696    }
697
698    if let Some(p) = progress {
699        p.files_total.fetch_add(paths.len(), Ordering::Relaxed);
700    }
701
702    let chunk_results = run_parallel_analysis(
703        &paths,
704        root,
705        config,
706        include_globs,
707        exclude_globs,
708        enabled_languages,
709        cancel,
710        progress,
711    )?;
712    merge_chunk_results(chunk_results, analyzed, skipped, warnings)
713}
714
715fn collect_walk_paths(
716    builder: &WalkBuilder,
717    seen_paths: &mut HashSet<PathBuf>,
718    warnings: &mut Vec<String>,
719) -> Vec<PathBuf> {
720    // build_parallel() walks the directory tree across multiple threads (work-stealing
721    // internally), which is meaningfully faster for deeply nested repos with many directories.
722    // We collect results via an MPSC channel so each walker thread sends without contention.
723    let (tx, rx) = std::sync::mpsc::channel::<std::result::Result<PathBuf, String>>();
724
725    builder.build_parallel().run(|| {
726        let tx = tx.clone();
727        Box::new(move |entry| {
728            match entry {
729                Err(e) => {
730                    let _ = tx.send(Err(format!("discovery warning: {e}")));
731                }
732                Ok(e) => {
733                    let path = e.into_path();
734                    if !path.is_dir() {
735                        let _ = tx.send(Ok(path));
736                    }
737                }
738            }
739            ignore::WalkState::Continue
740        })
741    });
742
743    // Drop the sender that the outer scope holds; the per-thread clones were dropped when
744    // run() returned (all threads finished). Dropping this last sender closes the channel.
745    drop(tx);
746
747    rx.into_iter()
748        .filter_map(|msg| match msg {
749            Ok(path) => {
750                if seen_paths.insert(path.clone()) {
751                    Some(path)
752                } else {
753                    None
754                }
755            }
756            Err(warn) => {
757                warnings.push(warn);
758                None
759            }
760        })
761        .collect()
762}
763
764/// Inner work loop executed by each analysis thread.
765#[allow(clippy::too_many_arguments)]
766fn worker_loop(
767    paths: &[PathBuf],
768    root: &Path,
769    config: &AppConfig,
770    include_globs: Option<&GlobSet>,
771    exclude_globs: Option<&GlobSet>,
772    enabled_languages: Option<&BTreeSet<Language>>,
773    cancel: Option<&AtomicBool>,
774    next_index: &AtomicUsize,
775    files_done: Option<&AtomicUsize>,
776) -> Vec<Result<Option<FileRecord>>> {
777    let mut results = Vec::new();
778    loop {
779        if cancel.is_some_and(|c| c.load(Ordering::Relaxed)) {
780            results.push(Err(anyhow::anyhow!("analysis cancelled")));
781            break;
782        }
783        let i = next_index.fetch_add(1, Ordering::Relaxed);
784        if i >= paths.len() {
785            break;
786        }
787        results.push(analyze_candidate_file(
788            &paths[i],
789            root,
790            config,
791            include_globs,
792            exclude_globs,
793            enabled_languages,
794        ));
795        if let Some(fd) = files_done {
796            fd.fetch_add(1, Ordering::Relaxed);
797        }
798    }
799    results
800}
801
802#[allow(clippy::too_many_arguments)]
803fn run_parallel_analysis(
804    paths: &[PathBuf],
805    root: &Path,
806    config: &AppConfig,
807    include_globs: Option<&GlobSet>,
808    exclude_globs: Option<&GlobSet>,
809    enabled_languages: Option<&BTreeSet<Language>>,
810    cancel: Option<&AtomicBool>,
811    progress: Option<&ProgressCounters>,
812) -> Result<Vec<Vec<Result<Option<FileRecord>>>>> {
813    let thread_count = std::thread::available_parallelism().map_or(DEFAULT_ANALYSIS_THREADS, |n| {
814        n.get().min(MAX_ANALYSIS_THREADS)
815    });
816    // Shared work-queue index: each thread atomically claims the next path to process.
817    // This eliminates static-chunk load imbalance — threads that finish early immediately
818    // pick up more work instead of sitting idle while one overloaded chunk finishes.
819    let next_index = AtomicUsize::new(0);
820    let files_done: Option<&AtomicUsize> = progress.map(|p| p.files_done.as_ref());
821
822    std::thread::scope(|s| -> Result<Vec<Vec<Result<Option<FileRecord>>>>> {
823        // IMPORTANT: collect ALL handles before joining any of them.
824        // A lazy spawn-then-join chain would serialize threads one at a time.
825        let mut handles = Vec::with_capacity(thread_count);
826        for _ in 0..thread_count {
827            handles.push(s.spawn(|| {
828                worker_loop(
829                    paths,
830                    root,
831                    config,
832                    include_globs,
833                    exclude_globs,
834                    enabled_languages,
835                    cancel,
836                    &next_index,
837                    files_done,
838                )
839            }));
840        }
841        handles
842            .into_iter()
843            .map(|h| {
844                h.join()
845                    .map_err(|_| anyhow::anyhow!("analysis thread panicked"))
846            })
847            .collect()
848    })
849}
850
851fn merge_chunk_results(
852    chunk_results: Vec<Vec<Result<Option<FileRecord>>>>,
853    analyzed: &mut Vec<FileRecord>,
854    skipped: &mut Vec<FileRecord>,
855    warnings: &mut Vec<String>,
856) -> Result<()> {
857    for chunk in chunk_results {
858        for result in chunk {
859            if let Some(record) = result? {
860                push_record(record, analyzed, skipped, warnings);
861            }
862        }
863    }
864    Ok(())
865}
866
867/// Label each analyzed file with its submodule and build per-submodule summaries.
868fn process_submodules(config: &AppConfig, analyzed: &mut [FileRecord]) -> Vec<SubmoduleSummary> {
869    let root = config.discovery.root_paths[0]
870        .canonicalize()
871        .unwrap_or_else(|_| config.discovery.root_paths[0].clone());
872    let submodules = detect_submodules(&root);
873    if submodules.is_empty() {
874        return Vec::new();
875    }
876
877    for file in analyzed.iter_mut() {
878        for (name, sub_path) in &submodules {
879            let prefix = sub_path.to_string_lossy().replace('\\', "/");
880            let rel = &file.relative_path;
881            if rel == &prefix || rel.starts_with(&format!("{prefix}/")) {
882                file.submodule = Some(name.clone());
883                break;
884            }
885        }
886    }
887
888    build_submodule_summaries(analyzed, &submodules, &root)
889}
890
891/// Assemble the final `AnalysisRun` from collected records and metadata.
892fn assemble_run(
893    config: &AppConfig,
894    runtime_mode: &str,
895    analyzed: Vec<FileRecord>,
896    skipped: Vec<FileRecord>,
897    warnings: Vec<String>,
898    submodule_summaries: Vec<SubmoduleSummary>,
899) -> AnalysisRun {
900    let summary = build_summary(&analyzed, &skipped);
901    let language_summaries = build_language_summaries(&analyzed);
902    let col_threshold = config.analysis.style_col_threshold;
903    let style_summary = build_style_summary(&analyzed, col_threshold);
904
905    let first_root = config
906        .discovery
907        .root_paths
908        .first()
909        .map(|p| p.canonicalize().unwrap_or_else(|_| p.clone()));
910    let git = first_root
911        .as_deref()
912        .map(detect_git_for_run)
913        .unwrap_or_default();
914
915    let now = Utc::now();
916    let run_id = {
917        let uuid_suffix = Uuid::new_v4().simple().to_string();
918        format!("{}-{}", now.format("%Y%m%d-%H%M"), uuid_suffix)
919    };
920
921    AnalysisRun {
922        tool: ToolMetadata {
923            name: "sloc".into(),
924            version: env!("CARGO_PKG_VERSION").into(),
925            run_id,
926            timestamp_utc: now,
927        },
928        environment: EnvironmentMetadata {
929            operating_system: std::env::consts::OS.into(),
930            architecture: std::env::consts::ARCH.into(),
931            runtime_mode: runtime_mode.into(),
932            initiator_username: get_current_username(),
933            initiator_hostname: get_hostname(),
934            ci_name: if is_jenkins_env() {
935                Some(format!("Jenkins\t{}", get_hostname()))
936            } else {
937                detect_ci_system().map(str::to_string)
938            },
939        },
940        effective_configuration: config.clone(),
941        input_roots: config
942            .discovery
943            .root_paths
944            .iter()
945            .map(|p| path_to_string(p))
946            .collect(),
947        summary_totals: summary,
948        totals_by_language: language_summaries,
949        per_file_records: analyzed,
950        skipped_file_records: skipped,
951        warnings,
952        submodule_summaries,
953        git_commit_short: git.commit_short,
954        git_commit_long: git.commit_long,
955        git_branch: git.branch,
956        git_commit_author: git.author,
957        git_tags: git.tags,
958        git_nearest_tag: git.nearest_tag,
959        git_commit_date: git.commit_date,
960        git_remote_url: git.remote_url,
961        style_summary,
962    }
963}
964
965/// # Errors
966///
967/// Returns an error if the config is invalid, root paths cannot be walked, or any file
968/// analysis step fails in a way that cannot be recovered from.
969#[allow(clippy::too_many_lines)]
970pub fn analyze(
971    config: &AppConfig,
972    runtime_mode: &str,
973    cancel: Option<&AtomicBool>,
974    progress: Option<&ProgressCounters>,
975) -> Result<AnalysisRun> {
976    config.validate()?;
977
978    if config.discovery.root_paths.is_empty() {
979        anyhow::bail!("no input paths were provided");
980    }
981
982    let include_globs = compile_globset(&config.discovery.include_globs)?;
983    let exclude_globs = compile_globset(&config.discovery.exclude_globs)?;
984    let enabled_languages = parse_enabled_languages(&config.analysis.enabled_languages)?;
985
986    let mut analyzed = Vec::new();
987    let mut skipped = Vec::new();
988    let mut warnings = Vec::new();
989    let mut seen_paths = HashSet::new();
990
991    for root in &config.discovery.root_paths {
992        if cancel.is_some_and(|c| c.load(Ordering::Relaxed)) {
993            anyhow::bail!("analysis cancelled");
994        }
995
996        let root = root.canonicalize().unwrap_or_else(|_| root.clone());
997
998        if root.is_file() {
999            if let Some(record) = analyze_candidate_file(
1000                &root,
1001                root.parent().unwrap_or_else(|| Path::new(".")),
1002                config,
1003                include_globs.as_ref(),
1004                exclude_globs.as_ref(),
1005                enabled_languages.as_ref(),
1006            )? {
1007                push_record(record, &mut analyzed, &mut skipped, &mut warnings);
1008            }
1009            continue;
1010        }
1011
1012        walk_root(
1013            &root,
1014            config,
1015            include_globs.as_ref(),
1016            exclude_globs.as_ref(),
1017            enabled_languages.as_ref(),
1018            &mut seen_paths,
1019            &mut analyzed,
1020            &mut skipped,
1021            &mut warnings,
1022            cancel,
1023            progress,
1024        )?;
1025    }
1026
1027    analyzed.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
1028    skipped.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
1029
1030    // Submodule detection: label each file with its submodule and build per-submodule summaries.
1031    let submodule_summaries = if config.discovery.submodule_breakdown {
1032        process_submodules(config, &mut analyzed)
1033    } else {
1034        Vec::new()
1035    };
1036
1037    attach_coverage(config, &mut analyzed, &mut warnings);
1038
1039    Ok(assemble_run(
1040        config,
1041        runtime_mode,
1042        analyzed,
1043        skipped,
1044        warnings,
1045        submodule_summaries,
1046    ))
1047}
1048
1049fn attach_coverage(config: &AppConfig, analyzed: &mut [FileRecord], warnings: &mut Vec<String>) {
1050    let Some(cov_path) = coverage::resolve_coverage_file(config.analysis.coverage_file.as_deref())
1051    else {
1052        return;
1053    };
1054    tracing::debug!(path = %cov_path.display(), "loading coverage file");
1055    match fs::read_to_string(&cov_path) {
1056        Ok(content) => {
1057            let cov_map = coverage::parse_coverage_auto(&cov_path, &content);
1058            let mut matched: u32 = 0;
1059            let mut unmatched: u32 = 0;
1060            for record in analyzed.iter_mut() {
1061                record.coverage =
1062                    coverage::lookup_coverage(&cov_map, &record.relative_path).cloned();
1063                if record.coverage.is_some() {
1064                    matched += 1;
1065                } else {
1066                    unmatched += 1;
1067                }
1068            }
1069            tracing::debug!(
1070                path = %cov_path.display(),
1071                coverage_entries = cov_map.len(),
1072                files_matched = matched,
1073                files_unmatched = unmatched,
1074                "coverage attached"
1075            );
1076            if unmatched > 0 && matched == 0 {
1077                tracing::warn!(
1078                    path = %cov_path.display(),
1079                    "coverage file loaded but no source files could be matched — check that paths in the coverage report match the scanned directory"
1080                );
1081            }
1082        }
1083        Err(e) => {
1084            tracing::warn!(path = %cov_path.display(), error = %e, "coverage file could not be read");
1085            warnings.push(format!(
1086                "coverage file '{}' could not be read: {e}",
1087                cov_path.display()
1088            ));
1089        }
1090    }
1091}
1092
1093fn push_record(
1094    record: FileRecord,
1095    analyzed: &mut Vec<FileRecord>,
1096    skipped: &mut Vec<FileRecord>,
1097    warnings: &mut Vec<String>,
1098) {
1099    warnings.extend(
1100        record
1101            .warnings
1102            .iter()
1103            .map(|warning| format!("{}: {warning}", record.relative_path)),
1104    );
1105
1106    match record.status {
1107        FileStatus::AnalyzedExact | FileStatus::AnalyzedBestEffort => analyzed.push(record),
1108        _ => skipped.push(record),
1109    }
1110}
1111
1112/// Convenience wrapper: build a boxed `Skip` outcome with a single-item warning message.
1113#[inline]
1114fn skip_with_reason(
1115    path: &Path,
1116    root: &Path,
1117    size: u64,
1118    reason: impl Into<String>,
1119) -> MetadataPolicyOutcome {
1120    MetadataPolicyOutcome::Skip(Box::new(skipped_record(
1121        path,
1122        root,
1123        size,
1124        FileStatus::SkippedByPolicy,
1125        vec![reason.into()],
1126    )))
1127}
1128
1129/// Apply metadata-level policy checks (symlink, name, dir exclusion, size, globs, lockfile).
1130/// Returns `Skip(record)` to skip, `Exclude` to omit from output entirely (include-glob miss),
1131/// or `Continue` to proceed to content checks.
1132#[allow(clippy::too_many_arguments)]
1133fn check_metadata_policy(
1134    path: &Path,
1135    root: &Path,
1136    relative_path: &str,
1137    metadata: &fs::Metadata,
1138    config: &AppConfig,
1139    include_globs: Option<&GlobSet>,
1140    exclude_globs: Option<&GlobSet>,
1141) -> MetadataPolicyOutcome {
1142    let size = metadata.len();
1143
1144    if metadata.file_type().is_symlink() && !config.discovery.follow_symlinks {
1145        return skip_with_reason(path, root, size, "symlink skipped by policy");
1146    }
1147    if file_name_eq(path, ".gitignore") {
1148        return skip_with_reason(path, root, size, ".gitignore is always excluded");
1149    }
1150    if is_excluded_dir_path(path, &config.discovery.excluded_directories) {
1151        return skip_with_reason(path, root, size, "path matched excluded directory setting");
1152    }
1153    if size > config.discovery.max_file_size_bytes {
1154        return skip_with_reason(
1155            path,
1156            root,
1157            size,
1158            format!(
1159                "file exceeded max_file_size_bytes ({})",
1160                config.discovery.max_file_size_bytes
1161            ),
1162        );
1163    }
1164    if let Some(globs) = include_globs {
1165        if !globs.is_match(Path::new(relative_path)) && !globs.is_match(path) {
1166            return MetadataPolicyOutcome::Exclude;
1167        }
1168    }
1169    if let Some(globs) = exclude_globs {
1170        if globs.is_match(Path::new(relative_path)) || globs.is_match(path) {
1171            return skip_with_reason(path, root, size, "path matched exclude glob");
1172        }
1173    }
1174    if is_known_lockfile(path) && !config.analysis.include_lockfiles {
1175        return skip_with_reason(path, root, size, "lockfile skipped by default policy");
1176    }
1177
1178    MetadataPolicyOutcome::Continue
1179}
1180
1181struct ContentPolicyResult {
1182    vendor: bool,
1183    generated: bool,
1184    minified: bool,
1185    skip_record: Option<FileRecord>,
1186}
1187
1188/// Apply content-level policy checks (vendor, generated, minified).
1189/// `skip_record` is `Some` when the file should be skipped.
1190fn check_content_policy(
1191    path: &Path,
1192    root: &Path,
1193    size_bytes: u64,
1194    bytes: &[u8],
1195    config: &AppConfig,
1196) -> ContentPolicyResult {
1197    let vendor = is_vendor_path(path);
1198    if vendor && config.analysis.vendor_directory_detection {
1199        return ContentPolicyResult {
1200            vendor,
1201            generated: false,
1202            minified: false,
1203            skip_record: Some(skipped_record(
1204                path,
1205                root,
1206                size_bytes,
1207                FileStatus::SkippedByPolicy,
1208                vec!["vendor file skipped by policy".into()],
1209            )),
1210        };
1211    }
1212
1213    let generated = config.analysis.generated_file_detection && looks_generated(path, bytes);
1214    if generated {
1215        return ContentPolicyResult {
1216            vendor,
1217            generated,
1218            minified: false,
1219            skip_record: Some(skipped_record(
1220                path,
1221                root,
1222                size_bytes,
1223                FileStatus::SkippedByPolicy,
1224                vec!["generated file skipped by policy".into()],
1225            )),
1226        };
1227    }
1228
1229    let minified = config.analysis.minified_file_detection && looks_minified(path, bytes);
1230    if minified {
1231        return ContentPolicyResult {
1232            vendor,
1233            generated,
1234            minified,
1235            skip_record: Some(skipped_record(
1236                path,
1237                root,
1238                size_bytes,
1239                FileStatus::SkippedByPolicy,
1240                vec!["minified file skipped by policy".into()],
1241            )),
1242        };
1243    }
1244
1245    ContentPolicyResult {
1246        vendor,
1247        generated,
1248        minified,
1249        skip_record: None,
1250    }
1251}
1252
1253/// Decode file bytes to a UTF-8 string, handling binary detection and decode failures.
1254fn decode_file_contents(
1255    path: &Path,
1256    root: &Path,
1257    size_bytes: u64,
1258    bytes: &[u8],
1259    config: &AppConfig,
1260) -> Result<Option<(String, String, Vec<String>)>> {
1261    if is_binary(bytes) {
1262        return match config.analysis.binary_file_behavior {
1263            BinaryFileBehavior::Skip => Ok(None),
1264            BinaryFileBehavior::Fail => {
1265                anyhow::bail!("binary file encountered: {}", path.display())
1266            }
1267        };
1268    }
1269
1270    match decode_bytes(bytes) {
1271        Ok(result) => Ok(Some(result)),
1272        Err(err) => match config.analysis.decode_failure_behavior {
1273            FailureBehavior::WarnSkip => {
1274                // Caller will handle the None as a SkippedDecodeError record.
1275                // We use a sentinel: return Ok(None) but encode the error into a field.
1276                // Instead, propagate as a skipped record via the caller.
1277                let _ = (path, root, size_bytes); // suppress unused warnings
1278                Err(anyhow::anyhow!("__decode_warn__: {err}"))
1279            }
1280            FailureBehavior::Fail => {
1281                anyhow::bail!("decode failure for {}: {err}", path.display())
1282            }
1283        },
1284    }
1285}
1286
1287#[allow(clippy::too_many_lines)]
1288fn analyze_candidate_file(
1289    path: &Path,
1290    root: &Path,
1291    config: &AppConfig,
1292    include_globs: Option<&GlobSet>,
1293    exclude_globs: Option<&GlobSet>,
1294    enabled_languages: Option<&BTreeSet<Language>>,
1295) -> Result<Option<FileRecord>> {
1296    let metadata = match fs::symlink_metadata(path) {
1297        Ok(metadata) => metadata,
1298        Err(err) => {
1299            return Ok(Some(skipped_record(
1300                path,
1301                root,
1302                0,
1303                FileStatus::ErrorInternal,
1304                vec![format!("failed to read metadata: {err}")],
1305            )));
1306        }
1307    };
1308
1309    let relative_path = relative_path_string(path, root);
1310
1311    // Metadata-level policy checks.
1312    match check_metadata_policy(
1313        path,
1314        root,
1315        &relative_path,
1316        &metadata,
1317        config,
1318        include_globs,
1319        exclude_globs,
1320    ) {
1321        MetadataPolicyOutcome::Skip(record) => return Ok(Some(*record)),
1322        MetadataPolicyOutcome::Exclude => return Ok(None),
1323        MetadataPolicyOutcome::Continue => {}
1324    }
1325
1326    let bytes = match fs::read(path) {
1327        Ok(bytes) => bytes,
1328        Err(err) => {
1329            return Ok(Some(skipped_record(
1330                path,
1331                root,
1332                metadata.len(),
1333                FileStatus::ErrorInternal,
1334                vec![format!("failed to read file: {err}")],
1335            )));
1336        }
1337    };
1338
1339    // Content-level policy checks (vendor, generated, minified).
1340    let content_policy = check_content_policy(path, root, metadata.len(), &bytes, config);
1341    if let Some(record) = content_policy.skip_record {
1342        return Ok(Some(record));
1343    }
1344    let (vendor, generated, minified) = (
1345        content_policy.vendor,
1346        content_policy.generated,
1347        content_policy.minified,
1348    );
1349
1350    // Decode content, handling binary and decode failures.
1351    let (text, encoding, decode_warnings) =
1352        match decode_file_contents(path, root, metadata.len(), &bytes, config) {
1353            Ok(Some(result)) => result,
1354            Ok(None) => {
1355                return Ok(Some(skipped_record(
1356                    path,
1357                    root,
1358                    metadata.len(),
1359                    FileStatus::SkippedBinary,
1360                    vec!["binary file skipped by default".into()],
1361                )));
1362            }
1363            Err(err) => {
1364                let msg = err.to_string();
1365                if let Some(warn_msg) = msg.strip_prefix("__decode_warn__: ") {
1366                    return Ok(Some(skipped_record(
1367                        path,
1368                        root,
1369                        metadata.len(),
1370                        FileStatus::SkippedDecodeError,
1371                        vec![warn_msg.to_string()],
1372                    )));
1373                }
1374                return Err(err);
1375            }
1376        };
1377
1378    let first_line = text.lines().next();
1379    let language = detect_language(
1380        path,
1381        first_line,
1382        &config.analysis.extension_overrides,
1383        config.analysis.shebang_detection,
1384    );
1385
1386    let Some(language) = language else {
1387        return Ok(Some(skipped_record(
1388            path,
1389            root,
1390            metadata.len(),
1391            FileStatus::SkippedUnsupported,
1392            vec!["unsupported or undetected language".into()],
1393        )));
1394    };
1395
1396    if let Some(enabled) = enabled_languages {
1397        if !enabled.contains(&language) {
1398            return Ok(Some(skipped_record(
1399                path,
1400                root,
1401                metadata.len(),
1402                FileStatus::SkippedByPolicy,
1403                vec![format!(
1404                    "language {} disabled by configuration",
1405                    language.display_name()
1406                )],
1407            )));
1408        }
1409    }
1410
1411    let style_scope = match config.analysis.style_lang_scope.as_str() {
1412        "c_family" => StyleLangScope::CFamilyOnly,
1413        _ => StyleLangScope::All,
1414    };
1415    let ieee_opts = AnalysisOptions {
1416        blank_in_block_comment_as_comment: config.analysis.blank_in_block_comment_policy
1417            == BlankInBlockCommentPolicy::CountAsComment,
1418        collapse_continuation_lines: config.analysis.continuation_line_policy
1419            == ContinuationLinePolicy::CollapseToLogical,
1420        enable_style: config.analysis.style_analysis_enabled,
1421        style_lang_scope: style_scope,
1422    };
1423    let analysis = analyze_text(language, &text, ieee_opts);
1424    let effective_counts = compute_effective_counts(
1425        &analysis.raw,
1426        config.analysis.mixed_line_policy,
1427        config.analysis.python_docstrings_as_comments,
1428        config.analysis.count_compiler_directives,
1429    );
1430
1431    let mut warnings = decode_warnings;
1432    warnings.extend(analysis.warnings.clone());
1433
1434    Ok(Some(FileRecord {
1435        path: path_to_string(path),
1436        relative_path,
1437        language: Some(language),
1438        size_bytes: metadata.len(),
1439        detected_encoding: Some(encoding),
1440        raw_line_categories: analysis.raw,
1441        effective_counts,
1442        status: match analysis.parse_mode {
1443            ParseMode::Lexical | ParseMode::TreeSitter => FileStatus::AnalyzedExact,
1444            ParseMode::LexicalBestEffort => FileStatus::AnalyzedBestEffort,
1445        },
1446        warnings,
1447        generated,
1448        minified,
1449        vendor,
1450        parse_mode: Some(analysis.parse_mode),
1451        submodule: None,
1452        coverage: None,
1453        style_analysis: analysis.style_analysis,
1454    }))
1455}
1456
1457const fn compute_effective_counts(
1458    raw: &RawLineCounts,
1459    mixed_line_policy: MixedLinePolicy,
1460    python_docstrings_as_comments: bool,
1461    count_compiler_directives: bool,
1462) -> EffectiveCounts {
1463    let mut effective = EffectiveCounts {
1464        code_lines: raw.code_only_lines,
1465        comment_lines: raw.single_comment_only_lines + raw.multi_comment_only_lines,
1466        blank_lines: raw.blank_only_lines,
1467        mixed_lines_separate: 0,
1468    };
1469
1470    if python_docstrings_as_comments {
1471        effective.comment_lines += raw.docstring_comment_lines;
1472    } else {
1473        effective.code_lines += raw.docstring_comment_lines;
1474    }
1475
1476    let mixed_total = raw.mixed_code_single_comment_lines + raw.mixed_code_multi_comment_lines;
1477    match mixed_line_policy {
1478        MixedLinePolicy::CodeOnly => effective.code_lines += mixed_total,
1479        MixedLinePolicy::CodeAndComment => {
1480            effective.code_lines += mixed_total;
1481            effective.comment_lines += mixed_total;
1482        }
1483        MixedLinePolicy::CommentOnly => effective.comment_lines += mixed_total,
1484        MixedLinePolicy::SeparateMixedCategory => effective.mixed_lines_separate += mixed_total,
1485    }
1486
1487    // IEEE 1045-1992 §4.2: optionally exclude preprocessor/compiler directives from code SLOC.
1488    // compiler_directive_lines is a subset of code_only_lines, so subtract it directly.
1489    if !count_compiler_directives {
1490        effective.code_lines = effective
1491            .code_lines
1492            .saturating_sub(raw.compiler_directive_lines);
1493    }
1494
1495    effective
1496}
1497
1498fn build_summary(analyzed: &[FileRecord], skipped: &[FileRecord]) -> SummaryTotals {
1499    let mut summary = SummaryTotals {
1500        files_considered: (analyzed.len() + skipped.len()) as u64,
1501        files_analyzed: analyzed.len() as u64,
1502        files_skipped: skipped.len() as u64,
1503        ..Default::default()
1504    };
1505
1506    for record in analyzed {
1507        summary.total_physical_lines += record.raw_line_categories.total_physical_lines;
1508        summary.code_lines += record.effective_counts.code_lines;
1509        summary.comment_lines += record.effective_counts.comment_lines;
1510        summary.blank_lines += record.effective_counts.blank_lines;
1511        summary.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
1512        summary.functions += record.raw_line_categories.functions;
1513        summary.classes += record.raw_line_categories.classes;
1514        summary.variables += record.raw_line_categories.variables;
1515        summary.imports += record.raw_line_categories.imports;
1516        summary.test_count += record.raw_line_categories.test_count;
1517        summary.test_assertion_count += record.raw_line_categories.test_assertion_count;
1518        summary.test_suite_count += record.raw_line_categories.test_suite_count;
1519        if let Some(cov) = &record.coverage {
1520            summary.coverage_lines_found += u64::from(cov.lines_found);
1521            summary.coverage_lines_hit += u64::from(cov.lines_hit);
1522            summary.coverage_functions_found += u64::from(cov.functions_found);
1523            summary.coverage_functions_hit += u64::from(cov.functions_hit);
1524            summary.coverage_branches_found += u64::from(cov.branches_found);
1525            summary.coverage_branches_hit += u64::from(cov.branches_hit);
1526        }
1527    }
1528
1529    summary
1530}
1531
1532/// Construct a zero-filled `LanguageSummary` for the given language.
1533const fn zeroed_summary(language: Language) -> LanguageSummary {
1534    LanguageSummary {
1535        language,
1536        files: 0,
1537        total_physical_lines: 0,
1538        code_lines: 0,
1539        comment_lines: 0,
1540        blank_lines: 0,
1541        mixed_lines_separate: 0,
1542        functions: 0,
1543        classes: 0,
1544        variables: 0,
1545        imports: 0,
1546        test_count: 0,
1547        test_assertion_count: 0,
1548        test_suite_count: 0,
1549        coverage_lines_found: 0,
1550        coverage_lines_hit: 0,
1551        coverage_functions_found: 0,
1552        coverage_functions_hit: 0,
1553        coverage_branches_found: 0,
1554        coverage_branches_hit: 0,
1555    }
1556}
1557
1558/// Accumulate all per-file counters from `record` into an existing `LanguageSummary`.
1559fn accumulate_record_into_summary(entry: &mut LanguageSummary, record: &FileRecord) {
1560    entry.files += 1;
1561    let r = &record.raw_line_categories;
1562    entry.total_physical_lines += r.total_physical_lines;
1563    entry.code_lines += record.effective_counts.code_lines;
1564    entry.comment_lines += record.effective_counts.comment_lines;
1565    entry.blank_lines += record.effective_counts.blank_lines;
1566    entry.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
1567    entry.functions += r.functions;
1568    entry.classes += r.classes;
1569    entry.variables += r.variables;
1570    entry.imports += r.imports;
1571    entry.test_count += r.test_count;
1572    entry.test_assertion_count += r.test_assertion_count;
1573    entry.test_suite_count += r.test_suite_count;
1574    if let Some(cov) = &record.coverage {
1575        entry.coverage_lines_found += u64::from(cov.lines_found);
1576        entry.coverage_lines_hit += u64::from(cov.lines_hit);
1577        entry.coverage_functions_found += u64::from(cov.functions_found);
1578        entry.coverage_functions_hit += u64::from(cov.functions_hit);
1579        entry.coverage_branches_found += u64::from(cov.branches_found);
1580        entry.coverage_branches_hit += u64::from(cov.branches_hit);
1581    }
1582}
1583
1584fn build_language_summaries(analyzed: &[FileRecord]) -> Vec<LanguageSummary> {
1585    let mut by_language: BTreeMap<Language, LanguageSummary> = BTreeMap::new();
1586    for record in analyzed {
1587        let Some(language) = record.language else {
1588            continue;
1589        };
1590        let entry = by_language
1591            .entry(language)
1592            .or_insert_with(|| zeroed_summary(language));
1593        accumulate_record_into_summary(entry, record);
1594    }
1595    by_language.into_values().collect()
1596}
1597
1598fn skipped_record(
1599    path: &Path,
1600    root: &Path,
1601    size_bytes: u64,
1602    status: FileStatus,
1603    warnings: Vec<String>,
1604) -> FileRecord {
1605    FileRecord {
1606        path: path_to_string(path),
1607        relative_path: relative_path_string(path, root),
1608        language: None,
1609        size_bytes,
1610        detected_encoding: None,
1611        raw_line_categories: RawLineCounts::default(),
1612        effective_counts: EffectiveCounts::default(),
1613        status,
1614        warnings,
1615        generated: false,
1616        minified: false,
1617        vendor: false,
1618        parse_mode: None,
1619        submodule: None,
1620        coverage: None,
1621        style_analysis: None,
1622    }
1623}
1624
/// Render `path` relative to `root` with forward slashes.
///
/// When `path` is not located under `root`, the whole path is rendered instead.
fn relative_path_string(path: &Path, root: &Path) -> String {
    let relative = match path.strip_prefix(root) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };
    let rendered = relative.to_string_lossy();
    rendered.replace('\\', "/")
}
1631
/// Render `path` as a (lossy) string with every backslash turned into a forward slash.
fn path_to_string(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
1635
/// Parse `.gitmodules` in `root` and return `(name, relative_path)` for each submodule found.
///
/// Only `[submodule "<name>"]` sections that contain a `path = <value>` entry are
/// reported, in file order. A missing or unreadable `.gitmodules` yields an empty list.
/// Malformed section headers (for example `[submodule "]`) are ignored rather than
/// causing a panic.
#[must_use]
pub fn detect_submodules(root: &Path) -> Vec<(String, PathBuf)> {
    // A missing file, a directory, or invalid UTF-8 all surface as a read error.
    let Ok(content) = fs::read_to_string(root.join(".gitmodules")) else {
        return Vec::new();
    };

    let mut result = Vec::new();
    let mut current_name: Option<String> = None;
    let mut current_path: Option<PathBuf> = None;

    for line in content.lines() {
        let trimmed = line.trim();

        // Stripping prefix then suffix guarantees the two delimiters do not overlap,
        // which manual slicing by length could not.
        let header = trimmed
            .strip_prefix("[submodule \"")
            .and_then(|rest| rest.strip_suffix("\"]"));

        if let Some(name) = header {
            // A new section starts: flush the previous one if it was complete.
            if let (Some(prev_name), Some(prev_path)) = (current_name.take(), current_path.take())
            {
                result.push((prev_name, prev_path));
            }
            current_name = Some(name.to_string());
        } else if let Some((key, value)) = trimmed.split_once('=') {
            // Match the `path` key exactly (git config variable names are
            // case-insensitive) so that other keys starting with "path" are not mistaken
            // for it.
            if key.trim().eq_ignore_ascii_case("path") {
                current_path = Some(PathBuf::from(value.trim()));
            }
        }
    }
    if let (Some(name), Some(path)) = (current_name, current_path) {
        result.push((name, path));
    }

    result
}
1672
1673fn build_submodule_summaries(
1674    analyzed: &[FileRecord],
1675    submodules: &[(String, PathBuf)],
1676    root: &Path,
1677) -> Vec<SubmoduleSummary> {
1678    submodules
1679        .iter()
1680        .map(|(name, path)| {
1681            let files: Vec<&FileRecord> = analyzed
1682                .iter()
1683                .filter(|f| f.submodule.as_deref() == Some(name.as_str()))
1684                .collect();
1685
1686            let files_analyzed = files.len() as u64;
1687            let total_physical_lines = files
1688                .iter()
1689                .map(|f| f.raw_line_categories.total_physical_lines)
1690                .sum();
1691            let code_lines = files.iter().map(|f| f.effective_counts.code_lines).sum();
1692            let comment_lines = files.iter().map(|f| f.effective_counts.comment_lines).sum();
1693            let blank_lines = files.iter().map(|f| f.effective_counts.blank_lines).sum();
1694            let language_summaries = build_language_summaries_from_slice(&files);
1695
1696            let git = detect_git_for_run(&root.join(path));
1697
1698            SubmoduleSummary {
1699                name: name.clone(),
1700                relative_path: path.to_string_lossy().replace('\\', "/"),
1701                files_analyzed,
1702                total_physical_lines,
1703                code_lines,
1704                comment_lines,
1705                blank_lines,
1706                language_summaries,
1707                git_commit_short: git.commit_short,
1708                git_commit_long: git.commit_long,
1709                git_branch: git.branch,
1710                git_commit_author: git.author,
1711                git_commit_date: git.commit_date,
1712                git_remote_url: git.remote_url,
1713            }
1714        })
1715        .filter(|s| s.files_analyzed > 0)
1716        .collect()
1717}
1718
1719/// Dominant indent label from vote counts.
1720#[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
1721fn dominant_indent_label(files: &[&StyleAnalysis]) -> String {
1722    let mut votes = [0u32; 6];
1723    for f in files {
1724        let idx = match f.indent_style {
1725            IndentStyle::Tabs => 0,
1726            IndentStyle::Spaces2 => 1,
1727            IndentStyle::Spaces4 => 2,
1728            IndentStyle::Spaces8 => 3,
1729            IndentStyle::Mixed => 4,
1730            IndentStyle::Unknown => 5,
1731        };
1732        votes[idx] += 1;
1733    }
1734    let labels = ["Tabs", "2-Space", "4-Space", "8-Space", "Mixed", "\u{2014}"];
1735    labels[votes
1736        .iter()
1737        .enumerate()
1738        .max_by_key(|(_, v)| *v)
1739        .map_or(5, |(i, _)| i)]
1740    .to_string()
1741}
1742
1743/// Line-80 compliance percentage for a slice of style analyses.
1744#[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
1745fn line80_pct(files: &[&StyleAnalysis]) -> u8 {
1746    if files.is_empty() {
1747        return 0;
1748    }
1749    let compliant = files
1750        .iter()
1751        .filter(|f| f.total_lines == 0 || (f.lines_over_80 as f32 / f.total_lines as f32) <= 0.05)
1752        .count() as u32;
1753    ((compliant * 100) / files.len() as u32) as u8
1754}
1755
1756/// Column-N compliance percentage using the configured threshold (80, 100, or 120).
1757/// Falls back to the 80-col bucket for any threshold ≤ 80.
1758#[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
1759fn line_col_pct(files: &[&StyleAnalysis], threshold: u16) -> u8 {
1760    if files.is_empty() {
1761        return 0;
1762    }
1763    let compliant = files
1764        .iter()
1765        .filter(|f| {
1766            let over = if threshold <= 80 {
1767                f.lines_over_80
1768            } else if threshold <= 100 {
1769                f.lines_over_100
1770            } else {
1771                f.lines_over_120
1772            };
1773            f.total_lines == 0 || (over as f32 / f.total_lines as f32) <= 0.05
1774        })
1775        .count() as u32;
1776    ((compliant * 100) / files.len() as u32) as u8
1777}
1778
1779/// Build a `LanguageStyleGroup` from a non-empty slice of `StyleAnalysis` for one family.
1780#[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
1781fn build_language_group(
1782    family: &str,
1783    files: &[&StyleAnalysis],
1784    col_threshold: u16,
1785) -> LanguageStyleGroup {
1786    let count = files.len() as u32;
1787
1788    // Collect every unique guide name across all files in this group.
1789    let mut all_names: Vec<String> = Vec::new();
1790    for f in files {
1791        for g in &f.guide_scores {
1792            if !all_names.contains(&g.name) {
1793                all_names.push(g.name.clone());
1794            }
1795        }
1796    }
1797
1798    let mut guide_avg_scores: Vec<(String, u8)> = all_names
1799        .into_iter()
1800        .map(|name| {
1801            let sum: u32 = files
1802                .iter()
1803                .filter_map(|f| f.guide_scores.iter().find(|g| g.name == name))
1804                .map(|g| u32::from(g.score_pct))
1805                .sum();
1806            let avg = (sum / count) as u8;
1807            (name, avg)
1808        })
1809        .collect();
1810    guide_avg_scores.sort_by_key(|s| std::cmp::Reverse(s.1));
1811
1812    let (dominant_guide, dominant_score_pct) = guide_avg_scores
1813        .first()
1814        .map(|(n, s)| (n.clone(), *s))
1815        .unwrap_or_default();
1816
1817    let lcp = line_col_pct(files, col_threshold);
1818    LanguageStyleGroup {
1819        language_family: family.to_string(),
1820        files_count: count,
1821        dominant_guide,
1822        dominant_score_pct,
1823        common_indent_style: dominant_indent_label(files),
1824        guide_avg_scores,
1825        line80_compliant_pct: line80_pct(files),
1826        line_col_compliant_pct: lcp,
1827    }
1828}
1829
1830/// Build aggregate multi-language style-guide adherence.
1831/// Returns `None` when no files had style data.
1832#[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
1833fn build_style_summary(analyzed: &[FileRecord], col_threshold: u16) -> Option<StyleSummary> {
1834    let all_style: Vec<&StyleAnalysis> = analyzed
1835        .iter()
1836        .filter_map(|f| f.style_analysis.as_ref())
1837        .collect();
1838
1839    if all_style.is_empty() {
1840        return None;
1841    }
1842
1843    // Group by language_family.
1844    let mut families: std::collections::BTreeMap<&str, Vec<&StyleAnalysis>> =
1845        std::collections::BTreeMap::new();
1846    for sa in &all_style {
1847        families
1848            .entry(sa.language_family.as_str())
1849            .or_default()
1850            .push(sa);
1851    }
1852
1853    let mut by_language: Vec<LanguageStyleGroup> = families
1854        .iter()
1855        .map(|(family, files)| build_language_group(family, files, col_threshold))
1856        .collect();
1857    by_language.sort_by_key(|g| std::cmp::Reverse(g.files_count));
1858
1859    let files_analyzed = all_style.len() as u32;
1860    let common_indent_style = dominant_indent_label(&all_style);
1861    let line80_compliant_pct = line80_pct(&all_style);
1862    let line_col_compliant_pct = line_col_pct(&all_style, col_threshold);
1863
1864    Some(StyleSummary {
1865        files_analyzed,
1866        common_indent_style,
1867        line80_compliant_pct,
1868        line_col_compliant_pct,
1869        col_threshold,
1870        by_language,
1871    })
1872}
1873
1874fn build_language_summaries_from_slice(files: &[&FileRecord]) -> Vec<LanguageSummary> {
1875    let mut map: BTreeMap<String, LanguageSummary> = BTreeMap::new();
1876    for file in files {
1877        let Some(lang) = file.language else { continue };
1878        let entry = map
1879            .entry(lang.display_name().to_string())
1880            .or_insert_with(|| zeroed_summary(lang));
1881        accumulate_record_into_summary(entry, file);
1882    }
1883    map.into_values().collect()
1884}
1885
/// Report whether the final component of `path` is exactly `expected`.
///
/// Paths without a file name, or whose file name is not valid UTF-8, never match.
fn file_name_eq(path: &Path, expected: &str) -> bool {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(actual) => actual == expected,
        None => false,
    }
}
1891
/// Report whether any component of `path` equals one of `excluded_dirs`.
///
/// Components are compared whole, so `targets` does not match an excluded `target`.
/// Components that are not valid UTF-8 are skipped.
fn is_excluded_dir_path(path: &Path, excluded_dirs: &[String]) -> bool {
    for component in path.components() {
        let Some(part) = component.as_os_str().to_str() else {
            continue;
        };
        if excluded_dirs.iter().any(|excluded| excluded.as_str() == part) {
            return true;
        }
    }
    false
}
1900
/// Report whether `path` passes through a vendored-dependency directory.
///
/// A path qualifies when any whole component is `vendor`, `node_modules`, or `packages`.
fn is_vendor_path(path: &Path) -> bool {
    const VENDOR_DIR_NAMES: [&str; 3] = ["vendor", "node_modules", "packages"];

    path.components()
        .filter_map(|component| component.as_os_str().to_str())
        .any(|part| VENDOR_DIR_NAMES.contains(&part))
}
1909
/// Report whether the file name of `path` is one of the recognized dependency lockfiles.
///
/// The comparison is exact and case-sensitive, and only the final path component is
/// considered.
fn is_known_lockfile(path: &Path) -> bool {
    const LOCKFILE_NAMES: [&str; 7] = [
        "Cargo.lock",
        "package-lock.json",
        "yarn.lock",
        "pnpm-lock.yaml",
        "Pipfile.lock",
        "poetry.lock",
        "composer.lock",
    ];

    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => LOCKFILE_NAMES.contains(&name),
        None => false,
    }
}
1926
1927fn looks_generated(path: &Path, bytes: &[u8]) -> bool {
1928    let file_name = path
1929        .file_name()
1930        .and_then(|name| name.to_str())
1931        .unwrap_or_default();
1932    if file_name.contains(".generated.") || file_name.contains(".g.") {
1933        return true;
1934    }
1935
1936    let sample = String::from_utf8_lossy(&bytes[..bytes.len().min(GENERATED_SAMPLE_BYTES)])
1937        .to_ascii_lowercase();
1938    sample.contains("@generated") || sample.contains("generated by")
1939}
1940
1941fn looks_minified(path: &Path, bytes: &[u8]) -> bool {
1942    let file_name = path
1943        .file_name()
1944        .and_then(|name| name.to_str())
1945        .unwrap_or_default();
1946    if file_name.contains(".min.") {
1947        return true;
1948    }
1949
1950    let sample = String::from_utf8_lossy(&bytes[..bytes.len().min(MINIFIED_SAMPLE_BYTES)]);
1951    let longest_line = sample.lines().map(str::len).max().unwrap_or(0);
1952    let whitespace = sample.chars().filter(|c| c.is_whitespace()).count();
1953    longest_line > MINIFIED_LINE_THRESHOLD && whitespace * 100 < sample.len().max(1)
1954}
1955
1956fn is_binary(bytes: &[u8]) -> bool {
1957    if bytes.starts_with(&[0xEF, 0xBB, 0xBF])
1958        || bytes.starts_with(&[0xFF, 0xFE])
1959        || bytes.starts_with(&[0xFE, 0xFF])
1960    {
1961        return false;
1962    }
1963
1964    let sample = &bytes[..bytes.len().min(BINARY_SAMPLE_BYTES)];
1965    sample.contains(&0)
1966}
1967
1968/// Decode a BOM-stripped UTF-16 byte slice using the given encoding.
1969/// Returns `(text, encoding_label, warnings)`.
1970fn decode_utf16_bom(
1971    bom_stripped: &[u8],
1972    encoding: &'static encoding_rs::Encoding,
1973    label: &str,
1974) -> (String, String, Vec<String>) {
1975    let (cow, _, had_errors) = encoding.decode(bom_stripped);
1976    let mut warnings = Vec::new();
1977    if had_errors {
1978        warnings.push(format!("{label} decode contained replacement characters"));
1979    }
1980    (cow.into_owned(), label.into(), warnings)
1981}
1982
1983fn decode_bytes(bytes: &[u8]) -> std::result::Result<(String, String, Vec<String>), String> {
1984    if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
1985        let text = String::from_utf8(bytes[3..].to_vec()).map_err(|err| err.to_string())?;
1986        return Ok((text, "utf-8-bom".into(), vec![]));
1987    }
1988    if bytes.starts_with(&[0xFF, 0xFE]) {
1989        return Ok(decode_utf16_bom(&bytes[2..], UTF_16LE, "utf-16le"));
1990    }
1991    if bytes.starts_with(&[0xFE, 0xFF]) {
1992        return Ok(decode_utf16_bom(&bytes[2..], UTF_16BE, "utf-16be"));
1993    }
1994
1995    // Multiple statements in the else branch make map_or_else awkward here.
1996    #[allow(clippy::option_if_let_else)]
1997    if let Ok(text) = String::from_utf8(bytes.to_vec()) {
1998        Ok((text, "utf-8".into(), vec![]))
1999    } else {
2000        let (cow, _, had_errors) = WINDOWS_1252.decode(bytes);
2001        let mut warnings = vec!["decoded using windows-1252 fallback".into()];
2002        if had_errors {
2003            warnings.push("fallback decode contained replacement characters".into());
2004        }
2005        Ok((cow.into_owned(), "windows-1252".into(), warnings))
2006    }
2007}
2008
2009fn compile_globset(patterns: &[String]) -> Result<Option<GlobSet>> {
2010    if patterns.is_empty() {
2011        return Ok(None);
2012    }
2013
2014    let mut builder = GlobSetBuilder::new();
2015    for pattern in patterns {
2016        builder
2017            .add(Glob::new(pattern).with_context(|| format!("invalid glob pattern: {pattern}"))?);
2018    }
2019    Ok(Some(
2020        builder.build().context("failed to compile glob filters")?,
2021    ))
2022}
2023
2024fn parse_enabled_languages(enabled: &[String]) -> Result<Option<BTreeSet<Language>>> {
2025    if enabled.is_empty() {
2026        return Ok(None);
2027    }
2028
2029    let supported = supported_languages();
2030    let mut set = BTreeSet::new();
2031    for name in enabled {
2032        let language = Language::from_name(name)
2033            .with_context(|| format!("unsupported language in config: {name}"))?;
2034        if !supported.contains(&language) {
2035            anyhow::bail!("language {name} is not supported in this build");
2036        }
2037        set.insert(language);
2038    }
2039    Ok(Some(set))
2040}
2041
2042/// # Errors
2043///
2044/// Returns an error if serialization fails or the output file cannot be written.
2045pub fn write_json(run: &AnalysisRun, output_path: &Path) -> Result<()> {
2046    let json = serde_json::to_string_pretty(run).context("failed to serialize analysis run")?;
2047    fs::write(output_path, json)
2048        .with_context(|| format!("failed to write JSON output to {}", output_path.display()))
2049}
2050
2051/// # Errors
2052///
2053/// Returns an error if the file cannot be read or the JSON cannot be parsed.
2054pub fn read_json(path: &Path) -> Result<AnalysisRun> {
2055    let contents = fs::read_to_string(path)
2056        .with_context(|| format!("failed to read result file {}", path.display()))?;
2057    serde_json::from_str(&contents)
2058        .with_context(|| format!("failed to parse JSON result {}", path.display()))
2059}
2060
#[cfg(test)]
mod tests {
    use super::*;

    // ── Effective-count policies ─────────────────────────────────────────────

    /// With `CodeOnly`, the sample counts are expected to yield 5 code and 3 comment lines.
    #[test]
    fn effective_counts_respect_code_only_policy() {
        let raw_counts = RawLineCounts {
            code_only_lines: 2,
            single_comment_only_lines: 1,
            mixed_code_single_comment_lines: 3,
            docstring_comment_lines: 2,
            ..RawLineCounts::default()
        };
        let effective =
            compute_effective_counts(&raw_counts, MixedLinePolicy::CodeOnly, true, true);
        assert_eq!(effective.code_lines, 5);
        assert_eq!(effective.comment_lines, 3);
    }

    /// With `SeparateMixedCategory`, mixed lines land in their own bucket only.
    #[test]
    fn effective_counts_can_separate_mixed() {
        let raw_counts = RawLineCounts {
            mixed_code_single_comment_lines: 2,
            mixed_code_multi_comment_lines: 1,
            ..RawLineCounts::default()
        };
        let effective = compute_effective_counts(
            &raw_counts,
            MixedLinePolicy::SeparateMixedCategory,
            true,
            true,
        );
        assert_eq!(effective.mixed_lines_separate, 3);
        assert_eq!(effective.code_lines, 0);
        assert_eq!(effective.comment_lines, 0);
    }

    /// Byte 0x96 is not valid UTF-8, so decoding must fall back to windows-1252.
    #[test]
    fn windows_1252_fallback_decodes() {
        let input: [u8; 7] = [0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x96, 0x57];
        let (text, encoding, warnings) = decode_bytes(&input).unwrap();
        assert_eq!(encoding, "windows-1252");
        assert!(text.contains('–'));
        assert!(!warnings.is_empty());
    }

    // ── Pure predicate tests ─────────────────────────────────────────────────

    #[test]
    fn is_binary_detects_null_byte() {
        assert!(is_binary(b"hello\x00world"));
    }

    #[test]
    fn is_binary_clean_text_is_not_binary() {
        let source = b"fn main() { println!(\"hello\"); }";
        assert!(!is_binary(source));
    }

    #[test]
    fn is_binary_utf8_bom_not_binary() {
        let with_bom = b"\xef\xbb\xbffn main() {}";
        assert!(!is_binary(with_bom));
    }

    #[test]
    fn looks_generated_at_generated_marker() {
        let content = b"// @generated by protoc-gen-rust\nfn foo() {}";
        assert!(looks_generated(Path::new("foo.rs"), content));
    }

    #[test]
    fn looks_generated_do_not_edit_marker() {
        let target = Path::new("foo.rs");
        // "Code generated by" triggers detection (contains the "generated by" substring).
        let by_phrase = b"// Code generated by build.rs. DO NOT EDIT.\nuse foo;";
        assert!(looks_generated(target, by_phrase));
        // @generated also triggers detection independently.
        let by_tag = b"// @generated\nuse foo;";
        assert!(looks_generated(target, by_tag));
    }

    #[test]
    fn looks_generated_normal_file_not_generated() {
        let content = b"fn main() {\n    println!(\"hello\");\n}\n";
        assert!(!looks_generated(Path::new("main.rs"), content));
    }

    #[test]
    fn looks_minified_dot_min_filename() {
        let content = b"function a(){return 1}";
        assert!(looks_minified(Path::new("bundle.min.js"), content));
    }

    #[test]
    fn looks_minified_normal_file_not_minified() {
        let content = b"function hello() {\n    return 1;\n}\n";
        assert!(!looks_minified(Path::new("app.js"), content));
    }

    /// A single whitespace-free line just over the threshold counts as minified.
    #[test]
    fn looks_minified_very_long_line() {
        let content = vec![b'x'; MINIFIED_LINE_THRESHOLD + 1];
        assert!(looks_minified(Path::new("app.js"), &content));
    }

    #[test]
    fn is_known_lockfile_cargo_lock() {
        let candidate = Path::new("Cargo.lock");
        assert!(is_known_lockfile(candidate));
    }

    #[test]
    fn is_known_lockfile_package_lock_json() {
        let candidate = Path::new("package-lock.json");
        assert!(is_known_lockfile(candidate));
    }

    #[test]
    fn is_known_lockfile_yarn_lock() {
        let candidate = Path::new("yarn.lock");
        assert!(is_known_lockfile(candidate));
    }

    #[test]
    fn is_known_lockfile_normal_file_is_not_lockfile() {
        let candidate = Path::new("src/lib.rs");
        assert!(!is_known_lockfile(candidate));
    }

    #[test]
    fn is_vendor_path_node_modules() {
        let candidate = Path::new("node_modules/react/index.js");
        assert!(is_vendor_path(candidate));
    }

    #[test]
    fn is_vendor_path_vendor_dir() {
        let candidate = Path::new("vendor/anyhow/src/lib.rs");
        assert!(is_vendor_path(candidate));
    }

    #[test]
    fn is_vendor_path_normal_src_is_not_vendor() {
        let candidate = Path::new("src/lib.rs");
        assert!(!is_vendor_path(candidate));
    }

    #[test]
    fn is_excluded_dir_path_matches_excluded() {
        let excluded = vec![".git".to_string(), "target".to_string()];
        assert!(is_excluded_dir_path(Path::new(".git/config"), &excluded));
    }

    #[test]
    fn is_excluded_dir_path_non_excluded_is_ok() {
        let excluded = vec![".git".to_string(), "target".to_string()];
        assert!(!is_excluded_dir_path(Path::new("src/main.rs"), &excluded));
    }

    #[test]
    fn decode_bytes_utf8_bom_stripped() {
        let with_bom = b"\xef\xbb\xbffn main() {}";
        let (text, encoding, _) = decode_bytes(with_bom).unwrap();
        // BOM is detected — encoding label includes "bom" indicator
        assert!(
            encoding.contains("utf-8"),
            "should be utf-8 variant, got {encoding}"
        );
        assert!(text.starts_with("fn"));
    }

    #[test]
    fn decode_bytes_plain_utf8() {
        let (text, encoding, warnings) = decode_bytes(b"hello world").unwrap();
        assert_eq!(encoding, "utf-8");
        assert_eq!(text, "hello world");
        assert!(warnings.is_empty());
    }
}