Skip to main content

sloc_core/
lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright (C) 2026 Nima Shafie <nimzshafie@gmail.com>
3#![allow(clippy::multiple_crate_versions)]
4
5pub mod baseline;
6pub mod coverage;
7pub mod delta;
8pub mod history;
9pub use baseline::{check_against_baseline, resolve_baselines_path, BaselineEntry, BaselineStore};
10pub use coverage::{aggregate_line_coverage, lookup_coverage, parse_lcov, FileCoverage};
11pub use delta::{compute_delta, FileChangeStatus, FileDelta, ScanComparison, SummaryDelta};
12pub use history::{
13    CleanupPolicy, CleanupPolicyStore, RegistryEntry, ScanRegistry, ScanSummarySnapshot,
14    WatchedDirsStore,
15};
16
17use std::collections::{BTreeMap, BTreeSet, HashSet};
18use std::fs;
19use std::path::{Path, PathBuf};
20use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
21use std::sync::Arc;
22
23use anyhow::{Context, Result};
24use chrono::{DateTime, Utc};
25use encoding_rs::{UTF_16BE, UTF_16LE, WINDOWS_1252};
26use globset::{Glob, GlobSet, GlobSetBuilder};
27use ignore::WalkBuilder;
28use serde::{Deserialize, Serialize};
29use uuid::Uuid;
30
31use sloc_config::{
32    AppConfig, BinaryFileBehavior, BlankInBlockCommentPolicy, ContinuationLinePolicy,
33    FailureBehavior, MixedLinePolicy,
34};
35use sloc_languages::style::IndentStyle;
36use sloc_languages::{
37    analyze_text, detect_language, supported_languages, AnalysisOptions, Language, ParseMode,
38    RawLineCounts, StyleAnalysis, StyleLangScope,
39};
40
// ── Analysis thread limits, detection sample sizes and thresholds ────────────
42
/// Maximum number of worker threads used for parallel file analysis.
/// Applied as an upper bound on `std::thread::available_parallelism()`.
const MAX_ANALYSIS_THREADS: usize = 16;
/// Fallback thread count when `available_parallelism` is unavailable.
const DEFAULT_ANALYSIS_THREADS: usize = 4;
/// Byte sample used to detect `@generated` markers.
const GENERATED_SAMPLE_BYTES: usize = 1024;
/// Byte sample used to detect minified files via line-length heuristic.
const MINIFIED_SAMPLE_BYTES: usize = 4096;
/// Longest line length above which a file is considered minified.
const MINIFIED_LINE_THRESHOLD: usize = 2000;
/// Byte sample used to detect binary files via null-byte scan.
const BINARY_SAMPLE_BYTES: usize = 8192;
55
/// Atomics shared between `analyze()` and the caller so the caller can poll scan progress.
///
/// Both counters are updated with `Ordering::Relaxed`, which is sufficient for a
/// progress display that only needs an approximate, eventually-consistent view.
pub struct ProgressCounters {
    /// Number of candidate files processed so far (incremented per file, across all threads).
    pub files_done: Arc<AtomicUsize>,
    /// Total candidate files discovered. Each walked root adds its own path count
    /// before that root's parallel analysis begins, so the value can grow between roots.
    pub files_total: Arc<AtomicUsize>,
}
63
/// Three-way outcome for metadata-level policy checks.
enum MetadataPolicyOutcome {
    /// Skip this file, but still emit the carried record in the output.
    /// NOTE(review): the record is boxed, presumably to keep this enum small — confirm.
    Skip(Box<FileRecord>),
    /// Exclude this file entirely — no record in output (include-glob miss).
    Exclude,
    /// Continue to content checks.
    Continue,
}
73
/// Processing outcome recorded for a file in `FileRecord::status`.
///
/// Serialized with `snake_case` variant names (for example `analyzed_exact`
/// or `skipped_binary`).
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FileStatus {
    AnalyzedExact,
    AnalyzedBestEffort,
    SkippedBinary,
    SkippedDecodeError,
    SkippedUnsupported,
    SkippedByPolicy,
    ErrorInternal,
}
85
/// Per-file line totals stored in `FileRecord::effective_counts`, alongside the
/// raw `RawLineCounts` categories kept in `FileRecord::raw_line_categories`.
///
/// `Default` yields all-zero counts.
/// NOTE(review): presumably these are the counts after the configured line
/// policies have been applied — confirm against the code that fills them in.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct EffectiveCounts {
    pub code_lines: u64,
    pub comment_lines: u64,
    pub blank_lines: u64,
    pub mixed_lines_separate: u64,
}
93
/// Identity of the tool and of a single run, stored in `AnalysisRun::tool`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolMetadata {
    pub name: String,
    pub version: String,
    // NOTE(review): presumably a UUID string (the file imports `uuid::Uuid`) — confirm.
    pub run_id: String,
    pub timestamp_utc: DateTime<Utc>,
}
101
/// Description of the machine and account that produced a run, stored in
/// `AnalysisRun::environment`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnvironmentMetadata {
    pub operating_system: String,
    pub architecture: String,
    pub runtime_mode: String,
    pub initiator_username: String,
    pub initiator_hostname: String,
    /// CI system name when the scan runs inside a known CI environment (Jenkins,
    /// GitHub Actions, GitLab CI, …). `None` for interactive / local runs.
    /// Omitted from serialized output when `None`; defaults to `None` when the
    /// field is absent on input.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ci_name: Option<String>,
}
114
/// Run-wide totals stored in `AnalysisRun::summary_totals`.
///
/// Fields marked `#[serde(default)]` fall back to `0` when they are missing
/// from deserialized input, so data written without them still loads.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SummaryTotals {
    pub files_considered: u64,
    pub files_analyzed: u64,
    pub files_skipped: u64,
    pub total_physical_lines: u64,
    pub code_lines: u64,
    pub comment_lines: u64,
    pub blank_lines: u64,
    pub mixed_lines_separate: u64,
    #[serde(default)]
    pub functions: u64,
    #[serde(default)]
    pub classes: u64,
    #[serde(default)]
    pub variables: u64,
    #[serde(default)]
    pub imports: u64,
    #[serde(default)]
    pub test_count: u64,
    /// Lexically detected test assertion call lines across all analyzed files.
    #[serde(default)]
    pub test_assertion_count: u64,
    /// Lexically detected test suite / fixture / group declaration lines across all analyzed files.
    #[serde(default)]
    pub test_suite_count: u64,
    /// Aggregated from LCOV data when provided.
    #[serde(default)]
    pub coverage_lines_found: u64,
    #[serde(default)]
    pub coverage_lines_hit: u64,
    #[serde(default)]
    pub coverage_functions_found: u64,
    #[serde(default)]
    pub coverage_functions_hit: u64,
    #[serde(default)]
    pub coverage_branches_found: u64,
    #[serde(default)]
    pub coverage_branches_hit: u64,
}
155
/// Totals for a single language. Used in `AnalysisRun::totals_by_language` and
/// in `SubmoduleSummary::language_summaries`.
///
/// Fields marked `#[serde(default)]` fall back to `0` when they are missing
/// from deserialized input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageSummary {
    pub language: Language,
    pub files: u64,
    pub total_physical_lines: u64,
    pub code_lines: u64,
    pub comment_lines: u64,
    pub blank_lines: u64,
    pub mixed_lines_separate: u64,
    #[serde(default)]
    pub functions: u64,
    #[serde(default)]
    pub classes: u64,
    #[serde(default)]
    pub variables: u64,
    #[serde(default)]
    pub imports: u64,
    #[serde(default)]
    pub test_count: u64,
    #[serde(default)]
    pub test_assertion_count: u64,
    #[serde(default)]
    pub test_suite_count: u64,
    #[serde(default)]
    pub coverage_lines_found: u64,
    #[serde(default)]
    pub coverage_lines_hit: u64,
    #[serde(default)]
    pub coverage_functions_found: u64,
    #[serde(default)]
    pub coverage_functions_hit: u64,
    #[serde(default)]
    pub coverage_branches_found: u64,
    #[serde(default)]
    pub coverage_branches_hit: u64,
}
192
/// Everything recorded about one discovered file.
///
/// The same type is used for analysed files (`AnalysisRun::per_file_records`)
/// and skipped files (`AnalysisRun::skipped_file_records`); `status` tells the
/// two apart.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileRecord {
    pub path: String,
    /// Compared against forward-slash submodule prefixes in `process_submodules`.
    pub relative_path: String,
    pub language: Option<Language>,
    pub size_bytes: u64,
    pub detected_encoding: Option<String>,
    pub raw_line_categories: RawLineCounts,
    pub effective_counts: EffectiveCounts,
    pub status: FileStatus,
    pub warnings: Vec<String>,
    pub generated: bool,
    pub minified: bool,
    pub vendor: bool,
    pub parse_mode: Option<ParseMode>,
    /// Name of the submodule containing this file; set by `process_submodules`
    /// and omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub submodule: Option<String>,
    /// Line/function/branch coverage from an external LCOV file, when provided.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub coverage: Option<FileCoverage>,
    /// Lexical style-guide adherence analysis; `None` for unsupported languages.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub style_analysis: Option<StyleAnalysis>,
}
217
/// Per-language-family style aggregation within a `StyleSummary`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageStyleGroup {
    /// Display label, e.g. `"C / C++"`, `"Python"`, `"JavaScript"`.
    pub language_family: String,
    /// Number of files in this group.
    pub files_count: u32,
    /// Name of the guide with the highest average adherence.
    pub dominant_guide: String,
    /// Average adherence of the dominant guide (0–100).
    pub dominant_score_pct: u8,
    /// Most common indent style across the group.
    pub common_indent_style: String,
    /// Average guide adherence scores (guide name, 0–100) sorted descending.
    pub guide_avg_scores: Vec<(String, u8)>,
    /// Percentage of files (0–100) where ≤ 5 % of lines exceed the configured column threshold.
    /// NOTE(review): `StyleSummary::line80_compliant_pct` documents its 80-column
    /// counterpart as "legacy, always 80" — confirm which threshold this field uses.
    pub line80_compliant_pct: u8,
    /// Same as `line80_compliant_pct` but named for the actual configured threshold.
    pub line_col_compliant_pct: u8,
}
238
/// Aggregate multi-language style-guide adherence across all analysed files.
///
/// Stored in `AnalysisRun::style_summary`; also reachable through the
/// `CppStyleSummary` alias.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StyleSummary {
    /// Total files for which style data was produced.
    pub files_analyzed: u32,
    /// Most common indent style across *all* analysed files.
    pub common_indent_style: String,
    /// Percentage of all analysed files (0–100) with ≤ 5 % of lines over 80 chars (legacy, always 80).
    pub line80_compliant_pct: u8,
    /// Percentage of all analysed files (0–100) with ≤ 5 % of lines over `col_threshold` chars.
    pub line_col_compliant_pct: u8,
    /// Column-width threshold used for `line_col_compliant_pct` (from `analysis.style_col_threshold`).
    pub col_threshold: u16,
    /// Per-language-family breakdown, sorted by `files_count` descending.
    pub by_language: Vec<LanguageStyleGroup>,
}
255
/// Backward-compatible alias for [`StyleSummary`], kept so that `sloc-report` and
/// `sloc-web` can migrate incrementally without a breaking change on the same release.
/// New code should name `StyleSummary` directly.
pub type CppStyleSummary = StyleSummary;
259
/// Per-submodule aggregated stats produced when `submodule_breakdown` is enabled.
///
/// Collected in `AnalysisRun::submodule_summaries`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SubmoduleSummary {
    pub name: String,
    pub relative_path: String,
    pub files_analyzed: u64,
    pub total_physical_lines: u64,
    pub code_lines: u64,
    pub comment_lines: u64,
    pub blank_lines: u64,
    /// Per-language breakdown for the files belonging to this submodule.
    pub language_summaries: Vec<LanguageSummary>,
}
272
/// Complete result of one analysis run: tool and environment metadata, the
/// configuration that was in effect, aggregate totals, per-file records and
/// optional git / style information.
///
/// Every field marked `skip_serializing_if` is left out of serialized output
/// when it is `None` (or empty), and `#[serde(default)]` lets input that lacks
/// the field still deserialize.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnalysisRun {
    pub tool: ToolMetadata,
    pub environment: EnvironmentMetadata,
    pub effective_configuration: AppConfig,
    pub input_roots: Vec<String>,
    pub summary_totals: SummaryTotals,
    pub totals_by_language: Vec<LanguageSummary>,
    pub per_file_records: Vec<FileRecord>,
    pub skipped_file_records: Vec<FileRecord>,
    pub warnings: Vec<String>,
    /// Non-empty only when `discovery.submodule_breakdown` is enabled.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub submodule_summaries: Vec<SubmoduleSummary>,
    /// Short git commit SHA (7 chars) at scan time, if the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_short: Option<String>,
    /// Full git commit SHA at scan time, if the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_long: Option<String>,
    /// Git branch active at scan time, if the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_branch: Option<String>,
    /// Author of the last git commit at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_author: Option<String>,
    /// Comma-separated git tags pointing at HEAD at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_tags: Option<String>,
    /// Nearest ancestor release tag (output of `git describe --tags --abbrev=0`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_nearest_tag: Option<String>,
    /// ISO 8601 author-date of the last git commit at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_date: Option<String>,
    /// URL of the `origin` remote as recorded in `.git/config` at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_remote_url: Option<String>,
    /// Multi-language style-guide adherence; `None` when no supported files were analysed.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub style_summary: Option<StyleSummary>,
}
315
/// Git metadata gathered by `detect_git_for_run`. Every field is optional and
/// `Default` leaves them all `None`.
#[derive(Default)]
struct GitInfo {
    /// First seven characters of `commit_long`.
    commit_short: Option<String>,
    /// Commit SHA taken from a detached `HEAD` or resolved from the ref `HEAD` points at.
    commit_long: Option<String>,
    /// Branch from `HEAD` (`refs/heads/<name>`), else the CI-provided branch name.
    branch: Option<String>,
    /// Output of `git log -1 --format=%an HEAD`.
    author: Option<String>,
    /// Tags pointing at `HEAD`, joined with `", "`.
    tags: Option<String>,
    /// Output of `git describe --tags --abbrev=0 HEAD`.
    nearest_tag: Option<String>,
    /// Output of `git log -1 --format=%aI HEAD`.
    commit_date: Option<String>,
    /// URL of the `origin` remote read from the repository's `config` file.
    remote_url: Option<String>,
}
327
328/// Locate the `.git` directory by walking up from `start`.
329/// Handles plain repos, worktrees (`.git` is a file with `gitdir:` pointer), and
330/// submodules. Returns `None` if no git repo is found.
331fn find_git_dir(start: &Path) -> Option<PathBuf> {
332    let mut current = Some(start);
333    while let Some(dir) = current {
334        let candidate = dir.join(".git");
335        if candidate.is_dir() {
336            return Some(candidate);
337        }
338        if candidate.is_file() {
339            if let Some(resolved) = resolve_git_file_pointer(&candidate, dir) {
340                return Some(resolved);
341            }
342        }
343        current = dir.parent();
344    }
345    None
346}
347
/// Follow a `.git` pointer *file* (as written for worktrees and submodules) to
/// the directory it names.
///
/// `file` must contain `gitdir: <path>`; a relative `<path>` is interpreted
/// against `base_dir`. Returns `None` when the file cannot be read, does not
/// start with the `gitdir: ` prefix, or the target is not an existing directory.
fn resolve_git_file_pointer(file: &Path, base_dir: &Path) -> Option<PathBuf> {
    let text = fs::read_to_string(file).ok()?;
    let target = text.trim().strip_prefix("gitdir: ")?;

    // Git writes forward slashes; convert to the platform separator so the
    // absolute/relative test and the join below behave on Windows too.
    let native = target.replace('/', std::path::MAIN_SEPARATOR_STR);
    let native_path = Path::new(&native);
    let joined = if native_path.is_absolute() {
        native_path.to_path_buf()
    } else {
        base_dir.join(native_path)
    };

    // Prefer the canonical form (".." and symlinks resolved), but keep the
    // plain joined path when canonicalisation is not possible.
    let git_dir = match joined.canonicalize() {
        Ok(canonical) => canonical,
        Err(_) => joined,
    };
    git_dir.is_dir().then_some(git_dir)
}
372
/// Resolve a git ref name (e.g. `refs/heads/main`) to its full commit SHA.
///
/// The loose ref file under `git_dir` is consulted first; if it is missing or
/// does not hold a hex object id of at least 40 characters, `packed-refs` is
/// searched instead. Returns `None` when neither source yields a valid id.
///
/// Malformed or blank `packed-refs` lines are skipped rather than aborting the
/// lookup, and packed ids are validated with the same rule as loose ones.
fn resolve_ref(git_dir: &Path, refname: &str) -> Option<String> {
    let is_object_id = |s: &str| s.len() >= 40 && s.chars().all(|c| c.is_ascii_hexdigit());

    // Build the OS-native path to the loose ref file by joining each
    // forward-slash component individually, so the separator is always right.
    let ref_path = refname
        .split('/')
        .fold(git_dir.to_path_buf(), |p, c| p.join(c));
    if let Ok(raw) = fs::read_to_string(&ref_path) {
        let sha = raw.trim();
        if is_object_id(sha) {
            return Some(sha.to_string());
        }
    }

    // Packed refs: each entry is "<sha> <refname>". Lines starting with '#'
    // are comments and lines starting with '^' are peeled tag objects.
    // `str::lines()` copes with both "\n" and "\r\n" endings.
    let packed = fs::read_to_string(git_dir.join("packed-refs")).ok()?;
    packed
        .lines()
        .filter(|line| !line.starts_with('#') && !line.starts_with('^'))
        // A line without a space is not an entry; skip it and keep searching.
        .filter_map(|line| line.split_once(' '))
        .find(|(sha, name)| name.trim() == refname && is_object_id(sha))
        .map(|(sha, _)| sha.to_string())
}
408
/// Pull the value out of a git-config `url = <value>` line.
///
/// The line must begin with `url`, optionally followed by spaces or tabs and
/// then `=`. The value is trimmed; `None` is returned when the line has a
/// different shape or the value is empty.
fn parse_url_line(line: &str) -> Option<&str> {
    let after_key = line.strip_prefix("url")?;
    let value = after_key
        .trim_start_matches(|c: char| matches!(c, ' ' | '\t'))
        .strip_prefix('=')?
        .trim();
    Some(value).filter(|v| !v.is_empty())
}
420
421/// Parse `.git/config` and return the URL of the `origin` remote, if present.
422fn read_git_remote_url(git_dir: &Path) -> Option<String> {
423    let config = fs::read_to_string(git_dir.join("config")).ok()?;
424    let mut in_origin = false;
425    for line in config.lines() {
426        let trimmed = line.trim();
427        if trimmed.starts_with('[') {
428            in_origin = trimmed == r#"[remote "origin"]"#;
429        } else if in_origin {
430            if let Some(url) = parse_url_line(trimmed) {
431                return Some(url.to_owned());
432            }
433        }
434    }
435    None
436}
437
438/// Detect git metadata by reading `.git/` files directly — no `git` executable
439/// needed. Falls back gracefully for detached HEADs, shallow clones, and missing
440/// reflogs.
441fn detect_git_for_run(project_path: &Path) -> GitInfo {
442    // Resolve the CI branch early so it can fill in any gap in git metadata.
443    let ci_branch = ci_branch_from_env();
444
445    let Some(git_dir) = find_git_dir(project_path) else {
446        // No .git directory (e.g. scanning a non-repo path in CI). Use whatever
447        // the CI system tells us about the branch.
448        return GitInfo {
449            branch: ci_branch,
450            ..GitInfo::default()
451        };
452    };
453
454    let head_raw = match fs::read_to_string(git_dir.join("HEAD")) {
455        Ok(s) => s.trim().to_string(),
456        Err(_) => {
457            return GitInfo {
458                branch: ci_branch,
459                ..GitInfo::default()
460            }
461        }
462    };
463
464    let (branch_from_head, commit_long) = head_raw.strip_prefix("ref: ").map_or_else(
465        || {
466            if head_raw.len() >= 40 && head_raw.chars().all(|c| c.is_ascii_hexdigit()) {
467                // Detached HEAD — HEAD file is the commit SHA (common in CI checkouts).
468                (None, Some(head_raw[..40].to_string()))
469            } else {
470                (None, None)
471            }
472        },
473        |refname| {
474            let branch = refname
475                .strip_prefix("refs/heads/")
476                .map(|b| b.trim().to_string());
477            let sha = resolve_ref(&git_dir, refname.trim());
478            (branch, sha)
479        },
480    );
481    // Prefer the branch name derived from the HEAD ref; fall back to the CI
482    // env var (covers detached-HEAD checkouts done by Jenkins, GitHub Actions, etc.).
483    let branch = branch_from_head.or(ci_branch);
484
485    let commit_short = commit_long
486        .as_deref()
487        .map(|s| s.chars().take(7).collect::<String>());
488
489    let author = run_git_cmd(project_path, &["log", "-1", "--format=%an", "HEAD"]);
490    let commit_date = run_git_cmd(project_path, &["log", "-1", "--format=%aI", "HEAD"]);
491    let remote_url = read_git_remote_url(&git_dir);
492
493    // Tags and nearest-tag still require git CLI — try it as a best-effort bonus
494    // but don't block on it. If git isn't available these will simply be None.
495    let tags = run_git_cmd(project_path, &["tag", "--points-at", "HEAD"]).map(|t| {
496        t.lines()
497            .filter(|l| !l.is_empty())
498            .collect::<Vec<_>>()
499            .join(", ")
500    });
501    let nearest_tag = run_git_cmd(project_path, &["describe", "--tags", "--abbrev=0", "HEAD"]);
502
503    GitInfo {
504        commit_short,
505        commit_long,
506        branch,
507        author,
508        tags,
509        nearest_tag,
510        commit_date,
511        remote_url,
512    }
513}
514
/// Run `git <args>` with `dir` as the working directory and return its trimmed
/// standard output.
///
/// Several executable locations are tried in order; the first one that exits
/// successfully with non-empty UTF-8 output wins. Returns `None` when every
/// candidate fails to start, exits unsuccessfully, or prints nothing.
fn run_git_cmd(dir: &Path, args: &[&str]) -> Option<String> {
    // The bare name covers a normal PATH; the absolute paths help service
    // accounts running with a stripped PATH. Paths for the other platform
    // simply fail to start and are skipped.
    const GIT_EXECUTABLES: [&str; 7] = [
        "git",
        // Common Linux / macOS install locations
        "/usr/bin/git",
        "/usr/local/bin/git",
        "/opt/homebrew/bin/git",
        // Git for Windows default installation paths
        r"C:\Program Files\Git\cmd\git.exe",
        r"C:\Program Files\Git\bin\git.exe",
        r"C:\Program Files (x86)\Git\cmd\git.exe",
    ];

    GIT_EXECUTABLES.iter().find_map(|exe| {
        // NOTE(review): `safe.directory=*` switches off git's repository-ownership
        // check for this invocation — confirm that is acceptable for untrusted trees.
        let output = std::process::Command::new(exe)
            .args(["-c", "safe.directory=*"])
            .args(args)
            .current_dir(dir)
            .output()
            .ok()?;
        if !output.status.success() {
            return None;
        }
        let stdout = String::from_utf8(output.stdout).ok()?;
        let text = stdout.trim();
        if text.is_empty() {
            None
        } else {
            Some(text.to_string())
        }
    })
}
549
/// Identify the CI system hosting this process from its environment variables.
///
/// Jenkins is recognised by the mere presence of `JENKINS_URL`, `JENKINS_HOME`
/// or `BUILD_URL` and takes priority. The flag-style variables must equal
/// exactly `"true"`. TeamCity is recognised by `TEAMCITY_VERSION` being set.
/// Returns `None` when nothing matches.
fn detect_ci_system() -> Option<&'static str> {
    // Variables that must be exactly "true", in priority order.
    const FLAG_VARS: &[(&str, &str)] = &[
        ("GITHUB_ACTIONS", "GitHub Actions"),
        ("GITLAB_CI", "GitLab CI"),
        ("CIRCLECI", "CircleCI"),
        ("TRAVIS", "Travis CI"),
        ("TF_BUILD", "Azure DevOps"),
    ];
    let is_set = |key: &str| std::env::var(key).is_ok();
    let is_true = |key: &str| matches!(std::env::var(key).as_deref(), Ok("true"));

    if ["JENKINS_URL", "JENKINS_HOME", "BUILD_URL"]
        .iter()
        .any(|&key| is_set(key))
    {
        return Some("Jenkins");
    }
    for &(var, system) in FLAG_VARS {
        if is_true(var) {
            return Some(system);
        }
    }
    if is_set("TEAMCITY_VERSION") {
        Some("TeamCity")
    } else {
        None
    }
}
577
/// Determine the branch name from well-known CI environment variables.
///
/// Variables are consulted in a fixed priority order. Each value is trimmed
/// and has a leading `refs/heads/` (or, failing that, `origin/`) removed; a
/// result that is empty or literally `HEAD` is ignored and the next variable
/// is tried. Used to fill in the branch when the checkout has a detached HEAD.
fn ci_branch_from_env() -> Option<String> {
    const VARS: &[&str] = &[
        "BRANCH_NAME",        // Jenkins Pipeline
        "GIT_BRANCH",         // Jenkins Freestyle (may carry "origin/<branch>")
        "GITHUB_REF_NAME",    // GitHub Actions
        "CI_COMMIT_BRANCH",   // GitLab CI
        "CIRCLE_BRANCH",      // CircleCI
        "TRAVIS_BRANCH",      // Travis CI
        "BUILD_SOURCEBRANCH", // Azure DevOps (may carry "refs/heads/<branch>")
    ];
    VARS.iter().find_map(|var| {
        let raw = std::env::var(var).ok()?;
        let trimmed = raw.trim();
        let name = match trimmed.strip_prefix("refs/heads/") {
            Some(rest) => rest,
            None => trimmed.strip_prefix("origin/").unwrap_or(trimmed),
        };
        if name.is_empty() || name == "HEAD" {
            None
        } else {
            Some(name.to_string())
        }
    })
}
604
/// Return the name of the account running the process.
///
/// `USERNAME` (Windows convention) is preferred, then `USER` (Unix convention).
/// A variable that is unset, not valid Unicode, or set to an empty string is
/// skipped, so an empty `USERNAME` no longer hides a usable `USER`. Falls back
/// to `"unknown"` when neither variable provides a value.
fn get_current_username() -> String {
    ["USERNAME", "USER"]
        .iter()
        .find_map(|key| std::env::var(key).ok().filter(|name| !name.is_empty()))
        .unwrap_or_else(|| "unknown".to_string())
}
610
/// Read environment variable `var`, treating an empty value the same as an
/// unset (or non-Unicode) one: both yield `None`.
fn non_empty_env(var: &str) -> Option<String> {
    std::env::var(var).ok().filter(|value| !value.is_empty())
}
619
/// Report whether the process appears to run under Jenkins, i.e. whether any
/// of `JENKINS_URL`, `JENKINS_HOME` or `BUILD_URL` is set (to any value).
fn is_jenkins_env() -> bool {
    const JENKINS_MARKERS: [&str; 3] = ["JENKINS_URL", "JENKINS_HOME", "BUILD_URL"];
    JENKINS_MARKERS
        .iter()
        .any(|marker| std::env::var(marker).is_ok())
}
625
626fn get_hostname() -> String {
627    // In CI environments prefer a human-readable agent/runner identifier over
628    // whatever hostname the container was assigned.
629    if is_jenkins_env() {
630        if let Some(n) = non_empty_env("NODE_NAME") {
631            return n;
632        }
633    }
634    if std::env::var("GITHUB_ACTIONS").as_deref() == Ok("true") {
635        if let Some(r) = non_empty_env("RUNNER_NAME") {
636            return r;
637        }
638    }
639    if std::env::var("GITLAB_CI").as_deref() == Ok("true") {
640        if let Some(r) = non_empty_env("CI_RUNNER_DESCRIPTION") {
641            return r;
642        }
643    }
644    std::env::var("COMPUTERNAME")
645        .or_else(|_| std::env::var("HOSTNAME"))
646        .or_else(|_| std::fs::read_to_string("/etc/hostname").map(|s| s.trim().to_string()))
647        .unwrap_or_else(|_| "unknown".to_string())
648}
649
650/// Walk a single directory root and collect file records into the output vectors.
651#[allow(clippy::too_many_arguments)]
652fn walk_root(
653    root: &Path,
654    config: &AppConfig,
655    include_globs: Option<&GlobSet>,
656    exclude_globs: Option<&GlobSet>,
657    enabled_languages: Option<&BTreeSet<Language>>,
658    seen_paths: &mut HashSet<PathBuf>,
659    analyzed: &mut Vec<FileRecord>,
660    skipped: &mut Vec<FileRecord>,
661    warnings: &mut Vec<String>,
662    cancel: Option<&AtomicBool>,
663    progress: Option<&ProgressCounters>,
664) -> Result<()> {
665    let mut builder = WalkBuilder::new(root);
666    builder
667        .follow_links(config.discovery.follow_symlinks)
668        .hidden(config.discovery.ignore_hidden_files)
669        .ignore(config.discovery.honor_ignore_files)
670        .parents(config.discovery.honor_ignore_files)
671        .git_ignore(config.discovery.honor_ignore_files)
672        .git_global(config.discovery.honor_ignore_files)
673        .git_exclude(config.discovery.honor_ignore_files);
674
675    let paths = collect_walk_paths(&builder, seen_paths, warnings);
676    if paths.is_empty() {
677        return Ok(());
678    }
679
680    if let Some(p) = progress {
681        p.files_total.fetch_add(paths.len(), Ordering::Relaxed);
682    }
683
684    let chunk_results = run_parallel_analysis(
685        &paths,
686        root,
687        config,
688        include_globs,
689        exclude_globs,
690        enabled_languages,
691        cancel,
692        progress,
693    )?;
694    merge_chunk_results(chunk_results, analyzed, skipped, warnings)
695}
696
697fn collect_walk_paths(
698    builder: &WalkBuilder,
699    seen_paths: &mut HashSet<PathBuf>,
700    warnings: &mut Vec<String>,
701) -> Vec<PathBuf> {
702    // build_parallel() walks the directory tree across multiple threads (work-stealing
703    // internally), which is meaningfully faster for deeply nested repos with many directories.
704    // We collect results via an MPSC channel so each walker thread sends without contention.
705    let (tx, rx) = std::sync::mpsc::channel::<std::result::Result<PathBuf, String>>();
706
707    builder.build_parallel().run(|| {
708        let tx = tx.clone();
709        Box::new(move |entry| {
710            match entry {
711                Err(e) => {
712                    let _ = tx.send(Err(format!("discovery warning: {e}")));
713                }
714                Ok(e) => {
715                    let path = e.into_path();
716                    if !path.is_dir() {
717                        let _ = tx.send(Ok(path));
718                    }
719                }
720            }
721            ignore::WalkState::Continue
722        })
723    });
724
725    // Drop the sender that the outer scope holds; the per-thread clones were dropped when
726    // run() returned (all threads finished). Dropping this last sender closes the channel.
727    drop(tx);
728
729    rx.into_iter()
730        .filter_map(|msg| match msg {
731            Ok(path) => {
732                if seen_paths.insert(path.clone()) {
733                    Some(path)
734                } else {
735                    None
736                }
737            }
738            Err(warn) => {
739                warnings.push(warn);
740                None
741            }
742        })
743        .collect()
744}
745
746/// Inner work loop executed by each analysis thread.
747#[allow(clippy::too_many_arguments)]
748fn worker_loop(
749    paths: &[PathBuf],
750    root: &Path,
751    config: &AppConfig,
752    include_globs: Option<&GlobSet>,
753    exclude_globs: Option<&GlobSet>,
754    enabled_languages: Option<&BTreeSet<Language>>,
755    cancel: Option<&AtomicBool>,
756    next_index: &AtomicUsize,
757    files_done: Option<&AtomicUsize>,
758) -> Vec<Result<Option<FileRecord>>> {
759    let mut results = Vec::new();
760    loop {
761        if cancel.is_some_and(|c| c.load(Ordering::Relaxed)) {
762            results.push(Err(anyhow::anyhow!("analysis cancelled")));
763            break;
764        }
765        let i = next_index.fetch_add(1, Ordering::Relaxed);
766        if i >= paths.len() {
767            break;
768        }
769        results.push(analyze_candidate_file(
770            &paths[i],
771            root,
772            config,
773            include_globs,
774            exclude_globs,
775            enabled_languages,
776        ));
777        if let Some(fd) = files_done {
778            fd.fetch_add(1, Ordering::Relaxed);
779        }
780    }
781    results
782}
783
784#[allow(clippy::too_many_arguments)]
785fn run_parallel_analysis(
786    paths: &[PathBuf],
787    root: &Path,
788    config: &AppConfig,
789    include_globs: Option<&GlobSet>,
790    exclude_globs: Option<&GlobSet>,
791    enabled_languages: Option<&BTreeSet<Language>>,
792    cancel: Option<&AtomicBool>,
793    progress: Option<&ProgressCounters>,
794) -> Result<Vec<Vec<Result<Option<FileRecord>>>>> {
795    let thread_count = std::thread::available_parallelism().map_or(DEFAULT_ANALYSIS_THREADS, |n| {
796        n.get().min(MAX_ANALYSIS_THREADS)
797    });
798    // Shared work-queue index: each thread atomically claims the next path to process.
799    // This eliminates static-chunk load imbalance — threads that finish early immediately
800    // pick up more work instead of sitting idle while one overloaded chunk finishes.
801    let next_index = AtomicUsize::new(0);
802    let files_done: Option<&AtomicUsize> = progress.map(|p| p.files_done.as_ref());
803
804    std::thread::scope(|s| -> Result<Vec<Vec<Result<Option<FileRecord>>>>> {
805        // IMPORTANT: collect ALL handles before joining any of them.
806        // A lazy spawn-then-join chain would serialize threads one at a time.
807        let mut handles = Vec::with_capacity(thread_count);
808        for _ in 0..thread_count {
809            handles.push(s.spawn(|| {
810                worker_loop(
811                    paths,
812                    root,
813                    config,
814                    include_globs,
815                    exclude_globs,
816                    enabled_languages,
817                    cancel,
818                    &next_index,
819                    files_done,
820                )
821            }));
822        }
823        handles
824            .into_iter()
825            .map(|h| {
826                h.join()
827                    .map_err(|_| anyhow::anyhow!("analysis thread panicked"))
828            })
829            .collect()
830    })
831}
832
833fn merge_chunk_results(
834    chunk_results: Vec<Vec<Result<Option<FileRecord>>>>,
835    analyzed: &mut Vec<FileRecord>,
836    skipped: &mut Vec<FileRecord>,
837    warnings: &mut Vec<String>,
838) -> Result<()> {
839    for chunk in chunk_results {
840        for result in chunk {
841            if let Some(record) = result? {
842                push_record(record, analyzed, skipped, warnings);
843            }
844        }
845    }
846    Ok(())
847}
848
849/// Label each analyzed file with its submodule and build per-submodule summaries.
850fn process_submodules(config: &AppConfig, analyzed: &mut [FileRecord]) -> Vec<SubmoduleSummary> {
851    let root = config.discovery.root_paths[0]
852        .canonicalize()
853        .unwrap_or_else(|_| config.discovery.root_paths[0].clone());
854    let submodules = detect_submodules(&root);
855    if submodules.is_empty() {
856        return Vec::new();
857    }
858
859    for file in analyzed.iter_mut() {
860        for (name, sub_path) in &submodules {
861            let prefix = sub_path.to_string_lossy().replace('\\', "/");
862            let rel = &file.relative_path;
863            if rel == &prefix || rel.starts_with(&format!("{prefix}/")) {
864                file.submodule = Some(name.clone());
865                break;
866            }
867        }
868    }
869
870    build_submodule_summaries(analyzed, &submodules)
871}
872
873/// Assemble the final `AnalysisRun` from collected records and metadata.
874fn assemble_run(
875    config: &AppConfig,
876    runtime_mode: &str,
877    analyzed: Vec<FileRecord>,
878    skipped: Vec<FileRecord>,
879    warnings: Vec<String>,
880    submodule_summaries: Vec<SubmoduleSummary>,
881) -> AnalysisRun {
882    let summary = build_summary(&analyzed, &skipped);
883    let language_summaries = build_language_summaries(&analyzed);
884    let col_threshold = config.analysis.style_col_threshold;
885    let style_summary = build_style_summary(&analyzed, col_threshold);
886
887    let first_root = config
888        .discovery
889        .root_paths
890        .first()
891        .map(|p| p.canonicalize().unwrap_or_else(|_| p.clone()));
892    let git = first_root
893        .as_deref()
894        .map(detect_git_for_run)
895        .unwrap_or_default();
896
897    let now = Utc::now();
898    let run_id = {
899        let uuid_suffix = Uuid::new_v4().simple().to_string();
900        format!("{}-{}", now.format("%Y%m%d-%H%M"), uuid_suffix)
901    };
902
903    AnalysisRun {
904        tool: ToolMetadata {
905            name: "sloc".into(),
906            version: env!("CARGO_PKG_VERSION").into(),
907            run_id,
908            timestamp_utc: now,
909        },
910        environment: EnvironmentMetadata {
911            operating_system: std::env::consts::OS.into(),
912            architecture: std::env::consts::ARCH.into(),
913            runtime_mode: runtime_mode.into(),
914            initiator_username: get_current_username(),
915            initiator_hostname: get_hostname(),
916            ci_name: if is_jenkins_env() {
917                Some(format!("Jenkins\t{}", get_hostname()))
918            } else {
919                detect_ci_system().map(str::to_string)
920            },
921        },
922        effective_configuration: config.clone(),
923        input_roots: config
924            .discovery
925            .root_paths
926            .iter()
927            .map(|p| path_to_string(p))
928            .collect(),
929        summary_totals: summary,
930        totals_by_language: language_summaries,
931        per_file_records: analyzed,
932        skipped_file_records: skipped,
933        warnings,
934        submodule_summaries,
935        git_commit_short: git.commit_short,
936        git_commit_long: git.commit_long,
937        git_branch: git.branch,
938        git_commit_author: git.author,
939        git_tags: git.tags,
940        git_nearest_tag: git.nearest_tag,
941        git_commit_date: git.commit_date,
942        git_remote_url: git.remote_url,
943        style_summary,
944    }
945}
946
947/// # Errors
948///
949/// Returns an error if the config is invalid, root paths cannot be walked, or any file
950/// analysis step fails in a way that cannot be recovered from.
951#[allow(clippy::too_many_lines)]
952pub fn analyze(
953    config: &AppConfig,
954    runtime_mode: &str,
955    cancel: Option<&AtomicBool>,
956    progress: Option<&ProgressCounters>,
957) -> Result<AnalysisRun> {
958    config.validate()?;
959
960    if config.discovery.root_paths.is_empty() {
961        anyhow::bail!("no input paths were provided");
962    }
963
964    let include_globs = compile_globset(&config.discovery.include_globs)?;
965    let exclude_globs = compile_globset(&config.discovery.exclude_globs)?;
966    let enabled_languages = parse_enabled_languages(&config.analysis.enabled_languages)?;
967
968    let mut analyzed = Vec::new();
969    let mut skipped = Vec::new();
970    let mut warnings = Vec::new();
971    let mut seen_paths = HashSet::new();
972
973    for root in &config.discovery.root_paths {
974        if cancel.is_some_and(|c| c.load(Ordering::Relaxed)) {
975            anyhow::bail!("analysis cancelled");
976        }
977
978        let root = root.canonicalize().unwrap_or_else(|_| root.clone());
979
980        if root.is_file() {
981            if let Some(record) = analyze_candidate_file(
982                &root,
983                root.parent().unwrap_or_else(|| Path::new(".")),
984                config,
985                include_globs.as_ref(),
986                exclude_globs.as_ref(),
987                enabled_languages.as_ref(),
988            )? {
989                push_record(record, &mut analyzed, &mut skipped, &mut warnings);
990            }
991            continue;
992        }
993
994        walk_root(
995            &root,
996            config,
997            include_globs.as_ref(),
998            exclude_globs.as_ref(),
999            enabled_languages.as_ref(),
1000            &mut seen_paths,
1001            &mut analyzed,
1002            &mut skipped,
1003            &mut warnings,
1004            cancel,
1005            progress,
1006        )?;
1007    }
1008
1009    analyzed.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
1010    skipped.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
1011
1012    // Submodule detection: label each file with its submodule and build per-submodule summaries.
1013    let submodule_summaries = if config.discovery.submodule_breakdown {
1014        process_submodules(config, &mut analyzed)
1015    } else {
1016        Vec::new()
1017    };
1018
1019    attach_coverage(config, &mut analyzed, &mut warnings);
1020
1021    Ok(assemble_run(
1022        config,
1023        runtime_mode,
1024        analyzed,
1025        skipped,
1026        warnings,
1027        submodule_summaries,
1028    ))
1029}
1030
1031fn attach_coverage(config: &AppConfig, analyzed: &mut [FileRecord], warnings: &mut Vec<String>) {
1032    let Some(cov_path) = coverage::resolve_coverage_file(config.analysis.coverage_file.as_deref())
1033    else {
1034        return;
1035    };
1036    tracing::debug!(path = %cov_path.display(), "loading coverage file");
1037    match fs::read_to_string(&cov_path) {
1038        Ok(content) => {
1039            let cov_map = coverage::parse_coverage_auto(&cov_path, &content);
1040            let mut matched: u32 = 0;
1041            let mut unmatched: u32 = 0;
1042            for record in analyzed.iter_mut() {
1043                record.coverage =
1044                    coverage::lookup_coverage(&cov_map, &record.relative_path).cloned();
1045                if record.coverage.is_some() {
1046                    matched += 1;
1047                } else {
1048                    unmatched += 1;
1049                }
1050            }
1051            tracing::debug!(
1052                path = %cov_path.display(),
1053                coverage_entries = cov_map.len(),
1054                files_matched = matched,
1055                files_unmatched = unmatched,
1056                "coverage attached"
1057            );
1058            if unmatched > 0 && matched == 0 {
1059                tracing::warn!(
1060                    path = %cov_path.display(),
1061                    "coverage file loaded but no source files could be matched — check that paths in the coverage report match the scanned directory"
1062                );
1063            }
1064        }
1065        Err(e) => {
1066            tracing::warn!(path = %cov_path.display(), error = %e, "coverage file could not be read");
1067            warnings.push(format!(
1068                "coverage file '{}' could not be read: {e}",
1069                cov_path.display()
1070            ));
1071        }
1072    }
1073}
1074
1075fn push_record(
1076    record: FileRecord,
1077    analyzed: &mut Vec<FileRecord>,
1078    skipped: &mut Vec<FileRecord>,
1079    warnings: &mut Vec<String>,
1080) {
1081    warnings.extend(
1082        record
1083            .warnings
1084            .iter()
1085            .map(|warning| format!("{}: {warning}", record.relative_path)),
1086    );
1087
1088    match record.status {
1089        FileStatus::AnalyzedExact | FileStatus::AnalyzedBestEffort => analyzed.push(record),
1090        _ => skipped.push(record),
1091    }
1092}
1093
1094/// Convenience wrapper: build a boxed `Skip` outcome with a single-item warning message.
1095#[inline]
1096fn skip_with_reason(
1097    path: &Path,
1098    root: &Path,
1099    size: u64,
1100    reason: impl Into<String>,
1101) -> MetadataPolicyOutcome {
1102    MetadataPolicyOutcome::Skip(Box::new(skipped_record(
1103        path,
1104        root,
1105        size,
1106        FileStatus::SkippedByPolicy,
1107        vec![reason.into()],
1108    )))
1109}
1110
1111/// Apply metadata-level policy checks (symlink, name, dir exclusion, size, globs, lockfile).
1112/// Returns `Skip(record)` to skip, `Exclude` to omit from output entirely (include-glob miss),
1113/// or `Continue` to proceed to content checks.
1114#[allow(clippy::too_many_arguments)]
1115fn check_metadata_policy(
1116    path: &Path,
1117    root: &Path,
1118    relative_path: &str,
1119    metadata: &fs::Metadata,
1120    config: &AppConfig,
1121    include_globs: Option<&GlobSet>,
1122    exclude_globs: Option<&GlobSet>,
1123) -> MetadataPolicyOutcome {
1124    let size = metadata.len();
1125
1126    if metadata.file_type().is_symlink() && !config.discovery.follow_symlinks {
1127        return skip_with_reason(path, root, size, "symlink skipped by policy");
1128    }
1129    if file_name_eq(path, ".gitignore") {
1130        return skip_with_reason(path, root, size, ".gitignore is always excluded");
1131    }
1132    if is_excluded_dir_path(path, &config.discovery.excluded_directories) {
1133        return skip_with_reason(path, root, size, "path matched excluded directory setting");
1134    }
1135    if size > config.discovery.max_file_size_bytes {
1136        return skip_with_reason(
1137            path,
1138            root,
1139            size,
1140            format!(
1141                "file exceeded max_file_size_bytes ({})",
1142                config.discovery.max_file_size_bytes
1143            ),
1144        );
1145    }
1146    if let Some(globs) = include_globs {
1147        if !globs.is_match(Path::new(relative_path)) && !globs.is_match(path) {
1148            return MetadataPolicyOutcome::Exclude;
1149        }
1150    }
1151    if let Some(globs) = exclude_globs {
1152        if globs.is_match(Path::new(relative_path)) || globs.is_match(path) {
1153            return skip_with_reason(path, root, size, "path matched exclude glob");
1154        }
1155    }
1156    if is_known_lockfile(path) && !config.analysis.include_lockfiles {
1157        return skip_with_reason(path, root, size, "lockfile skipped by default policy");
1158    }
1159
1160    MetadataPolicyOutcome::Continue
1161}
1162
/// Outcome of the content-level policy checks performed by `check_content_policy`.
struct ContentPolicyResult {
    /// Whether `is_vendor_path` classified the file's path as vendored.
    vendor: bool,
    /// Whether generated-file detection is enabled and flagged the file.
    generated: bool,
    /// Whether minified-file detection is enabled and flagged the file.
    minified: bool,
    /// `Some(record)` when the file must be skipped; `None` when analysis should go on.
    skip_record: Option<FileRecord>,
}
1169
1170/// Apply content-level policy checks (vendor, generated, minified).
1171/// `skip_record` is `Some` when the file should be skipped.
1172fn check_content_policy(
1173    path: &Path,
1174    root: &Path,
1175    size_bytes: u64,
1176    bytes: &[u8],
1177    config: &AppConfig,
1178) -> ContentPolicyResult {
1179    let vendor = is_vendor_path(path);
1180    if vendor && config.analysis.vendor_directory_detection {
1181        return ContentPolicyResult {
1182            vendor,
1183            generated: false,
1184            minified: false,
1185            skip_record: Some(skipped_record(
1186                path,
1187                root,
1188                size_bytes,
1189                FileStatus::SkippedByPolicy,
1190                vec!["vendor file skipped by policy".into()],
1191            )),
1192        };
1193    }
1194
1195    let generated = config.analysis.generated_file_detection && looks_generated(path, bytes);
1196    if generated {
1197        return ContentPolicyResult {
1198            vendor,
1199            generated,
1200            minified: false,
1201            skip_record: Some(skipped_record(
1202                path,
1203                root,
1204                size_bytes,
1205                FileStatus::SkippedByPolicy,
1206                vec!["generated file skipped by policy".into()],
1207            )),
1208        };
1209    }
1210
1211    let minified = config.analysis.minified_file_detection && looks_minified(path, bytes);
1212    if minified {
1213        return ContentPolicyResult {
1214            vendor,
1215            generated,
1216            minified,
1217            skip_record: Some(skipped_record(
1218                path,
1219                root,
1220                size_bytes,
1221                FileStatus::SkippedByPolicy,
1222                vec!["minified file skipped by policy".into()],
1223            )),
1224        };
1225    }
1226
1227    ContentPolicyResult {
1228        vendor,
1229        generated,
1230        minified,
1231        skip_record: None,
1232    }
1233}
1234
1235/// Decode file bytes to a UTF-8 string, handling binary detection and decode failures.
1236fn decode_file_contents(
1237    path: &Path,
1238    root: &Path,
1239    size_bytes: u64,
1240    bytes: &[u8],
1241    config: &AppConfig,
1242) -> Result<Option<(String, String, Vec<String>)>> {
1243    if is_binary(bytes) {
1244        return match config.analysis.binary_file_behavior {
1245            BinaryFileBehavior::Skip => Ok(None),
1246            BinaryFileBehavior::Fail => {
1247                anyhow::bail!("binary file encountered: {}", path.display())
1248            }
1249        };
1250    }
1251
1252    match decode_bytes(bytes) {
1253        Ok(result) => Ok(Some(result)),
1254        Err(err) => match config.analysis.decode_failure_behavior {
1255            FailureBehavior::WarnSkip => {
1256                // Caller will handle the None as a SkippedDecodeError record.
1257                // We use a sentinel: return Ok(None) but encode the error into a field.
1258                // Instead, propagate as a skipped record via the caller.
1259                let _ = (path, root, size_bytes); // suppress unused warnings
1260                Err(anyhow::anyhow!("__decode_warn__: {err}"))
1261            }
1262            FailureBehavior::Fail => {
1263                anyhow::bail!("decode failure for {}: {err}", path.display())
1264            }
1265        },
1266    }
1267}
1268
1269#[allow(clippy::too_many_lines)]
1270fn analyze_candidate_file(
1271    path: &Path,
1272    root: &Path,
1273    config: &AppConfig,
1274    include_globs: Option<&GlobSet>,
1275    exclude_globs: Option<&GlobSet>,
1276    enabled_languages: Option<&BTreeSet<Language>>,
1277) -> Result<Option<FileRecord>> {
1278    let metadata = match fs::symlink_metadata(path) {
1279        Ok(metadata) => metadata,
1280        Err(err) => {
1281            return Ok(Some(skipped_record(
1282                path,
1283                root,
1284                0,
1285                FileStatus::ErrorInternal,
1286                vec![format!("failed to read metadata: {err}")],
1287            )));
1288        }
1289    };
1290
1291    let relative_path = relative_path_string(path, root);
1292
1293    // Metadata-level policy checks.
1294    match check_metadata_policy(
1295        path,
1296        root,
1297        &relative_path,
1298        &metadata,
1299        config,
1300        include_globs,
1301        exclude_globs,
1302    ) {
1303        MetadataPolicyOutcome::Skip(record) => return Ok(Some(*record)),
1304        MetadataPolicyOutcome::Exclude => return Ok(None),
1305        MetadataPolicyOutcome::Continue => {}
1306    }
1307
1308    let bytes = match fs::read(path) {
1309        Ok(bytes) => bytes,
1310        Err(err) => {
1311            return Ok(Some(skipped_record(
1312                path,
1313                root,
1314                metadata.len(),
1315                FileStatus::ErrorInternal,
1316                vec![format!("failed to read file: {err}")],
1317            )));
1318        }
1319    };
1320
1321    // Content-level policy checks (vendor, generated, minified).
1322    let content_policy = check_content_policy(path, root, metadata.len(), &bytes, config);
1323    if let Some(record) = content_policy.skip_record {
1324        return Ok(Some(record));
1325    }
1326    let (vendor, generated, minified) = (
1327        content_policy.vendor,
1328        content_policy.generated,
1329        content_policy.minified,
1330    );
1331
1332    // Decode content, handling binary and decode failures.
1333    let (text, encoding, decode_warnings) =
1334        match decode_file_contents(path, root, metadata.len(), &bytes, config) {
1335            Ok(Some(result)) => result,
1336            Ok(None) => {
1337                return Ok(Some(skipped_record(
1338                    path,
1339                    root,
1340                    metadata.len(),
1341                    FileStatus::SkippedBinary,
1342                    vec!["binary file skipped by default".into()],
1343                )));
1344            }
1345            Err(err) => {
1346                let msg = err.to_string();
1347                if let Some(warn_msg) = msg.strip_prefix("__decode_warn__: ") {
1348                    return Ok(Some(skipped_record(
1349                        path,
1350                        root,
1351                        metadata.len(),
1352                        FileStatus::SkippedDecodeError,
1353                        vec![warn_msg.to_string()],
1354                    )));
1355                }
1356                return Err(err);
1357            }
1358        };
1359
1360    let first_line = text.lines().next();
1361    let language = detect_language(
1362        path,
1363        first_line,
1364        &config.analysis.extension_overrides,
1365        config.analysis.shebang_detection,
1366    );
1367
1368    let Some(language) = language else {
1369        return Ok(Some(skipped_record(
1370            path,
1371            root,
1372            metadata.len(),
1373            FileStatus::SkippedUnsupported,
1374            vec!["unsupported or undetected language".into()],
1375        )));
1376    };
1377
1378    if let Some(enabled) = enabled_languages {
1379        if !enabled.contains(&language) {
1380            return Ok(Some(skipped_record(
1381                path,
1382                root,
1383                metadata.len(),
1384                FileStatus::SkippedByPolicy,
1385                vec![format!(
1386                    "language {} disabled by configuration",
1387                    language.display_name()
1388                )],
1389            )));
1390        }
1391    }
1392
1393    let style_scope = match config.analysis.style_lang_scope.as_str() {
1394        "c_family" => StyleLangScope::CFamilyOnly,
1395        _ => StyleLangScope::All,
1396    };
1397    let ieee_opts = AnalysisOptions {
1398        blank_in_block_comment_as_comment: config.analysis.blank_in_block_comment_policy
1399            == BlankInBlockCommentPolicy::CountAsComment,
1400        collapse_continuation_lines: config.analysis.continuation_line_policy
1401            == ContinuationLinePolicy::CollapseToLogical,
1402        enable_style: config.analysis.style_analysis_enabled,
1403        style_lang_scope: style_scope,
1404    };
1405    let analysis = analyze_text(language, &text, ieee_opts);
1406    let effective_counts = compute_effective_counts(
1407        &analysis.raw,
1408        config.analysis.mixed_line_policy,
1409        config.analysis.python_docstrings_as_comments,
1410        config.analysis.count_compiler_directives,
1411    );
1412
1413    let mut warnings = decode_warnings;
1414    warnings.extend(analysis.warnings.clone());
1415
1416    Ok(Some(FileRecord {
1417        path: path_to_string(path),
1418        relative_path,
1419        language: Some(language),
1420        size_bytes: metadata.len(),
1421        detected_encoding: Some(encoding),
1422        raw_line_categories: analysis.raw,
1423        effective_counts,
1424        status: match analysis.parse_mode {
1425            ParseMode::Lexical | ParseMode::TreeSitter => FileStatus::AnalyzedExact,
1426            ParseMode::LexicalBestEffort => FileStatus::AnalyzedBestEffort,
1427        },
1428        warnings,
1429        generated,
1430        minified,
1431        vendor,
1432        parse_mode: Some(analysis.parse_mode),
1433        submodule: None,
1434        coverage: None,
1435        style_analysis: analysis.style_analysis,
1436    }))
1437}
1438
1439const fn compute_effective_counts(
1440    raw: &RawLineCounts,
1441    mixed_line_policy: MixedLinePolicy,
1442    python_docstrings_as_comments: bool,
1443    count_compiler_directives: bool,
1444) -> EffectiveCounts {
1445    let mut effective = EffectiveCounts {
1446        code_lines: raw.code_only_lines,
1447        comment_lines: raw.single_comment_only_lines + raw.multi_comment_only_lines,
1448        blank_lines: raw.blank_only_lines,
1449        mixed_lines_separate: 0,
1450    };
1451
1452    if python_docstrings_as_comments {
1453        effective.comment_lines += raw.docstring_comment_lines;
1454    } else {
1455        effective.code_lines += raw.docstring_comment_lines;
1456    }
1457
1458    let mixed_total = raw.mixed_code_single_comment_lines + raw.mixed_code_multi_comment_lines;
1459    match mixed_line_policy {
1460        MixedLinePolicy::CodeOnly => effective.code_lines += mixed_total,
1461        MixedLinePolicy::CodeAndComment => {
1462            effective.code_lines += mixed_total;
1463            effective.comment_lines += mixed_total;
1464        }
1465        MixedLinePolicy::CommentOnly => effective.comment_lines += mixed_total,
1466        MixedLinePolicy::SeparateMixedCategory => effective.mixed_lines_separate += mixed_total,
1467    }
1468
1469    // IEEE 1045-1992 §4.2: optionally exclude preprocessor/compiler directives from code SLOC.
1470    // compiler_directive_lines is a subset of code_only_lines, so subtract it directly.
1471    if !count_compiler_directives {
1472        effective.code_lines = effective
1473            .code_lines
1474            .saturating_sub(raw.compiler_directive_lines);
1475    }
1476
1477    effective
1478}
1479
1480fn build_summary(analyzed: &[FileRecord], skipped: &[FileRecord]) -> SummaryTotals {
1481    let mut summary = SummaryTotals {
1482        files_considered: (analyzed.len() + skipped.len()) as u64,
1483        files_analyzed: analyzed.len() as u64,
1484        files_skipped: skipped.len() as u64,
1485        ..Default::default()
1486    };
1487
1488    for record in analyzed {
1489        summary.total_physical_lines += record.raw_line_categories.total_physical_lines;
1490        summary.code_lines += record.effective_counts.code_lines;
1491        summary.comment_lines += record.effective_counts.comment_lines;
1492        summary.blank_lines += record.effective_counts.blank_lines;
1493        summary.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
1494        summary.functions += record.raw_line_categories.functions;
1495        summary.classes += record.raw_line_categories.classes;
1496        summary.variables += record.raw_line_categories.variables;
1497        summary.imports += record.raw_line_categories.imports;
1498        summary.test_count += record.raw_line_categories.test_count;
1499        summary.test_assertion_count += record.raw_line_categories.test_assertion_count;
1500        summary.test_suite_count += record.raw_line_categories.test_suite_count;
1501        if let Some(cov) = &record.coverage {
1502            summary.coverage_lines_found += u64::from(cov.lines_found);
1503            summary.coverage_lines_hit += u64::from(cov.lines_hit);
1504            summary.coverage_functions_found += u64::from(cov.functions_found);
1505            summary.coverage_functions_hit += u64::from(cov.functions_hit);
1506            summary.coverage_branches_found += u64::from(cov.branches_found);
1507            summary.coverage_branches_hit += u64::from(cov.branches_hit);
1508        }
1509    }
1510
1511    summary
1512}
1513
1514/// Construct a zero-filled `LanguageSummary` for the given language.
1515const fn zeroed_summary(language: Language) -> LanguageSummary {
1516    LanguageSummary {
1517        language,
1518        files: 0,
1519        total_physical_lines: 0,
1520        code_lines: 0,
1521        comment_lines: 0,
1522        blank_lines: 0,
1523        mixed_lines_separate: 0,
1524        functions: 0,
1525        classes: 0,
1526        variables: 0,
1527        imports: 0,
1528        test_count: 0,
1529        test_assertion_count: 0,
1530        test_suite_count: 0,
1531        coverage_lines_found: 0,
1532        coverage_lines_hit: 0,
1533        coverage_functions_found: 0,
1534        coverage_functions_hit: 0,
1535        coverage_branches_found: 0,
1536        coverage_branches_hit: 0,
1537    }
1538}
1539
1540/// Accumulate all per-file counters from `record` into an existing `LanguageSummary`.
1541fn accumulate_record_into_summary(entry: &mut LanguageSummary, record: &FileRecord) {
1542    entry.files += 1;
1543    let r = &record.raw_line_categories;
1544    entry.total_physical_lines += r.total_physical_lines;
1545    entry.code_lines += record.effective_counts.code_lines;
1546    entry.comment_lines += record.effective_counts.comment_lines;
1547    entry.blank_lines += record.effective_counts.blank_lines;
1548    entry.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
1549    entry.functions += r.functions;
1550    entry.classes += r.classes;
1551    entry.variables += r.variables;
1552    entry.imports += r.imports;
1553    entry.test_count += r.test_count;
1554    entry.test_assertion_count += r.test_assertion_count;
1555    entry.test_suite_count += r.test_suite_count;
1556    if let Some(cov) = &record.coverage {
1557        entry.coverage_lines_found += u64::from(cov.lines_found);
1558        entry.coverage_lines_hit += u64::from(cov.lines_hit);
1559        entry.coverage_functions_found += u64::from(cov.functions_found);
1560        entry.coverage_functions_hit += u64::from(cov.functions_hit);
1561        entry.coverage_branches_found += u64::from(cov.branches_found);
1562        entry.coverage_branches_hit += u64::from(cov.branches_hit);
1563    }
1564}
1565
1566fn build_language_summaries(analyzed: &[FileRecord]) -> Vec<LanguageSummary> {
1567    let mut by_language: BTreeMap<Language, LanguageSummary> = BTreeMap::new();
1568    for record in analyzed {
1569        let Some(language) = record.language else {
1570            continue;
1571        };
1572        let entry = by_language
1573            .entry(language)
1574            .or_insert_with(|| zeroed_summary(language));
1575        accumulate_record_into_summary(entry, record);
1576    }
1577    by_language.into_values().collect()
1578}
1579
1580fn skipped_record(
1581    path: &Path,
1582    root: &Path,
1583    size_bytes: u64,
1584    status: FileStatus,
1585    warnings: Vec<String>,
1586) -> FileRecord {
1587    FileRecord {
1588        path: path_to_string(path),
1589        relative_path: relative_path_string(path, root),
1590        language: None,
1591        size_bytes,
1592        detected_encoding: None,
1593        raw_line_categories: RawLineCounts::default(),
1594        effective_counts: EffectiveCounts::default(),
1595        status,
1596        warnings,
1597        generated: false,
1598        minified: false,
1599        vendor: false,
1600        parse_mode: None,
1601        submodule: None,
1602        coverage: None,
1603        style_analysis: None,
1604    }
1605}
1606
/// Render `path` relative to `root` with forward slashes.
///
/// When `path` does not start with `root`, the full path is rendered instead.
/// Non-UTF-8 components are converted lossily.
fn relative_path_string(path: &Path, root: &Path) -> String {
    let relative = match path.strip_prefix(root) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };
    let rendered = relative.to_string_lossy();
    rendered.replace('\\', "/")
}
1613
/// Render `path` as a string with every backslash replaced by a forward slash.
/// Non-UTF-8 components are converted lossily.
fn path_to_string(path: &Path) -> String {
    let lossy = path.to_string_lossy();
    lossy
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
1617
/// Parse `.gitmodules` in `root` and return `(name, relative_path)` for each submodule found.
///
/// A submodule is reported only when its `[submodule "<name>"]` header is followed by a
/// `path = <value>` entry before the next submodule header (or the end of the file).
/// Entries are returned in file order. A missing or unreadable `.gitmodules` yields an
/// empty list.
///
/// Malformed headers such as `[submodule "]` are ignored rather than sliced, and only
/// the exact key `path` is accepted: keys that merely start with `path` are not
/// mistaken for the submodule path.
#[must_use]
pub fn detect_submodules(root: &Path) -> Vec<(String, PathBuf)> {
    let gitmodules = root.join(".gitmodules");
    if !gitmodules.is_file() {
        return Vec::new();
    }
    let Ok(content) = fs::read_to_string(&gitmodules) else {
        return Vec::new();
    };

    let mut result = Vec::new();
    let mut current_name: Option<String> = None;
    let mut current_path: Option<PathBuf> = None;

    for line in content.lines() {
        let trimmed = line.trim();

        // Strip the prefix first and the suffix from what remains, so the two delimiters
        // can never overlap on a degenerate header.
        let header_name = trimmed
            .strip_prefix("[submodule \"")
            .and_then(|rest| rest.strip_suffix("\"]"));

        if let Some(name) = header_name {
            // A new section starts: flush the previous one if it was complete.
            if let (Some(prev_name), Some(prev_path)) = (current_name.take(), current_path.take())
            {
                result.push((prev_name, prev_path));
            }
            current_name = Some(name.to_string());
        } else if let Some(value) = trimmed
            .strip_prefix("path")
            .and_then(|rest| rest.trim_start().strip_prefix('='))
        {
            current_path = Some(PathBuf::from(value.trim()));
        }
    }
    if let (Some(name), Some(path)) = (current_name, current_path) {
        result.push((name, path));
    }

    result
}
1654
1655fn build_submodule_summaries(
1656    analyzed: &[FileRecord],
1657    submodules: &[(String, PathBuf)],
1658) -> Vec<SubmoduleSummary> {
1659    submodules
1660        .iter()
1661        .map(|(name, path)| {
1662            let files: Vec<&FileRecord> = analyzed
1663                .iter()
1664                .filter(|f| f.submodule.as_deref() == Some(name.as_str()))
1665                .collect();
1666
1667            let files_analyzed = files.len() as u64;
1668            let total_physical_lines = files
1669                .iter()
1670                .map(|f| f.raw_line_categories.total_physical_lines)
1671                .sum();
1672            let code_lines = files.iter().map(|f| f.effective_counts.code_lines).sum();
1673            let comment_lines = files.iter().map(|f| f.effective_counts.comment_lines).sum();
1674            let blank_lines = files.iter().map(|f| f.effective_counts.blank_lines).sum();
1675            let language_summaries = build_language_summaries_from_slice(&files);
1676
1677            SubmoduleSummary {
1678                name: name.clone(),
1679                relative_path: path.to_string_lossy().replace('\\', "/"),
1680                files_analyzed,
1681                total_physical_lines,
1682                code_lines,
1683                comment_lines,
1684                blank_lines,
1685                language_summaries,
1686            }
1687        })
1688        .filter(|s| s.files_analyzed > 0)
1689        .collect()
1690}
1691
1692/// Dominant indent label from vote counts.
1693#[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
1694fn dominant_indent_label(files: &[&StyleAnalysis]) -> String {
1695    let mut votes = [0u32; 6];
1696    for f in files {
1697        let idx = match f.indent_style {
1698            IndentStyle::Tabs => 0,
1699            IndentStyle::Spaces2 => 1,
1700            IndentStyle::Spaces4 => 2,
1701            IndentStyle::Spaces8 => 3,
1702            IndentStyle::Mixed => 4,
1703            IndentStyle::Unknown => 5,
1704        };
1705        votes[idx] += 1;
1706    }
1707    let labels = ["Tabs", "2-Space", "4-Space", "8-Space", "Mixed", "\u{2014}"];
1708    labels[votes
1709        .iter()
1710        .enumerate()
1711        .max_by_key(|(_, v)| *v)
1712        .map(|(i, _)| i)
1713        .unwrap_or(5)]
1714    .to_string()
1715}
1716
1717/// Line-80 compliance percentage for a slice of style analyses.
1718#[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
1719fn line80_pct(files: &[&StyleAnalysis]) -> u8 {
1720    if files.is_empty() {
1721        return 0;
1722    }
1723    let compliant = files
1724        .iter()
1725        .filter(|f| f.total_lines == 0 || (f.lines_over_80 as f32 / f.total_lines as f32) <= 0.05)
1726        .count() as u32;
1727    ((compliant * 100) / files.len() as u32) as u8
1728}
1729
1730/// Column-N compliance percentage using the configured threshold (80, 100, or 120).
1731/// Falls back to the 80-col bucket for any threshold ≤ 80.
1732#[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
1733fn line_col_pct(files: &[&StyleAnalysis], threshold: u16) -> u8 {
1734    if files.is_empty() {
1735        return 0;
1736    }
1737    let compliant = files
1738        .iter()
1739        .filter(|f| {
1740            let over = if threshold <= 80 {
1741                f.lines_over_80
1742            } else if threshold <= 100 {
1743                f.lines_over_100
1744            } else {
1745                f.lines_over_120
1746            };
1747            f.total_lines == 0 || (over as f32 / f.total_lines as f32) <= 0.05
1748        })
1749        .count() as u32;
1750    ((compliant * 100) / files.len() as u32) as u8
1751}
1752
1753/// Build a `LanguageStyleGroup` from a non-empty slice of `StyleAnalysis` for one family.
1754#[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
1755fn build_language_group(
1756    family: &str,
1757    files: &[&StyleAnalysis],
1758    col_threshold: u16,
1759) -> LanguageStyleGroup {
1760    let count = files.len() as u32;
1761
1762    // Collect every unique guide name across all files in this group.
1763    let mut all_names: Vec<String> = Vec::new();
1764    for f in files {
1765        for g in &f.guide_scores {
1766            if !all_names.contains(&g.name) {
1767                all_names.push(g.name.clone());
1768            }
1769        }
1770    }
1771
1772    let mut guide_avg_scores: Vec<(String, u8)> = all_names
1773        .into_iter()
1774        .map(|name| {
1775            let sum: u32 = files
1776                .iter()
1777                .filter_map(|f| f.guide_scores.iter().find(|g| g.name == name))
1778                .map(|g| u32::from(g.score_pct))
1779                .sum();
1780            let avg = (sum / count) as u8;
1781            (name, avg)
1782        })
1783        .collect();
1784    guide_avg_scores.sort_by_key(|s| std::cmp::Reverse(s.1));
1785
1786    let (dominant_guide, dominant_score_pct) = guide_avg_scores
1787        .first()
1788        .map(|(n, s)| (n.clone(), *s))
1789        .unwrap_or_default();
1790
1791    let lcp = line_col_pct(files, col_threshold);
1792    LanguageStyleGroup {
1793        language_family: family.to_string(),
1794        files_count: count,
1795        dominant_guide,
1796        dominant_score_pct,
1797        common_indent_style: dominant_indent_label(files),
1798        guide_avg_scores,
1799        line80_compliant_pct: line80_pct(files),
1800        line_col_compliant_pct: lcp,
1801    }
1802}
1803
1804/// Build aggregate multi-language style-guide adherence.
1805/// Returns `None` when no files had style data.
1806#[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
1807fn build_style_summary(analyzed: &[FileRecord], col_threshold: u16) -> Option<StyleSummary> {
1808    let all_style: Vec<&StyleAnalysis> = analyzed
1809        .iter()
1810        .filter_map(|f| f.style_analysis.as_ref())
1811        .collect();
1812
1813    if all_style.is_empty() {
1814        return None;
1815    }
1816
1817    // Group by language_family.
1818    let mut families: std::collections::BTreeMap<&str, Vec<&StyleAnalysis>> =
1819        std::collections::BTreeMap::new();
1820    for sa in &all_style {
1821        families
1822            .entry(sa.language_family.as_str())
1823            .or_default()
1824            .push(sa);
1825    }
1826
1827    let mut by_language: Vec<LanguageStyleGroup> = families
1828        .iter()
1829        .map(|(family, files)| build_language_group(family, files, col_threshold))
1830        .collect();
1831    by_language.sort_by_key(|g| std::cmp::Reverse(g.files_count));
1832
1833    let files_analyzed = all_style.len() as u32;
1834    let common_indent_style = dominant_indent_label(&all_style);
1835    let line80_compliant_pct = line80_pct(&all_style);
1836    let line_col_compliant_pct = line_col_pct(&all_style, col_threshold);
1837
1838    Some(StyleSummary {
1839        files_analyzed,
1840        common_indent_style,
1841        line80_compliant_pct,
1842        line_col_compliant_pct,
1843        col_threshold,
1844        by_language,
1845    })
1846}
1847
1848fn build_language_summaries_from_slice(files: &[&FileRecord]) -> Vec<LanguageSummary> {
1849    let mut map: BTreeMap<String, LanguageSummary> = BTreeMap::new();
1850    for file in files {
1851        let Some(lang) = file.language else { continue };
1852        let entry = map
1853            .entry(lang.display_name().to_string())
1854            .or_insert_with(|| zeroed_summary(lang));
1855        accumulate_record_into_summary(entry, file);
1856    }
1857    map.into_values().collect()
1858}
1859
/// Returns `true` when the final component of `path` is valid UTF-8 and equals `expected`.
///
/// Paths without a file name (empty, root, or ending in `..`) never match.
fn file_name_eq(path: &Path, expected: &str) -> bool {
    let file_name = path.file_name().and_then(|name| name.to_str());
    file_name == Some(expected)
}
1865
/// Returns `true` when any component of `path` exactly equals one of `excluded_dirs`.
///
/// Matching is per component and case-sensitive; components that are not valid UTF-8
/// never match.
fn is_excluded_dir_path(path: &Path, excluded_dirs: &[String]) -> bool {
    for component in path.components() {
        let Some(part) = component.as_os_str().to_str() else {
            continue;
        };
        if excluded_dirs.iter().any(|excluded| excluded.as_str() == part) {
            return true;
        }
    }
    false
}
1874
/// Returns `true` when any component of `path` is a recognised vendored-dependency
/// directory name (`vendor`, `node_modules`, or `packages`).
///
/// Matching is exact and case-sensitive; non-UTF-8 components are ignored.
fn is_vendor_path(path: &Path) -> bool {
    const VENDOR_DIRS: [&str; 3] = ["vendor", "node_modules", "packages"];

    path.components()
        .filter_map(|component| component.as_os_str().to_str())
        .any(|part| VENDOR_DIRS.contains(&part))
}
1883
/// Returns `true` when the file name of `path` is one of the recognised
/// package-manager lockfiles.
///
/// Only the final path component is inspected, and the comparison is case-sensitive.
fn is_known_lockfile(path: &Path) -> bool {
    const LOCKFILES: [&str; 7] = [
        "Cargo.lock",
        "package-lock.json",
        "yarn.lock",
        "pnpm-lock.yaml",
        "Pipfile.lock",
        "poetry.lock",
        "composer.lock",
    ];

    let Some(name) = path.file_name().and_then(|name| name.to_str()) else {
        return false;
    };
    LOCKFILES.contains(&name)
}
1900
1901fn looks_generated(path: &Path, bytes: &[u8]) -> bool {
1902    let file_name = path
1903        .file_name()
1904        .and_then(|name| name.to_str())
1905        .unwrap_or_default();
1906    if file_name.contains(".generated.") || file_name.contains(".g.") {
1907        return true;
1908    }
1909
1910    let sample = String::from_utf8_lossy(&bytes[..bytes.len().min(GENERATED_SAMPLE_BYTES)])
1911        .to_ascii_lowercase();
1912    sample.contains("@generated") || sample.contains("generated by")
1913}
1914
1915fn looks_minified(path: &Path, bytes: &[u8]) -> bool {
1916    let file_name = path
1917        .file_name()
1918        .and_then(|name| name.to_str())
1919        .unwrap_or_default();
1920    if file_name.contains(".min.") {
1921        return true;
1922    }
1923
1924    let sample = String::from_utf8_lossy(&bytes[..bytes.len().min(MINIFIED_SAMPLE_BYTES)]);
1925    let longest_line = sample.lines().map(str::len).max().unwrap_or(0);
1926    let whitespace = sample.chars().filter(|c| c.is_whitespace()).count();
1927    longest_line > MINIFIED_LINE_THRESHOLD && whitespace * 100 < sample.len().max(1)
1928}
1929
1930fn is_binary(bytes: &[u8]) -> bool {
1931    if bytes.starts_with(&[0xEF, 0xBB, 0xBF])
1932        || bytes.starts_with(&[0xFF, 0xFE])
1933        || bytes.starts_with(&[0xFE, 0xFF])
1934    {
1935        return false;
1936    }
1937
1938    let sample = &bytes[..bytes.len().min(BINARY_SAMPLE_BYTES)];
1939    sample.contains(&0)
1940}
1941
1942/// Decode a BOM-stripped UTF-16 byte slice using the given encoding.
1943/// Returns `(text, encoding_label, warnings)`.
1944fn decode_utf16_bom(
1945    bom_stripped: &[u8],
1946    encoding: &'static encoding_rs::Encoding,
1947    label: &str,
1948) -> (String, String, Vec<String>) {
1949    let (cow, _, had_errors) = encoding.decode(bom_stripped);
1950    let mut warnings = Vec::new();
1951    if had_errors {
1952        warnings.push(format!("{label} decode contained replacement characters"));
1953    }
1954    (cow.into_owned(), label.into(), warnings)
1955}
1956
1957fn decode_bytes(bytes: &[u8]) -> std::result::Result<(String, String, Vec<String>), String> {
1958    if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
1959        let text = String::from_utf8(bytes[3..].to_vec()).map_err(|err| err.to_string())?;
1960        return Ok((text, "utf-8-bom".into(), vec![]));
1961    }
1962    if bytes.starts_with(&[0xFF, 0xFE]) {
1963        return Ok(decode_utf16_bom(&bytes[2..], UTF_16LE, "utf-16le"));
1964    }
1965    if bytes.starts_with(&[0xFE, 0xFF]) {
1966        return Ok(decode_utf16_bom(&bytes[2..], UTF_16BE, "utf-16be"));
1967    }
1968
1969    // Multiple statements in the else branch make map_or_else awkward here.
1970    #[allow(clippy::option_if_let_else)]
1971    if let Ok(text) = String::from_utf8(bytes.to_vec()) {
1972        Ok((text, "utf-8".into(), vec![]))
1973    } else {
1974        let (cow, _, had_errors) = WINDOWS_1252.decode(bytes);
1975        let mut warnings = vec!["decoded using windows-1252 fallback".into()];
1976        if had_errors {
1977            warnings.push("fallback decode contained replacement characters".into());
1978        }
1979        Ok((cow.into_owned(), "windows-1252".into(), warnings))
1980    }
1981}
1982
1983fn compile_globset(patterns: &[String]) -> Result<Option<GlobSet>> {
1984    if patterns.is_empty() {
1985        return Ok(None);
1986    }
1987
1988    let mut builder = GlobSetBuilder::new();
1989    for pattern in patterns {
1990        builder
1991            .add(Glob::new(pattern).with_context(|| format!("invalid glob pattern: {pattern}"))?);
1992    }
1993    Ok(Some(
1994        builder.build().context("failed to compile glob filters")?,
1995    ))
1996}
1997
1998fn parse_enabled_languages(enabled: &[String]) -> Result<Option<BTreeSet<Language>>> {
1999    if enabled.is_empty() {
2000        return Ok(None);
2001    }
2002
2003    let supported = supported_languages();
2004    let mut set = BTreeSet::new();
2005    for name in enabled {
2006        let language = Language::from_name(name)
2007            .with_context(|| format!("unsupported language in config: {name}"))?;
2008        if !supported.contains(&language) {
2009            anyhow::bail!("language {name} is not supported in this build");
2010        }
2011        set.insert(language);
2012    }
2013    Ok(Some(set))
2014}
2015
2016/// # Errors
2017///
2018/// Returns an error if serialization fails or the output file cannot be written.
2019pub fn write_json(run: &AnalysisRun, output_path: &Path) -> Result<()> {
2020    let json = serde_json::to_string_pretty(run).context("failed to serialize analysis run")?;
2021    fs::write(output_path, json)
2022        .with_context(|| format!("failed to write JSON output to {}", output_path.display()))
2023}
2024
2025/// # Errors
2026///
2027/// Returns an error if the file cannot be read or the JSON cannot be parsed.
2028pub fn read_json(path: &Path) -> Result<AnalysisRun> {
2029    let contents = fs::read_to_string(path)
2030        .with_context(|| format!("failed to read result file {}", path.display()))?;
2031    serde_json::from_str(&contents)
2032        .with_context(|| format!("failed to parse JSON result {}", path.display()))
2033}
2034
/// Unit tests for effective-count policies, the pure path/content predicates, and
/// byte decoding.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn effective_counts_respect_code_only_policy() {
        let raw = RawLineCounts {
            code_only_lines: 2,
            single_comment_only_lines: 1,
            mixed_code_single_comment_lines: 3,
            docstring_comment_lines: 2,
            ..RawLineCounts::default()
        };
        let counts = compute_effective_counts(&raw, MixedLinePolicy::CodeOnly, true, true);
        assert_eq!(counts.code_lines, 5);
        assert_eq!(counts.comment_lines, 3);
    }

    #[test]
    fn effective_counts_can_separate_mixed() {
        let raw = RawLineCounts {
            mixed_code_single_comment_lines: 2,
            mixed_code_multi_comment_lines: 1,
            ..RawLineCounts::default()
        };
        let counts =
            compute_effective_counts(&raw, MixedLinePolicy::SeparateMixedCategory, true, true);
        assert_eq!(counts.mixed_lines_separate, 3);
        assert_eq!(counts.code_lines, 0);
        assert_eq!(counts.comment_lines, 0);
    }

    #[test]
    fn windows_1252_fallback_decodes() {
        // 0x96 is invalid as UTF-8 but maps to an en dash in windows-1252.
        let bytes = vec![0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x96, 0x57];
        let (text, encoding, warnings) = decode_bytes(&bytes).unwrap();
        assert_eq!(encoding, "windows-1252");
        assert!(text.contains('–'));
        assert!(!warnings.is_empty());
    }

    // ── Pure predicate tests ─────────────────────────────────────────────────

    #[test]
    fn is_binary_detects_null_byte() {
        let bytes = b"hello\x00world";
        assert!(is_binary(bytes));
    }

    #[test]
    fn is_binary_clean_text_is_not_binary() {
        let bytes = b"fn main() { println!(\"hello\"); }";
        assert!(!is_binary(bytes));
    }

    #[test]
    fn is_binary_utf8_bom_not_binary() {
        let bytes = b"\xef\xbb\xbffn main() {}";
        assert!(!is_binary(bytes));
    }

    #[test]
    fn is_binary_utf16_bom_not_binary() {
        // UTF-16 text is full of NUL bytes; the BOM must short-circuit the NUL scan.
        let little_endian = [0xFF, 0xFE, b'h', 0x00, b'i', 0x00];
        assert!(!is_binary(&little_endian));
        let big_endian = [0xFE, 0xFF, 0x00, b'h', 0x00, b'i'];
        assert!(!is_binary(&big_endian));
    }

    #[test]
    fn looks_generated_at_generated_marker() {
        let bytes = b"// @generated by protoc-gen-rust\nfn foo() {}";
        assert!(looks_generated(Path::new("foo.rs"), bytes));
    }

    #[test]
    fn looks_generated_do_not_edit_marker() {
        // "Code generated by" triggers detection (contains the "generated by" substring).
        let bytes = b"// Code generated by build.rs. DO NOT EDIT.\nuse foo;";
        assert!(looks_generated(Path::new("foo.rs"), bytes));
        // @generated also triggers detection independently.
        let bytes2 = b"// @generated\nuse foo;";
        assert!(looks_generated(Path::new("foo.rs"), bytes2));
    }

    #[test]
    fn looks_generated_filename_markers() {
        // The file name alone is enough, regardless of content.
        assert!(looks_generated(Path::new("api.generated.ts"), b""));
        assert!(looks_generated(Path::new("Form.g.cs"), b"class Form {}"));
    }

    #[test]
    fn looks_generated_normal_file_not_generated() {
        let bytes = b"fn main() {\n    println!(\"hello\");\n}\n";
        assert!(!looks_generated(Path::new("main.rs"), bytes));
    }

    #[test]
    fn looks_minified_dot_min_filename() {
        let bytes = b"function a(){return 1}";
        assert!(looks_minified(Path::new("bundle.min.js"), bytes));
    }

    #[test]
    fn looks_minified_normal_file_not_minified() {
        let bytes = b"function hello() {\n    return 1;\n}\n";
        assert!(!looks_minified(Path::new("app.js"), bytes));
    }

    #[test]
    fn looks_minified_very_long_line() {
        let long_line: Vec<u8> = b"x".repeat(MINIFIED_LINE_THRESHOLD + 1);
        assert!(looks_minified(Path::new("app.js"), &long_line));
    }

    #[test]
    fn is_known_lockfile_cargo_lock() {
        assert!(is_known_lockfile(Path::new("Cargo.lock")));
    }

    #[test]
    fn is_known_lockfile_package_lock_json() {
        assert!(is_known_lockfile(Path::new("package-lock.json")));
    }

    #[test]
    fn is_known_lockfile_yarn_lock() {
        assert!(is_known_lockfile(Path::new("yarn.lock")));
    }

    #[test]
    fn is_known_lockfile_normal_file_is_not_lockfile() {
        assert!(!is_known_lockfile(Path::new("src/lib.rs")));
    }

    #[test]
    fn is_vendor_path_node_modules() {
        assert!(is_vendor_path(Path::new("node_modules/react/index.js")));
    }

    #[test]
    fn is_vendor_path_vendor_dir() {
        assert!(is_vendor_path(Path::new("vendor/anyhow/src/lib.rs")));
    }

    #[test]
    fn is_vendor_path_normal_src_is_not_vendor() {
        assert!(!is_vendor_path(Path::new("src/lib.rs")));
    }

    #[test]
    fn is_excluded_dir_path_matches_excluded() {
        let excluded = vec![".git".into(), "target".into()];
        assert!(is_excluded_dir_path(Path::new(".git/config"), &excluded));
    }

    #[test]
    fn is_excluded_dir_path_non_excluded_is_ok() {
        let excluded = vec![".git".into(), "target".into()];
        assert!(!is_excluded_dir_path(Path::new("src/main.rs"), &excluded));
    }

    #[test]
    fn file_name_eq_matches_only_final_component() {
        assert!(file_name_eq(Path::new("repo/.gitmodules"), ".gitmodules"));
        assert!(!file_name_eq(Path::new("repo/.gitmodules/x"), ".gitmodules"));
        assert!(!file_name_eq(Path::new(""), ""));
    }

    #[test]
    fn decode_bytes_utf8_bom_stripped() {
        let bytes = b"\xef\xbb\xbffn main() {}";
        let (text, encoding, warnings) = decode_bytes(bytes).unwrap();
        // Exact label: a `contains("utf-8")` check would also pass for the non-BOM path.
        assert_eq!(encoding, "utf-8-bom");
        assert_eq!(text, "fn main() {}");
        assert!(warnings.is_empty());
    }

    #[test]
    fn decode_bytes_utf16le_bom() {
        let bytes = [0xFF, 0xFE, b'h', 0x00, b'i', 0x00];
        let (text, encoding, warnings) = decode_bytes(&bytes).unwrap();
        assert_eq!(encoding, "utf-16le");
        assert_eq!(text, "hi");
        assert!(warnings.is_empty());
    }

    #[test]
    fn decode_bytes_utf16be_bom() {
        let bytes = [0xFE, 0xFF, 0x00, b'h', 0x00, b'i'];
        let (text, encoding, warnings) = decode_bytes(&bytes).unwrap();
        assert_eq!(encoding, "utf-16be");
        assert_eq!(text, "hi");
        assert!(warnings.is_empty());
    }

    #[test]
    fn decode_bytes_plain_utf8() {
        let bytes = b"hello world";
        let (text, encoding, warnings) = decode_bytes(bytes).unwrap();
        assert_eq!(encoding, "utf-8");
        assert_eq!(text, "hello world");
        assert!(warnings.is_empty());
    }
}