Skip to main content

sloc_core/
lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright (C) 2026 Nima Shafie <nimzshafie@gmail.com>
3#![allow(clippy::multiple_crate_versions)]
4
5pub mod baseline;
6pub mod coverage;
7pub mod delta;
8pub mod history;
9pub use baseline::{check_against_baseline, resolve_baselines_path, BaselineEntry, BaselineStore};
10pub use coverage::{aggregate_line_coverage, lookup_coverage, parse_lcov, FileCoverage};
11pub use delta::{compute_delta, FileChangeStatus, FileDelta, ScanComparison, SummaryDelta};
12pub use history::{RegistryEntry, ScanRegistry, ScanSummarySnapshot, WatchedDirsStore};
13
14use std::collections::{BTreeMap, BTreeSet, HashSet};
15use std::fs;
16use std::path::{Path, PathBuf};
17use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
18use std::sync::Arc;
19
20use anyhow::{Context, Result};
21use chrono::{DateTime, Utc};
22use encoding_rs::{UTF_16BE, UTF_16LE, WINDOWS_1252};
23use globset::{Glob, GlobSet, GlobSetBuilder};
24use ignore::WalkBuilder;
25use serde::{Deserialize, Serialize};
26use uuid::Uuid;
27
28use sloc_config::{
29    AppConfig, BinaryFileBehavior, BlankInBlockCommentPolicy, ContinuationLinePolicy,
30    FailureBehavior, MixedLinePolicy,
31};
32use sloc_languages::{
33    analyze_text, detect_language, supported_languages, AnalysisOptions, Language, ParseMode,
34    RawLineCounts,
35};
36
37// ── Detection sample sizes and thresholds ────────────────────────────────────
38
/// Upper bound on the number of worker threads used for parallel file analysis;
/// the detected parallelism is clamped to this value.
const MAX_ANALYSIS_THREADS: usize = 16;
/// Fallback thread count when `available_parallelism` is unavailable.
const DEFAULT_ANALYSIS_THREADS: usize = 4;
/// Byte sample used to detect `@generated` markers.
const GENERATED_SAMPLE_BYTES: usize = 1024;
/// Byte sample used to detect minified files via line-length heuristic.
const MINIFIED_SAMPLE_BYTES: usize = 4096;
/// Longest line length above which a file is considered minified.
const MINIFIED_LINE_THRESHOLD: usize = 2000;
/// Byte sample used to detect binary files via null-byte scan.
const BINARY_SAMPLE_BYTES: usize = 8192;
51
/// Atomics shared between `analyze()` and the caller so the caller can poll scan progress.
///
/// Both counters are updated with relaxed ordering, so they are suitable for
/// progress display but not for synchronising other data.
pub struct ProgressCounters {
    /// Number of candidate files processed so far (incremented per file, across all threads).
    pub files_done: Arc<AtomicUsize>,
    /// Total candidate files discovered. Each walked root adds its own file count
    /// before that root's parallel analysis begins, so the value grows root by root.
    pub files_total: Arc<AtomicUsize>,
}
59
/// Three-way outcome for metadata-level policy checks.
///
/// NOTE(review): the code that produces and consumes this enum is outside this
/// view; the variant docs describe the intended handling.
enum MetadataPolicyOutcome {
    /// Skip this file — include the record in output.
    /// The record is boxed (presumably to keep the enum itself small).
    Skip(Box<FileRecord>),
    /// Exclude this file entirely — no record in output (include-glob miss).
    Exclude,
    /// Continue to content checks.
    Continue,
}
69
/// Processing outcome stored in [`FileRecord::status`].
///
/// Serialized in `snake_case` (for example `analyzed_exact`, `skipped_binary`).
/// NOTE(review): the rules that assign each variant live in analysis code not
/// shown here; the variant names are the only description available in this view.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FileStatus {
    AnalyzedExact,
    AnalyzedBestEffort,
    SkippedBinary,
    SkippedDecodeError,
    SkippedUnsupported,
    SkippedByPolicy,
    ErrorInternal,
}
81
/// Per-file line totals stored in [`FileRecord::effective_counts`], alongside the
/// raw per-category counts in `raw_line_categories`.
///
/// NOTE(review): presumably these are the raw counts after the configured
/// mixed-line / continuation policies are applied — confirm against the code
/// that fills this struct. All fields default to zero.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct EffectiveCounts {
    pub code_lines: u64,
    pub comment_lines: u64,
    pub blank_lines: u64,
    pub mixed_lines_separate: u64,
}
89
/// Identifies the tool build and the individual run that produced an [`AnalysisRun`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolMetadata {
    /// Tool name; `assemble_run` sets this to `"sloc"`.
    pub name: String,
    /// Crate version taken from `CARGO_PKG_VERSION` at compile time.
    pub version: String,
    /// Unique run identifier: `<YYYYMMDD-HHMM>-<uuid v4 without hyphens>`.
    pub run_id: String,
    /// UTC timestamp captured when the run record was assembled.
    pub timestamp_utc: DateTime<Utc>,
}
97
/// Describes the machine and user context a scan ran in.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnvironmentMetadata {
    /// Value of `std::env::consts::OS` for the running binary.
    pub operating_system: String,
    /// Value of `std::env::consts::ARCH` for the running binary.
    pub architecture: String,
    /// Caller-supplied label describing how the tool was invoked.
    pub runtime_mode: String,
    /// User name from the `USERNAME` or `USER` environment variable, else `"unknown"`.
    pub initiator_username: String,
    /// Host name, or a CI agent/runner name when one is available from the environment.
    pub initiator_hostname: String,
    /// CI system name when the scan runs inside a known CI environment (Jenkins,
    /// GitHub Actions, GitLab CI, …). `None` for interactive / local runs.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ci_name: Option<String>,
}
110
/// Run-wide totals, built from the analyzed and skipped file records.
///
/// Fields marked `#[serde(default)]` fall back to `0` when missing from
/// serialized input, so records written without them still deserialize.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SummaryTotals {
    pub files_considered: u64,
    pub files_analyzed: u64,
    pub files_skipped: u64,
    pub total_physical_lines: u64,
    pub code_lines: u64,
    pub comment_lines: u64,
    pub blank_lines: u64,
    pub mixed_lines_separate: u64,
    #[serde(default)]
    pub functions: u64,
    #[serde(default)]
    pub classes: u64,
    #[serde(default)]
    pub variables: u64,
    #[serde(default)]
    pub imports: u64,
    #[serde(default)]
    pub test_count: u64,
    /// Lexically detected test assertion call lines across all analyzed files.
    #[serde(default)]
    pub test_assertion_count: u64,
    /// Lexically detected test suite / fixture / group declaration lines across all analyzed files.
    #[serde(default)]
    pub test_suite_count: u64,
    /// Aggregated from LCOV data when provided.
    #[serde(default)]
    pub coverage_lines_found: u64,
    // NOTE(review): the remaining coverage_* fields are presumably aggregated
    // from the same LCOV data as `coverage_lines_found` — confirm.
    #[serde(default)]
    pub coverage_lines_hit: u64,
    #[serde(default)]
    pub coverage_functions_found: u64,
    #[serde(default)]
    pub coverage_functions_hit: u64,
    #[serde(default)]
    pub coverage_branches_found: u64,
    #[serde(default)]
    pub coverage_branches_hit: u64,
}
151
/// Totals for a single language, built from the analyzed file records.
///
/// Mirrors the counters of [`SummaryTotals`], scoped to `language`. Fields marked
/// `#[serde(default)]` fall back to `0` when missing from serialized input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageSummary {
    /// Language these totals belong to.
    pub language: Language,
    /// Number of files counted for this language.
    pub files: u64,
    pub total_physical_lines: u64,
    pub code_lines: u64,
    pub comment_lines: u64,
    pub blank_lines: u64,
    pub mixed_lines_separate: u64,
    #[serde(default)]
    pub functions: u64,
    #[serde(default)]
    pub classes: u64,
    #[serde(default)]
    pub variables: u64,
    #[serde(default)]
    pub imports: u64,
    #[serde(default)]
    pub test_count: u64,
    #[serde(default)]
    pub test_assertion_count: u64,
    #[serde(default)]
    pub test_suite_count: u64,
    #[serde(default)]
    pub coverage_lines_found: u64,
    #[serde(default)]
    pub coverage_lines_hit: u64,
    #[serde(default)]
    pub coverage_functions_found: u64,
    #[serde(default)]
    pub coverage_functions_hit: u64,
    #[serde(default)]
    pub coverage_branches_found: u64,
    #[serde(default)]
    pub coverage_branches_hit: u64,
}
188
/// Everything recorded about one discovered file, whether it was analyzed or skipped.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileRecord {
    /// Path of the file as a string.
    pub path: String,
    /// Path relative to the scanned root. Submodule labelling matches this against
    /// forward-slash prefixes, so it is presumably `/`-separated — confirm at the producer.
    pub relative_path: String,
    /// Detected language, if any.
    pub language: Option<Language>,
    pub size_bytes: u64,
    /// Name of the text encoding detected for the file, if any.
    pub detected_encoding: Option<String>,
    /// Raw per-category line counts reported by the language analyzer.
    pub raw_line_categories: RawLineCounts,
    /// Line totals reported for this file (see [`EffectiveCounts`]).
    pub effective_counts: EffectiveCounts,
    /// Outcome of processing this file.
    pub status: FileStatus,
    /// Warnings attached to this file.
    pub warnings: Vec<String>,
    pub generated: bool,
    pub minified: bool,
    pub vendor: bool,
    pub parse_mode: Option<ParseMode>,
    /// Name of the git submodule containing this file, when submodule labelling ran
    /// and a submodule path prefixes `relative_path`. Omitted from output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub submodule: Option<String>,
    /// Line/function/branch coverage from an external LCOV file, when provided.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub coverage: Option<FileCoverage>,
}
210
/// Per-submodule aggregated stats produced when `submodule_breakdown` is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SubmoduleSummary {
    /// Submodule name; the same value is written to [`FileRecord::submodule`].
    pub name: String,
    /// Submodule path (presumably relative to the scanned root — confirm at the producer).
    pub relative_path: String,
    pub files_analyzed: u64,
    pub total_physical_lines: u64,
    pub code_lines: u64,
    pub comment_lines: u64,
    pub blank_lines: u64,
    /// Per-language totals restricted to this submodule.
    pub language_summaries: Vec<LanguageSummary>,
}
223
/// Complete, serializable result of one scan: metadata, totals, per-file records
/// and best-effort git context.
///
/// Every `git_*` field is optional and omitted from serialized output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnalysisRun {
    /// Tool name/version plus the unique id and timestamp of this run.
    pub tool: ToolMetadata,
    /// OS, architecture, user, host and CI context of the run.
    pub environment: EnvironmentMetadata,
    /// Clone of the configuration the scan ran with.
    pub effective_configuration: AppConfig,
    /// The configured root paths, rendered as strings.
    pub input_roots: Vec<String>,
    /// Run-wide totals.
    pub summary_totals: SummaryTotals,
    /// Totals broken down per language.
    pub totals_by_language: Vec<LanguageSummary>,
    /// Records for files that were analyzed.
    pub per_file_records: Vec<FileRecord>,
    /// Records for files that were skipped.
    pub skipped_file_records: Vec<FileRecord>,
    /// Run-level warnings (for example discovery warnings raised while walking).
    pub warnings: Vec<String>,
    /// Non-empty only when `discovery.submodule_breakdown` is enabled.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub submodule_summaries: Vec<SubmoduleSummary>,
    /// Short git commit SHA (7 chars) at scan time, if the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_short: Option<String>,
    /// Full git commit SHA at scan time, if the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_long: Option<String>,
    /// Git branch active at scan time, if the project is a git repo.
    /// May instead come from CI environment variables when HEAD is detached.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_branch: Option<String>,
    /// Author of the last git commit at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_author: Option<String>,
    /// Comma-separated git tags pointing at HEAD at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_tags: Option<String>,
    /// Nearest ancestor release tag (output of `git describe --tags --abbrev=0`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_nearest_tag: Option<String>,
    /// ISO 8601 author-date of the last git commit at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_date: Option<String>,
    /// URL of the `origin` remote as recorded in `.git/config` at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_remote_url: Option<String>,
}
263
/// Internal bundle of best-effort git metadata gathered for a run.
///
/// Each field is copied into the matching `git_*` field of [`AnalysisRun`].
/// The `Default` value (all `None`) represents "no git information available".
#[derive(Default)]
struct GitInfo {
    /// First 7 characters of `commit_long`.
    commit_short: Option<String>,
    /// Full commit SHA that HEAD resolves to.
    commit_long: Option<String>,
    /// Branch name from HEAD, or from CI environment variables as a fallback.
    branch: Option<String>,
    /// Author name of the HEAD commit (from the `git` CLI).
    author: Option<String>,
    /// Tags pointing at HEAD, joined with `", "` (from the `git` CLI).
    tags: Option<String>,
    /// Output of `git describe --tags --abbrev=0 HEAD`.
    nearest_tag: Option<String>,
    /// Author date of the HEAD commit in strict ISO 8601 (`%aI`, from the `git` CLI).
    commit_date: Option<String>,
    /// URL of the `origin` remote read from the git config file.
    remote_url: Option<String>,
}
275
276/// Locate the `.git` directory by walking up from `start`.
277/// Handles plain repos, worktrees (`.git` is a file with `gitdir:` pointer), and
278/// submodules. Returns `None` if no git repo is found.
279fn find_git_dir(start: &Path) -> Option<PathBuf> {
280    let mut current = Some(start);
281    while let Some(dir) = current {
282        let candidate = dir.join(".git");
283        if candidate.is_dir() {
284            return Some(candidate);
285        }
286        if candidate.is_file() {
287            if let Some(resolved) = resolve_git_file_pointer(&candidate, dir) {
288                return Some(resolved);
289            }
290        }
291        current = dir.parent();
292    }
293    None
294}
295
/// Follow a `.git` pointer *file* (as used by worktrees and submodules) to the
/// directory named on its `gitdir: ` line.
///
/// Relative targets are interpreted against `base_dir`. Returns `None` when the
/// file cannot be read, does not start with `gitdir: `, or the target is not an
/// existing directory.
fn resolve_git_file_pointer(file: &Path, base_dir: &Path) -> Option<PathBuf> {
    let text = fs::read_to_string(file).ok()?;
    let target = text.trim().strip_prefix("gitdir: ")?;

    // Pointer files use forward slashes; convert to the platform separator so
    // the path behaves correctly on Windows as well.
    let native = target.replace('/', std::path::MAIN_SEPARATOR_STR);
    let native_path = Path::new(&native);
    let joined = if native_path.is_absolute() {
        native_path.to_path_buf()
    } else {
        base_dir.join(native_path)
    };

    // Prefer the canonical form (".." and symlinks resolved); when
    // canonicalisation fails, keep the path exactly as joined.
    let located = match joined.canonicalize() {
        Ok(canonical) => canonical,
        Err(_) => joined,
    };
    located.is_dir().then_some(located)
}
320
/// Resolve a git ref name (e.g. `refs/heads/main`) to a full commit SHA.
///
/// The loose ref file under `git_dir` is consulted first; its content is accepted
/// only when it is at least 40 characters of hexadecimal. Otherwise `packed-refs`
/// is scanned for a `<sha> <refname>` line. Comment (`#`), peeled-tag (`^`) and
/// malformed lines are skipped, so one bad line cannot hide a valid entry that
/// follows it.
///
/// Returns `None` when neither source yields a match.
fn resolve_ref(git_dir: &Path, refname: &str) -> Option<String> {
    // Join each forward-slash component separately so the loose-ref path uses
    // the correct separator on every platform.
    let ref_path = refname
        .split('/')
        .fold(git_dir.to_path_buf(), |p, c| p.join(c));
    if let Ok(raw) = fs::read_to_string(&ref_path) {
        let sha = raw.trim();
        if sha.len() >= 40 && sha.chars().all(|c| c.is_ascii_hexdigit()) {
            return Some(sha.to_string());
        }
    }

    // str::lines() handles both \n and \r\n, so Windows line endings are fine.
    let packed = fs::read_to_string(git_dir.join("packed-refs")).ok()?;
    packed
        .lines()
        .filter(|line| !line.starts_with('#') && !line.starts_with('^'))
        // Lines without a space separator (blank or malformed) are skipped
        // rather than aborting the whole lookup.
        .filter_map(|line| line.split_once(' '))
        .find(|(_, name)| name.trim() == refname)
        .map(|(sha, _)| sha.to_string())
}
356
/// Pull the value out of a git-config `url = <value>` line.
///
/// The line must begin with the literal key `url`, optionally followed by spaces
/// or tabs, then `=`. The value is trimmed; `None` is returned for any other line
/// or when the value is empty.
fn parse_url_line(line: &str) -> Option<&str> {
    let after_key = line.strip_prefix("url")?;
    let after_equals = after_key
        .trim_start_matches(|c: char| c == ' ' || c == '\t')
        .strip_prefix('=')?;
    Some(after_equals.trim()).filter(|value| !value.is_empty())
}
368
369/// Parse `.git/config` and return the URL of the `origin` remote, if present.
370fn read_git_remote_url(git_dir: &Path) -> Option<String> {
371    let config = fs::read_to_string(git_dir.join("config")).ok()?;
372    let mut in_origin = false;
373    for line in config.lines() {
374        let trimmed = line.trim();
375        if trimmed.starts_with('[') {
376            in_origin = trimmed == r#"[remote "origin"]"#;
377        } else if in_origin {
378            if let Some(url) = parse_url_line(trimmed) {
379                return Some(url.to_owned());
380            }
381        }
382    }
383    None
384}
385
386/// Detect git metadata by reading `.git/` files directly — no `git` executable
387/// needed. Falls back gracefully for detached HEADs, shallow clones, and missing
388/// reflogs.
389fn detect_git_for_run(project_path: &Path) -> GitInfo {
390    // Resolve the CI branch early so it can fill in any gap in git metadata.
391    let ci_branch = ci_branch_from_env();
392
393    let Some(git_dir) = find_git_dir(project_path) else {
394        // No .git directory (e.g. scanning a non-repo path in CI). Use whatever
395        // the CI system tells us about the branch.
396        return GitInfo {
397            branch: ci_branch,
398            ..GitInfo::default()
399        };
400    };
401
402    let head_raw = match fs::read_to_string(git_dir.join("HEAD")) {
403        Ok(s) => s.trim().to_string(),
404        Err(_) => {
405            return GitInfo {
406                branch: ci_branch,
407                ..GitInfo::default()
408            }
409        }
410    };
411
412    let (branch_from_head, commit_long) = head_raw.strip_prefix("ref: ").map_or_else(
413        || {
414            if head_raw.len() >= 40 && head_raw.chars().all(|c| c.is_ascii_hexdigit()) {
415                // Detached HEAD — HEAD file is the commit SHA (common in CI checkouts).
416                (None, Some(head_raw[..40].to_string()))
417            } else {
418                (None, None)
419            }
420        },
421        |refname| {
422            let branch = refname
423                .strip_prefix("refs/heads/")
424                .map(|b| b.trim().to_string());
425            let sha = resolve_ref(&git_dir, refname.trim());
426            (branch, sha)
427        },
428    );
429    // Prefer the branch name derived from the HEAD ref; fall back to the CI
430    // env var (covers detached-HEAD checkouts done by Jenkins, GitHub Actions, etc.).
431    let branch = branch_from_head.or(ci_branch);
432
433    let commit_short = commit_long
434        .as_deref()
435        .map(|s| s.chars().take(7).collect::<String>());
436
437    let author = run_git_cmd(project_path, &["log", "-1", "--format=%an", "HEAD"]);
438    let commit_date = run_git_cmd(project_path, &["log", "-1", "--format=%aI", "HEAD"]);
439    let remote_url = read_git_remote_url(&git_dir);
440
441    // Tags and nearest-tag still require git CLI — try it as a best-effort bonus
442    // but don't block on it. If git isn't available these will simply be None.
443    let tags = run_git_cmd(project_path, &["tag", "--points-at", "HEAD"]).map(|t| {
444        t.lines()
445            .filter(|l| !l.is_empty())
446            .collect::<Vec<_>>()
447            .join(", ")
448    });
449    let nearest_tag = run_git_cmd(project_path, &["describe", "--tags", "--abbrev=0", "HEAD"]);
450
451    GitInfo {
452        commit_short,
453        commit_long,
454        branch,
455        author,
456        tags,
457        nearest_tag,
458        commit_date,
459        remote_url,
460    }
461}
462
/// Run a git command in `dir` as a best-effort supplemental source.
///
/// Returns the trimmed stdout when the command exits successfully with non-empty
/// UTF-8 output, and `None` otherwise.
///
/// Several executable locations are tried so that service accounts with a
/// stripped `PATH` can still find git. The next location is tried only when the
/// current one cannot be launched at all. Once a git executable has actually run,
/// its result is final: a command that legitimately fails (for example
/// `git describe` in a repository without tags) is not re-run against every
/// remaining candidate.
fn run_git_cmd(dir: &Path, args: &[&str]) -> Option<String> {
    // Unix paths fail to launch on Windows and vice-versa, which is harmless.
    const CANDIDATES: &[&str] = &[
        // Works on all platforms when git is on PATH
        "git",
        // Common Linux / macOS install locations
        "/usr/bin/git",
        "/usr/local/bin/git",
        "/opt/homebrew/bin/git",
        // Git for Windows default installation paths
        r"C:\Program Files\Git\cmd\git.exe",
        r"C:\Program Files\Git\bin\git.exe",
        r"C:\Program Files (x86)\Git\cmd\git.exe",
    ];
    for &exe in CANDIDATES {
        let Ok(output) = std::process::Command::new(exe)
            .args(["-c", "safe.directory=*"])
            .args(args)
            .current_dir(dir)
            .output()
        else {
            // This candidate could not be launched — try the next location.
            continue;
        };
        if !output.status.success() {
            return None;
        }
        return String::from_utf8(output.stdout)
            .ok()
            .map(|s| s.trim().to_string())
            .filter(|s| !s.is_empty());
    }
    None
}
497
/// Identify the CI system hosting this process from its environment variables.
///
/// Checks run in a fixed priority order: Jenkins (any of its variables set),
/// then the systems that export a flag equal to `"true"`, then TeamCity.
/// Returns `None` when nothing matches.
fn detect_ci_system() -> Option<&'static str> {
    /// Systems recognised by a flag variable whose value must be exactly `true`.
    const TRUE_FLAGS: &[(&str, &str)] = &[
        ("GITHUB_ACTIONS", "GitHub Actions"),
        ("GITLAB_CI", "GitLab CI"),
        ("CIRCLECI", "CircleCI"),
        ("TRAVIS", "Travis CI"),
        ("TF_BUILD", "Azure DevOps"),
    ];

    fn is_set(key: &str) -> bool {
        std::env::var(key).is_ok()
    }
    fn is_true(key: &str) -> bool {
        matches!(std::env::var(key).as_deref(), Ok("true"))
    }

    if is_set("JENKINS_URL") || is_set("JENKINS_HOME") || is_set("BUILD_URL") {
        return Some("Jenkins");
    }
    if let Some(&(_, label)) = TRUE_FLAGS.iter().find(|&&(flag, _)| is_true(flag)) {
        return Some(label);
    }
    if is_set("TEAMCITY_VERSION") {
        Some("TeamCity")
    } else {
        None
    }
}
525
/// Look up the branch name in well-known CI environment variables.
///
/// Variables are consulted in priority order. Each value is trimmed and has a
/// leading `refs/heads/` (or, failing that, `origin/`) removed. Empty results and
/// the literal `HEAD` are ignored, and the first usable value wins.
fn ci_branch_from_env() -> Option<String> {
    const VARS: &[&str] = &[
        "BRANCH_NAME",        // Jenkins Pipeline
        "GIT_BRANCH",         // Jenkins Freestyle (may carry "origin/<branch>")
        "GITHUB_REF_NAME",    // GitHub Actions
        "CI_COMMIT_BRANCH",   // GitLab CI
        "CIRCLE_BRANCH",      // CircleCI
        "TRAVIS_BRANCH",      // Travis CI
        "BUILD_SOURCEBRANCH", // Azure DevOps (may carry "refs/heads/<branch>")
    ];
    VARS.iter().find_map(|&name| {
        let raw = std::env::var(name).ok()?;
        let trimmed = raw.trim();
        let branch = match trimmed.strip_prefix("refs/heads/") {
            Some(rest) => rest,
            None => trimmed.strip_prefix("origin/").unwrap_or(trimmed),
        };
        if branch.is_empty() || branch == "HEAD" {
            None
        } else {
            Some(branch.to_string())
        }
    })
}
552
/// Return the current user name from `USERNAME`, then `USER`, or `"unknown"`
/// when neither variable can be read.
fn get_current_username() -> String {
    match std::env::var("USERNAME") {
        Ok(name) => name,
        Err(_) => std::env::var("USER").unwrap_or_else(|_| "unknown".to_string()),
    }
}
558
/// Read environment variable `var`, treating an unset, unreadable or empty value
/// as absent.
fn non_empty_env(var: &str) -> Option<String> {
    std::env::var(var).ok().filter(|value| !value.is_empty())
}
567
/// True when any Jenkins marker variable (`JENKINS_URL`, `JENKINS_HOME`,
/// `BUILD_URL`) is set, regardless of its value.
fn is_jenkins_env() -> bool {
    ["JENKINS_URL", "JENKINS_HOME", "BUILD_URL"]
        .iter()
        .any(|&marker| std::env::var(marker).is_ok())
}
573
574fn get_hostname() -> String {
575    // In CI environments prefer a human-readable agent/runner identifier over
576    // whatever hostname the container was assigned.
577    if is_jenkins_env() {
578        if let Some(n) = non_empty_env("NODE_NAME") {
579            return n;
580        }
581    }
582    if std::env::var("GITHUB_ACTIONS").as_deref() == Ok("true") {
583        if let Some(r) = non_empty_env("RUNNER_NAME") {
584            return r;
585        }
586    }
587    if std::env::var("GITLAB_CI").as_deref() == Ok("true") {
588        if let Some(r) = non_empty_env("CI_RUNNER_DESCRIPTION") {
589            return r;
590        }
591    }
592    std::env::var("COMPUTERNAME")
593        .or_else(|_| std::env::var("HOSTNAME"))
594        .or_else(|_| std::fs::read_to_string("/etc/hostname").map(|s| s.trim().to_string()))
595        .unwrap_or_else(|_| "unknown".to_string())
596}
597
598/// Walk a single directory root and collect file records into the output vectors.
599#[allow(clippy::too_many_arguments)]
600fn walk_root(
601    root: &Path,
602    config: &AppConfig,
603    include_globs: Option<&GlobSet>,
604    exclude_globs: Option<&GlobSet>,
605    enabled_languages: Option<&BTreeSet<Language>>,
606    seen_paths: &mut HashSet<PathBuf>,
607    analyzed: &mut Vec<FileRecord>,
608    skipped: &mut Vec<FileRecord>,
609    warnings: &mut Vec<String>,
610    cancel: Option<&AtomicBool>,
611    progress: Option<&ProgressCounters>,
612) -> Result<()> {
613    let mut builder = WalkBuilder::new(root);
614    builder
615        .follow_links(config.discovery.follow_symlinks)
616        .hidden(config.discovery.ignore_hidden_files)
617        .ignore(config.discovery.honor_ignore_files)
618        .parents(config.discovery.honor_ignore_files)
619        .git_ignore(config.discovery.honor_ignore_files)
620        .git_global(config.discovery.honor_ignore_files)
621        .git_exclude(config.discovery.honor_ignore_files);
622
623    let paths = collect_walk_paths(&builder, seen_paths, warnings);
624    if paths.is_empty() {
625        return Ok(());
626    }
627
628    if let Some(p) = progress {
629        p.files_total.fetch_add(paths.len(), Ordering::Relaxed);
630    }
631
632    let chunk_results = run_parallel_analysis(
633        &paths,
634        root,
635        config,
636        include_globs,
637        exclude_globs,
638        enabled_languages,
639        cancel,
640        progress,
641    )?;
642    merge_chunk_results(chunk_results, analyzed, skipped, warnings)
643}
644
645fn collect_walk_paths(
646    builder: &WalkBuilder,
647    seen_paths: &mut HashSet<PathBuf>,
648    warnings: &mut Vec<String>,
649) -> Vec<PathBuf> {
650    // build_parallel() walks the directory tree across multiple threads (work-stealing
651    // internally), which is meaningfully faster for deeply nested repos with many directories.
652    // We collect results via an MPSC channel so each walker thread sends without contention.
653    let (tx, rx) = std::sync::mpsc::channel::<std::result::Result<PathBuf, String>>();
654
655    builder.build_parallel().run(|| {
656        let tx = tx.clone();
657        Box::new(move |entry| {
658            match entry {
659                Err(e) => {
660                    let _ = tx.send(Err(format!("discovery warning: {e}")));
661                }
662                Ok(e) => {
663                    let path = e.into_path();
664                    if !path.is_dir() {
665                        let _ = tx.send(Ok(path));
666                    }
667                }
668            }
669            ignore::WalkState::Continue
670        })
671    });
672
673    // Drop the sender that the outer scope holds; the per-thread clones were dropped when
674    // run() returned (all threads finished). Dropping this last sender closes the channel.
675    drop(tx);
676
677    rx.into_iter()
678        .filter_map(|msg| match msg {
679            Ok(path) => {
680                if seen_paths.insert(path.clone()) {
681                    Some(path)
682                } else {
683                    None
684                }
685            }
686            Err(warn) => {
687                warnings.push(warn);
688                None
689            }
690        })
691        .collect()
692}
693
694/// Inner work loop executed by each analysis thread.
695#[allow(clippy::too_many_arguments)]
696fn worker_loop(
697    paths: &[PathBuf],
698    root: &Path,
699    config: &AppConfig,
700    include_globs: Option<&GlobSet>,
701    exclude_globs: Option<&GlobSet>,
702    enabled_languages: Option<&BTreeSet<Language>>,
703    cancel: Option<&AtomicBool>,
704    next_index: &AtomicUsize,
705    files_done: Option<&AtomicUsize>,
706) -> Vec<Result<Option<FileRecord>>> {
707    let mut results = Vec::new();
708    loop {
709        if cancel.is_some_and(|c| c.load(Ordering::Relaxed)) {
710            results.push(Err(anyhow::anyhow!("analysis cancelled")));
711            break;
712        }
713        let i = next_index.fetch_add(1, Ordering::Relaxed);
714        if i >= paths.len() {
715            break;
716        }
717        results.push(analyze_candidate_file(
718            &paths[i],
719            root,
720            config,
721            include_globs,
722            exclude_globs,
723            enabled_languages,
724        ));
725        if let Some(fd) = files_done {
726            fd.fetch_add(1, Ordering::Relaxed);
727        }
728    }
729    results
730}
731
732#[allow(clippy::too_many_arguments)]
733fn run_parallel_analysis(
734    paths: &[PathBuf],
735    root: &Path,
736    config: &AppConfig,
737    include_globs: Option<&GlobSet>,
738    exclude_globs: Option<&GlobSet>,
739    enabled_languages: Option<&BTreeSet<Language>>,
740    cancel: Option<&AtomicBool>,
741    progress: Option<&ProgressCounters>,
742) -> Result<Vec<Vec<Result<Option<FileRecord>>>>> {
743    let thread_count = std::thread::available_parallelism().map_or(DEFAULT_ANALYSIS_THREADS, |n| {
744        n.get().min(MAX_ANALYSIS_THREADS)
745    });
746    // Shared work-queue index: each thread atomically claims the next path to process.
747    // This eliminates static-chunk load imbalance — threads that finish early immediately
748    // pick up more work instead of sitting idle while one overloaded chunk finishes.
749    let next_index = AtomicUsize::new(0);
750    let files_done: Option<&AtomicUsize> = progress.map(|p| p.files_done.as_ref());
751
752    std::thread::scope(|s| -> Result<Vec<Vec<Result<Option<FileRecord>>>>> {
753        // IMPORTANT: collect ALL handles before joining any of them.
754        // A lazy spawn-then-join chain would serialize threads one at a time.
755        let mut handles = Vec::with_capacity(thread_count);
756        for _ in 0..thread_count {
757            handles.push(s.spawn(|| {
758                worker_loop(
759                    paths,
760                    root,
761                    config,
762                    include_globs,
763                    exclude_globs,
764                    enabled_languages,
765                    cancel,
766                    &next_index,
767                    files_done,
768                )
769            }));
770        }
771        handles
772            .into_iter()
773            .map(|h| {
774                h.join()
775                    .map_err(|_| anyhow::anyhow!("analysis thread panicked"))
776            })
777            .collect()
778    })
779}
780
781fn merge_chunk_results(
782    chunk_results: Vec<Vec<Result<Option<FileRecord>>>>,
783    analyzed: &mut Vec<FileRecord>,
784    skipped: &mut Vec<FileRecord>,
785    warnings: &mut Vec<String>,
786) -> Result<()> {
787    for chunk in chunk_results {
788        for result in chunk {
789            if let Some(record) = result? {
790                push_record(record, analyzed, skipped, warnings);
791            }
792        }
793    }
794    Ok(())
795}
796
797/// Label each analyzed file with its submodule and build per-submodule summaries.
798fn process_submodules(config: &AppConfig, analyzed: &mut [FileRecord]) -> Vec<SubmoduleSummary> {
799    let root = config.discovery.root_paths[0]
800        .canonicalize()
801        .unwrap_or_else(|_| config.discovery.root_paths[0].clone());
802    let submodules = detect_submodules(&root);
803    if submodules.is_empty() {
804        return Vec::new();
805    }
806
807    for file in analyzed.iter_mut() {
808        for (name, sub_path) in &submodules {
809            let prefix = sub_path.to_string_lossy().replace('\\', "/");
810            let rel = &file.relative_path;
811            if rel == &prefix || rel.starts_with(&format!("{prefix}/")) {
812                file.submodule = Some(name.clone());
813                break;
814            }
815        }
816    }
817
818    build_submodule_summaries(analyzed, &submodules)
819}
820
821/// Assemble the final `AnalysisRun` from collected records and metadata.
822fn assemble_run(
823    config: &AppConfig,
824    runtime_mode: &str,
825    analyzed: Vec<FileRecord>,
826    skipped: Vec<FileRecord>,
827    warnings: Vec<String>,
828    submodule_summaries: Vec<SubmoduleSummary>,
829) -> AnalysisRun {
830    let summary = build_summary(&analyzed, &skipped);
831    let language_summaries = build_language_summaries(&analyzed);
832
833    let first_root = config
834        .discovery
835        .root_paths
836        .first()
837        .map(|p| p.canonicalize().unwrap_or_else(|_| p.clone()));
838    let git = first_root
839        .as_deref()
840        .map(detect_git_for_run)
841        .unwrap_or_default();
842
843    let now = Utc::now();
844    let run_id = {
845        let uuid_suffix = Uuid::new_v4().simple().to_string();
846        format!("{}-{}", now.format("%Y%m%d-%H%M"), uuid_suffix)
847    };
848
849    AnalysisRun {
850        tool: ToolMetadata {
851            name: "sloc".into(),
852            version: env!("CARGO_PKG_VERSION").into(),
853            run_id,
854            timestamp_utc: now,
855        },
856        environment: EnvironmentMetadata {
857            operating_system: std::env::consts::OS.into(),
858            architecture: std::env::consts::ARCH.into(),
859            runtime_mode: runtime_mode.into(),
860            initiator_username: get_current_username(),
861            initiator_hostname: get_hostname(),
862            ci_name: detect_ci_system().map(str::to_string),
863        },
864        effective_configuration: config.clone(),
865        input_roots: config
866            .discovery
867            .root_paths
868            .iter()
869            .map(|p| path_to_string(p))
870            .collect(),
871        summary_totals: summary,
872        totals_by_language: language_summaries,
873        per_file_records: analyzed,
874        skipped_file_records: skipped,
875        warnings,
876        submodule_summaries,
877        git_commit_short: git.commit_short,
878        git_commit_long: git.commit_long,
879        git_branch: git.branch,
880        git_commit_author: git.author,
881        git_tags: git.tags,
882        git_nearest_tag: git.nearest_tag,
883        git_commit_date: git.commit_date,
884        git_remote_url: git.remote_url,
885    }
886}
887
888/// # Errors
889///
890/// Returns an error if the config is invalid, root paths cannot be walked, or any file
891/// analysis step fails in a way that cannot be recovered from.
892#[allow(clippy::too_many_lines)]
893pub fn analyze(
894    config: &AppConfig,
895    runtime_mode: &str,
896    cancel: Option<&AtomicBool>,
897    progress: Option<&ProgressCounters>,
898) -> Result<AnalysisRun> {
899    config.validate()?;
900
901    if config.discovery.root_paths.is_empty() {
902        anyhow::bail!("no input paths were provided");
903    }
904
905    let include_globs = compile_globset(&config.discovery.include_globs)?;
906    let exclude_globs = compile_globset(&config.discovery.exclude_globs)?;
907    let enabled_languages = parse_enabled_languages(&config.analysis.enabled_languages)?;
908
909    let mut analyzed = Vec::new();
910    let mut skipped = Vec::new();
911    let mut warnings = Vec::new();
912    let mut seen_paths = HashSet::new();
913
914    for root in &config.discovery.root_paths {
915        if cancel.is_some_and(|c| c.load(Ordering::Relaxed)) {
916            anyhow::bail!("analysis cancelled");
917        }
918
919        let root = root.canonicalize().unwrap_or_else(|_| root.clone());
920
921        if root.is_file() {
922            if let Some(record) = analyze_candidate_file(
923                &root,
924                root.parent().unwrap_or_else(|| Path::new(".")),
925                config,
926                include_globs.as_ref(),
927                exclude_globs.as_ref(),
928                enabled_languages.as_ref(),
929            )? {
930                push_record(record, &mut analyzed, &mut skipped, &mut warnings);
931            }
932            continue;
933        }
934
935        walk_root(
936            &root,
937            config,
938            include_globs.as_ref(),
939            exclude_globs.as_ref(),
940            enabled_languages.as_ref(),
941            &mut seen_paths,
942            &mut analyzed,
943            &mut skipped,
944            &mut warnings,
945            cancel,
946            progress,
947        )?;
948    }
949
950    analyzed.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
951    skipped.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
952
953    // Submodule detection: label each file with its submodule and build per-submodule summaries.
954    let submodule_summaries = if config.discovery.submodule_breakdown {
955        process_submodules(config, &mut analyzed)
956    } else {
957        Vec::new()
958    };
959
960    attach_coverage(config, &mut analyzed, &mut warnings);
961
962    Ok(assemble_run(
963        config,
964        runtime_mode,
965        analyzed,
966        skipped,
967        warnings,
968        submodule_summaries,
969    ))
970}
971
972fn attach_coverage(config: &AppConfig, analyzed: &mut [FileRecord], warnings: &mut Vec<String>) {
973    let Some(cov_path) = coverage::resolve_coverage_file(config.analysis.coverage_file.as_deref())
974    else {
975        return;
976    };
977    tracing::debug!(path = %cov_path.display(), "loading coverage file");
978    match fs::read_to_string(&cov_path) {
979        Ok(content) => {
980            let cov_map = coverage::parse_coverage_auto(&cov_path, &content);
981            let mut matched: u32 = 0;
982            let mut unmatched: u32 = 0;
983            for record in analyzed.iter_mut() {
984                record.coverage =
985                    coverage::lookup_coverage(&cov_map, &record.relative_path).cloned();
986                if record.coverage.is_some() {
987                    matched += 1;
988                } else {
989                    unmatched += 1;
990                }
991            }
992            tracing::debug!(
993                path = %cov_path.display(),
994                coverage_entries = cov_map.len(),
995                files_matched = matched,
996                files_unmatched = unmatched,
997                "coverage attached"
998            );
999            if unmatched > 0 && matched == 0 {
1000                tracing::warn!(
1001                    path = %cov_path.display(),
1002                    "coverage file loaded but no source files could be matched — check that paths in the coverage report match the scanned directory"
1003                );
1004            }
1005        }
1006        Err(e) => {
1007            tracing::warn!(path = %cov_path.display(), error = %e, "coverage file could not be read");
1008            warnings.push(format!(
1009                "coverage file '{}' could not be read: {e}",
1010                cov_path.display()
1011            ));
1012        }
1013    }
1014}
1015
1016fn push_record(
1017    record: FileRecord,
1018    analyzed: &mut Vec<FileRecord>,
1019    skipped: &mut Vec<FileRecord>,
1020    warnings: &mut Vec<String>,
1021) {
1022    warnings.extend(
1023        record
1024            .warnings
1025            .iter()
1026            .map(|warning| format!("{}: {warning}", record.relative_path)),
1027    );
1028
1029    match record.status {
1030        FileStatus::AnalyzedExact | FileStatus::AnalyzedBestEffort => analyzed.push(record),
1031        _ => skipped.push(record),
1032    }
1033}
1034
1035/// Convenience wrapper: build a boxed `Skip` outcome with a single-item warning message.
1036#[inline]
1037fn skip_with_reason(
1038    path: &Path,
1039    root: &Path,
1040    size: u64,
1041    reason: impl Into<String>,
1042) -> MetadataPolicyOutcome {
1043    MetadataPolicyOutcome::Skip(Box::new(skipped_record(
1044        path,
1045        root,
1046        size,
1047        FileStatus::SkippedByPolicy,
1048        vec![reason.into()],
1049    )))
1050}
1051
1052/// Apply metadata-level policy checks (symlink, name, dir exclusion, size, globs, lockfile).
1053/// Returns `Skip(record)` to skip, `Exclude` to omit from output entirely (include-glob miss),
1054/// or `Continue` to proceed to content checks.
1055#[allow(clippy::too_many_arguments)]
1056fn check_metadata_policy(
1057    path: &Path,
1058    root: &Path,
1059    relative_path: &str,
1060    metadata: &fs::Metadata,
1061    config: &AppConfig,
1062    include_globs: Option<&GlobSet>,
1063    exclude_globs: Option<&GlobSet>,
1064) -> MetadataPolicyOutcome {
1065    let size = metadata.len();
1066
1067    if metadata.file_type().is_symlink() && !config.discovery.follow_symlinks {
1068        return skip_with_reason(path, root, size, "symlink skipped by policy");
1069    }
1070    if file_name_eq(path, ".gitignore") {
1071        return skip_with_reason(path, root, size, ".gitignore is always excluded");
1072    }
1073    if is_excluded_dir_path(path, &config.discovery.excluded_directories) {
1074        return skip_with_reason(path, root, size, "path matched excluded directory setting");
1075    }
1076    if size > config.discovery.max_file_size_bytes {
1077        return skip_with_reason(
1078            path,
1079            root,
1080            size,
1081            format!(
1082                "file exceeded max_file_size_bytes ({})",
1083                config.discovery.max_file_size_bytes
1084            ),
1085        );
1086    }
1087    if let Some(globs) = include_globs {
1088        if !globs.is_match(Path::new(relative_path)) && !globs.is_match(path) {
1089            return MetadataPolicyOutcome::Exclude;
1090        }
1091    }
1092    if let Some(globs) = exclude_globs {
1093        if globs.is_match(Path::new(relative_path)) || globs.is_match(path) {
1094            return skip_with_reason(path, root, size, "path matched exclude glob");
1095        }
1096    }
1097    if is_known_lockfile(path) && !config.analysis.include_lockfiles {
1098        return skip_with_reason(path, root, size, "lockfile skipped by default policy");
1099    }
1100
1101    MetadataPolicyOutcome::Continue
1102}
1103
/// Outcome of the content-level policy checks performed by `check_content_policy`.
struct ContentPolicyResult {
    /// Whether `is_vendor_path` matched the file's path.
    vendor: bool,
    /// Whether generated-file detection flagged the file (stays `false` when an earlier
    /// check already decided to skip it).
    generated: bool,
    /// Whether minified-file detection flagged the file (stays `false` when an earlier
    /// check already decided to skip it).
    minified: bool,
    /// Pre-built skip record; `Some` when the file should be skipped.
    skip_record: Option<FileRecord>,
}
1110
1111/// Apply content-level policy checks (vendor, generated, minified).
1112/// `skip_record` is `Some` when the file should be skipped.
1113fn check_content_policy(
1114    path: &Path,
1115    root: &Path,
1116    size_bytes: u64,
1117    bytes: &[u8],
1118    config: &AppConfig,
1119) -> ContentPolicyResult {
1120    let vendor = is_vendor_path(path);
1121    if vendor && config.analysis.vendor_directory_detection {
1122        return ContentPolicyResult {
1123            vendor,
1124            generated: false,
1125            minified: false,
1126            skip_record: Some(skipped_record(
1127                path,
1128                root,
1129                size_bytes,
1130                FileStatus::SkippedByPolicy,
1131                vec!["vendor file skipped by policy".into()],
1132            )),
1133        };
1134    }
1135
1136    let generated = config.analysis.generated_file_detection && looks_generated(path, bytes);
1137    if generated {
1138        return ContentPolicyResult {
1139            vendor,
1140            generated,
1141            minified: false,
1142            skip_record: Some(skipped_record(
1143                path,
1144                root,
1145                size_bytes,
1146                FileStatus::SkippedByPolicy,
1147                vec!["generated file skipped by policy".into()],
1148            )),
1149        };
1150    }
1151
1152    let minified = config.analysis.minified_file_detection && looks_minified(path, bytes);
1153    if minified {
1154        return ContentPolicyResult {
1155            vendor,
1156            generated,
1157            minified,
1158            skip_record: Some(skipped_record(
1159                path,
1160                root,
1161                size_bytes,
1162                FileStatus::SkippedByPolicy,
1163                vec!["minified file skipped by policy".into()],
1164            )),
1165        };
1166    }
1167
1168    ContentPolicyResult {
1169        vendor,
1170        generated,
1171        minified,
1172        skip_record: None,
1173    }
1174}
1175
1176/// Decode file bytes to a UTF-8 string, handling binary detection and decode failures.
1177fn decode_file_contents(
1178    path: &Path,
1179    root: &Path,
1180    size_bytes: u64,
1181    bytes: &[u8],
1182    config: &AppConfig,
1183) -> Result<Option<(String, String, Vec<String>)>> {
1184    if is_binary(bytes) {
1185        return match config.analysis.binary_file_behavior {
1186            BinaryFileBehavior::Skip => Ok(None),
1187            BinaryFileBehavior::Fail => {
1188                anyhow::bail!("binary file encountered: {}", path.display())
1189            }
1190        };
1191    }
1192
1193    match decode_bytes(bytes) {
1194        Ok(result) => Ok(Some(result)),
1195        Err(err) => match config.analysis.decode_failure_behavior {
1196            FailureBehavior::WarnSkip => {
1197                // Caller will handle the None as a SkippedDecodeError record.
1198                // We use a sentinel: return Ok(None) but encode the error into a field.
1199                // Instead, propagate as a skipped record via the caller.
1200                let _ = (path, root, size_bytes); // suppress unused warnings
1201                Err(anyhow::anyhow!("__decode_warn__: {err}"))
1202            }
1203            FailureBehavior::Fail => {
1204                anyhow::bail!("decode failure for {}: {err}", path.display())
1205            }
1206        },
1207    }
1208}
1209
/// Analyze a single candidate file and produce its `FileRecord`.
///
/// The file passes through a fixed sequence of gates; the first one that fires decides the
/// result: metadata read, metadata policy, file read, content policy, binary/decode
/// handling, language detection, enabled-language filter, and finally line analysis.
///
/// Returns `Ok(None)` only when the metadata policy yields `Exclude`; every other outcome
/// is `Ok(Some(record))`, either an analyzed record or a skipped/error record describing
/// why the file was not analyzed.
///
/// # Errors
///
/// Propagates errors from `decode_file_contents` unless their message carries the
/// `__decode_warn__: ` sentinel, in which case a `SkippedDecodeError` record is returned.
#[allow(clippy::too_many_lines)]
fn analyze_candidate_file(
    path: &Path,
    root: &Path,
    config: &AppConfig,
    include_globs: Option<&GlobSet>,
    exclude_globs: Option<&GlobSet>,
    enabled_languages: Option<&BTreeSet<Language>>,
) -> Result<Option<FileRecord>> {
    // `symlink_metadata` does not follow symlinks, so the symlink policy check below can
    // see the link itself.
    let metadata = match fs::symlink_metadata(path) {
        Ok(metadata) => metadata,
        Err(err) => {
            // Unreadable metadata is reported as an internal error record with size 0.
            return Ok(Some(skipped_record(
                path,
                root,
                0,
                FileStatus::ErrorInternal,
                vec![format!("failed to read metadata: {err}")],
            )));
        }
    };

    let relative_path = relative_path_string(path, root);

    // Metadata-level policy checks.
    match check_metadata_policy(
        path,
        root,
        &relative_path,
        &metadata,
        config,
        include_globs,
        exclude_globs,
    ) {
        MetadataPolicyOutcome::Skip(record) => return Ok(Some(*record)),
        MetadataPolicyOutcome::Exclude => return Ok(None),
        MetadataPolicyOutcome::Continue => {}
    }

    // The whole file is read only after the metadata checks (including the size limit) pass.
    let bytes = match fs::read(path) {
        Ok(bytes) => bytes,
        Err(err) => {
            return Ok(Some(skipped_record(
                path,
                root,
                metadata.len(),
                FileStatus::ErrorInternal,
                vec![format!("failed to read file: {err}")],
            )));
        }
    };

    // Content-level policy checks (vendor, generated, minified).
    let content_policy = check_content_policy(path, root, metadata.len(), &bytes, config);
    if let Some(record) = content_policy.skip_record {
        return Ok(Some(record));
    }
    // Flags of a file that was not skipped; they are copied onto the final record.
    let (vendor, generated, minified) = (
        content_policy.vendor,
        content_policy.generated,
        content_policy.minified,
    );

    // Decode content, handling binary and decode failures.
    let (text, encoding, decode_warnings) =
        match decode_file_contents(path, root, metadata.len(), &bytes, config) {
            Ok(Some(result)) => result,
            // `Ok(None)` means a binary file under the skip behaviour.
            Ok(None) => {
                return Ok(Some(skipped_record(
                    path,
                    root,
                    metadata.len(),
                    FileStatus::SkippedBinary,
                    vec!["binary file skipped by default".into()],
                )));
            }
            Err(err) => {
                // A sentinel-prefixed message marks a warn-and-skip decode failure; any
                // other error is fatal and propagated.
                let msg = err.to_string();
                if let Some(warn_msg) = msg.strip_prefix("__decode_warn__: ") {
                    return Ok(Some(skipped_record(
                        path,
                        root,
                        metadata.len(),
                        FileStatus::SkippedDecodeError,
                        vec![warn_msg.to_string()],
                    )));
                }
                return Err(err);
            }
        };

    // The first line is handed to language detection together with the shebang setting.
    let first_line = text.lines().next();
    let language = detect_language(
        path,
        first_line,
        &config.analysis.extension_overrides,
        config.analysis.shebang_detection,
    );

    let Some(language) = language else {
        return Ok(Some(skipped_record(
            path,
            root,
            metadata.len(),
            FileStatus::SkippedUnsupported,
            vec!["unsupported or undetected language".into()],
        )));
    };

    // `None` means no language filter is configured; otherwise only listed languages pass.
    if let Some(enabled) = enabled_languages {
        if !enabled.contains(&language) {
            return Ok(Some(skipped_record(
                path,
                root,
                metadata.len(),
                FileStatus::SkippedByPolicy,
                vec![format!(
                    "language {} disabled by configuration",
                    language.display_name()
                )],
            )));
        }
    }

    // Translate the configured policies into the analyzer's boolean options.
    let ieee_opts = AnalysisOptions {
        blank_in_block_comment_as_comment: config.analysis.blank_in_block_comment_policy
            == BlankInBlockCommentPolicy::CountAsComment,
        collapse_continuation_lines: config.analysis.continuation_line_policy
            == ContinuationLinePolicy::CollapseToLogical,
    };
    let analysis = analyze_text(language, &text, ieee_opts);
    let effective_counts = compute_effective_counts(
        &analysis.raw,
        config.analysis.mixed_line_policy,
        config.analysis.python_docstrings_as_comments,
        config.analysis.count_compiler_directives,
    );

    // Decode warnings come first, followed by the analyzer's warnings.
    let mut warnings = decode_warnings;
    warnings.extend(analysis.warnings.clone());

    Ok(Some(FileRecord {
        path: path_to_string(path),
        relative_path,
        language: Some(language),
        size_bytes: metadata.len(),
        detected_encoding: Some(encoding),
        raw_line_categories: analysis.raw,
        effective_counts,
        // Only the best-effort lexical mode downgrades the status from exact.
        status: match analysis.parse_mode {
            ParseMode::Lexical | ParseMode::TreeSitter => FileStatus::AnalyzedExact,
            ParseMode::LexicalBestEffort => FileStatus::AnalyzedBestEffort,
        },
        warnings,
        generated,
        minified,
        vendor,
        parse_mode: Some(analysis.parse_mode),
        // Submodule and coverage are left unset here.
        submodule: None,
        coverage: None,
    }))
}
1372
1373const fn compute_effective_counts(
1374    raw: &RawLineCounts,
1375    mixed_line_policy: MixedLinePolicy,
1376    python_docstrings_as_comments: bool,
1377    count_compiler_directives: bool,
1378) -> EffectiveCounts {
1379    let mut effective = EffectiveCounts {
1380        code_lines: raw.code_only_lines,
1381        comment_lines: raw.single_comment_only_lines + raw.multi_comment_only_lines,
1382        blank_lines: raw.blank_only_lines,
1383        mixed_lines_separate: 0,
1384    };
1385
1386    if python_docstrings_as_comments {
1387        effective.comment_lines += raw.docstring_comment_lines;
1388    } else {
1389        effective.code_lines += raw.docstring_comment_lines;
1390    }
1391
1392    let mixed_total = raw.mixed_code_single_comment_lines + raw.mixed_code_multi_comment_lines;
1393    match mixed_line_policy {
1394        MixedLinePolicy::CodeOnly => effective.code_lines += mixed_total,
1395        MixedLinePolicy::CodeAndComment => {
1396            effective.code_lines += mixed_total;
1397            effective.comment_lines += mixed_total;
1398        }
1399        MixedLinePolicy::CommentOnly => effective.comment_lines += mixed_total,
1400        MixedLinePolicy::SeparateMixedCategory => effective.mixed_lines_separate += mixed_total,
1401    }
1402
1403    // IEEE 1045-1992 §4.2: optionally exclude preprocessor/compiler directives from code SLOC.
1404    // compiler_directive_lines is a subset of code_only_lines, so subtract it directly.
1405    if !count_compiler_directives {
1406        effective.code_lines = effective
1407            .code_lines
1408            .saturating_sub(raw.compiler_directive_lines);
1409    }
1410
1411    effective
1412}
1413
1414fn build_summary(analyzed: &[FileRecord], skipped: &[FileRecord]) -> SummaryTotals {
1415    let mut summary = SummaryTotals {
1416        files_considered: (analyzed.len() + skipped.len()) as u64,
1417        files_analyzed: analyzed.len() as u64,
1418        files_skipped: skipped.len() as u64,
1419        ..Default::default()
1420    };
1421
1422    for record in analyzed {
1423        summary.total_physical_lines += record.raw_line_categories.total_physical_lines;
1424        summary.code_lines += record.effective_counts.code_lines;
1425        summary.comment_lines += record.effective_counts.comment_lines;
1426        summary.blank_lines += record.effective_counts.blank_lines;
1427        summary.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
1428        summary.functions += record.raw_line_categories.functions;
1429        summary.classes += record.raw_line_categories.classes;
1430        summary.variables += record.raw_line_categories.variables;
1431        summary.imports += record.raw_line_categories.imports;
1432        summary.test_count += record.raw_line_categories.test_count;
1433        summary.test_assertion_count += record.raw_line_categories.test_assertion_count;
1434        summary.test_suite_count += record.raw_line_categories.test_suite_count;
1435        if let Some(cov) = &record.coverage {
1436            summary.coverage_lines_found += u64::from(cov.lines_found);
1437            summary.coverage_lines_hit += u64::from(cov.lines_hit);
1438            summary.coverage_functions_found += u64::from(cov.functions_found);
1439            summary.coverage_functions_hit += u64::from(cov.functions_hit);
1440            summary.coverage_branches_found += u64::from(cov.branches_found);
1441            summary.coverage_branches_hit += u64::from(cov.branches_hit);
1442        }
1443    }
1444
1445    summary
1446}
1447
1448/// Construct a zero-filled `LanguageSummary` for the given language.
1449const fn zeroed_summary(language: Language) -> LanguageSummary {
1450    LanguageSummary {
1451        language,
1452        files: 0,
1453        total_physical_lines: 0,
1454        code_lines: 0,
1455        comment_lines: 0,
1456        blank_lines: 0,
1457        mixed_lines_separate: 0,
1458        functions: 0,
1459        classes: 0,
1460        variables: 0,
1461        imports: 0,
1462        test_count: 0,
1463        test_assertion_count: 0,
1464        test_suite_count: 0,
1465        coverage_lines_found: 0,
1466        coverage_lines_hit: 0,
1467        coverage_functions_found: 0,
1468        coverage_functions_hit: 0,
1469        coverage_branches_found: 0,
1470        coverage_branches_hit: 0,
1471    }
1472}
1473
1474/// Accumulate all per-file counters from `record` into an existing `LanguageSummary`.
1475fn accumulate_record_into_summary(entry: &mut LanguageSummary, record: &FileRecord) {
1476    entry.files += 1;
1477    let r = &record.raw_line_categories;
1478    entry.total_physical_lines += r.total_physical_lines;
1479    entry.code_lines += record.effective_counts.code_lines;
1480    entry.comment_lines += record.effective_counts.comment_lines;
1481    entry.blank_lines += record.effective_counts.blank_lines;
1482    entry.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
1483    entry.functions += r.functions;
1484    entry.classes += r.classes;
1485    entry.variables += r.variables;
1486    entry.imports += r.imports;
1487    entry.test_count += r.test_count;
1488    entry.test_assertion_count += r.test_assertion_count;
1489    entry.test_suite_count += r.test_suite_count;
1490    if let Some(cov) = &record.coverage {
1491        entry.coverage_lines_found += u64::from(cov.lines_found);
1492        entry.coverage_lines_hit += u64::from(cov.lines_hit);
1493        entry.coverage_functions_found += u64::from(cov.functions_found);
1494        entry.coverage_functions_hit += u64::from(cov.functions_hit);
1495        entry.coverage_branches_found += u64::from(cov.branches_found);
1496        entry.coverage_branches_hit += u64::from(cov.branches_hit);
1497    }
1498}
1499
1500fn build_language_summaries(analyzed: &[FileRecord]) -> Vec<LanguageSummary> {
1501    let mut by_language: BTreeMap<Language, LanguageSummary> = BTreeMap::new();
1502    for record in analyzed {
1503        let Some(language) = record.language else {
1504            continue;
1505        };
1506        let entry = by_language
1507            .entry(language)
1508            .or_insert_with(|| zeroed_summary(language));
1509        accumulate_record_into_summary(entry, record);
1510    }
1511    by_language.into_values().collect()
1512}
1513
1514fn skipped_record(
1515    path: &Path,
1516    root: &Path,
1517    size_bytes: u64,
1518    status: FileStatus,
1519    warnings: Vec<String>,
1520) -> FileRecord {
1521    FileRecord {
1522        path: path_to_string(path),
1523        relative_path: relative_path_string(path, root),
1524        language: None,
1525        size_bytes,
1526        detected_encoding: None,
1527        raw_line_categories: RawLineCounts::default(),
1528        effective_counts: EffectiveCounts::default(),
1529        status,
1530        warnings,
1531        generated: false,
1532        minified: false,
1533        vendor: false,
1534        parse_mode: None,
1535        submodule: None,
1536        coverage: None,
1537    }
1538}
1539
/// Render `path` relative to `root` with forward slashes; when `path` does not start with
/// `root`, the full path is rendered instead.
fn relative_path_string(path: &Path, root: &Path) -> String {
    let shown = match path.strip_prefix(root) {
        Ok(relative) => relative,
        Err(_) => path,
    };
    let lossy = shown.to_string_lossy();
    lossy.replace('\\', "/")
}
1546
/// Render `path` as a (lossy) string with every backslash replaced by a forward slash.
fn path_to_string(path: &Path) -> String {
    let lossy = path.to_string_lossy();
    lossy.replace('\\', "/")
}
1550
/// Parse `.gitmodules` in `root` and return `(name, relative_path)` for each submodule found.
///
/// Returns an empty list when the file is missing or cannot be read. A section yields an
/// entry only once both its `[submodule "<name>"]` header and a `path = <value>` line have
/// been seen; entries are returned in file order.
///
/// Malformed headers (for example `[submodule "]`, where the opening and closing quote are
/// the same character) are ignored rather than causing a panic, and only the exact `path`
/// key is honoured — keys that merely start with `path` are skipped.
#[must_use]
pub fn detect_submodules(root: &Path) -> Vec<(String, PathBuf)> {
    let gitmodules = root.join(".gitmodules");
    if !gitmodules.is_file() {
        return Vec::new();
    }
    let Ok(content) = fs::read_to_string(&gitmodules) else {
        return Vec::new();
    };

    let mut result = Vec::new();
    let mut current_name: Option<String> = None;
    let mut current_path: Option<PathBuf> = None;

    for line in content.lines() {
        let trimmed = line.trim();

        // Stripping prefix then suffix (instead of slicing by length) cannot overlap, so a
        // header too short to hold both quotes is simply not recognised.
        let header = trimmed
            .strip_prefix("[submodule \"")
            .and_then(|rest| rest.strip_suffix("\"]"));
        if let Some(section_name) = header {
            // A new section starts: flush the previous one if it was complete.
            if let (Some(name), Some(path)) = (current_name.take(), current_path.take()) {
                result.push((name, path));
            }
            current_name = Some(section_name.to_string());
            continue;
        }

        // Accept `path=...` / `path = ...` only; the key must be followed directly by `=`
        // (optionally after whitespace).
        let value = trimmed
            .strip_prefix("path")
            .and_then(|rest| rest.trim_start().strip_prefix('='));
        if let Some(value) = value {
            current_path = Some(PathBuf::from(value.trim()));
        }
    }

    // Flush the final section.
    if let (Some(name), Some(path)) = (current_name, current_path) {
        result.push((name, path));
    }

    result
}
1587
1588fn build_submodule_summaries(
1589    analyzed: &[FileRecord],
1590    submodules: &[(String, PathBuf)],
1591) -> Vec<SubmoduleSummary> {
1592    submodules
1593        .iter()
1594        .map(|(name, path)| {
1595            let files: Vec<&FileRecord> = analyzed
1596                .iter()
1597                .filter(|f| f.submodule.as_deref() == Some(name.as_str()))
1598                .collect();
1599
1600            let files_analyzed = files.len() as u64;
1601            let total_physical_lines = files
1602                .iter()
1603                .map(|f| f.raw_line_categories.total_physical_lines)
1604                .sum();
1605            let code_lines = files.iter().map(|f| f.effective_counts.code_lines).sum();
1606            let comment_lines = files.iter().map(|f| f.effective_counts.comment_lines).sum();
1607            let blank_lines = files.iter().map(|f| f.effective_counts.blank_lines).sum();
1608            let language_summaries = build_language_summaries_from_slice(&files);
1609
1610            SubmoduleSummary {
1611                name: name.clone(),
1612                relative_path: path.to_string_lossy().replace('\\', "/"),
1613                files_analyzed,
1614                total_physical_lines,
1615                code_lines,
1616                comment_lines,
1617                blank_lines,
1618                language_summaries,
1619            }
1620        })
1621        .filter(|s| s.files_analyzed > 0)
1622        .collect()
1623}
1624
1625fn build_language_summaries_from_slice(files: &[&FileRecord]) -> Vec<LanguageSummary> {
1626    let mut map: BTreeMap<String, LanguageSummary> = BTreeMap::new();
1627    for file in files {
1628        let Some(lang) = file.language else { continue };
1629        let entry = map
1630            .entry(lang.display_name().to_string())
1631            .or_insert_with(|| zeroed_summary(lang));
1632        accumulate_record_into_summary(entry, file);
1633    }
1634    map.into_values().collect()
1635}
1636
/// Whether the final component of `path` is valid UTF-8 and equals `expected` exactly.
fn file_name_eq(path: &Path, expected: &str) -> bool {
    let name = path.file_name().and_then(|raw| raw.to_str());
    name == Some(expected)
}
1642
/// Whether any UTF-8 component of `path` exactly equals one of `excluded_dirs`.
fn is_excluded_dir_path(path: &Path, excluded_dirs: &[String]) -> bool {
    path.components()
        .filter_map(|component| component.as_os_str().to_str())
        .any(|part| excluded_dirs.iter().any(|excluded| excluded == part))
}
1651
/// Whether any UTF-8 component of `path` is named `vendor`, `node_modules` or `packages`.
fn is_vendor_path(path: &Path) -> bool {
    const VENDOR_DIR_NAMES: [&str; 3] = ["vendor", "node_modules", "packages"];

    path.components()
        .filter_map(|component| component.as_os_str().to_str())
        .any(|part| VENDOR_DIR_NAMES.contains(&part))
}
1660
/// Whether the file name of `path` is one of the recognised dependency lockfiles
/// (exact, case-sensitive match).
fn is_known_lockfile(path: &Path) -> bool {
    const LOCKFILE_NAMES: [&str; 7] = [
        "Cargo.lock",
        "package-lock.json",
        "yarn.lock",
        "pnpm-lock.yaml",
        "Pipfile.lock",
        "poetry.lock",
        "composer.lock",
    ];

    match path.file_name().and_then(|raw| raw.to_str()) {
        Some(name) => LOCKFILE_NAMES.contains(&name),
        None => false,
    }
}
1677
1678fn looks_generated(path: &Path, bytes: &[u8]) -> bool {
1679    let file_name = path
1680        .file_name()
1681        .and_then(|name| name.to_str())
1682        .unwrap_or_default();
1683    if file_name.contains(".generated.") || file_name.contains(".g.") {
1684        return true;
1685    }
1686
1687    let sample = String::from_utf8_lossy(&bytes[..bytes.len().min(GENERATED_SAMPLE_BYTES)])
1688        .to_ascii_lowercase();
1689    sample.contains("@generated") || sample.contains("generated by")
1690}
1691
1692fn looks_minified(path: &Path, bytes: &[u8]) -> bool {
1693    let file_name = path
1694        .file_name()
1695        .and_then(|name| name.to_str())
1696        .unwrap_or_default();
1697    if file_name.contains(".min.") {
1698        return true;
1699    }
1700
1701    let sample = String::from_utf8_lossy(&bytes[..bytes.len().min(MINIFIED_SAMPLE_BYTES)]);
1702    let longest_line = sample.lines().map(str::len).max().unwrap_or(0);
1703    let whitespace = sample.chars().filter(|c| c.is_whitespace()).count();
1704    longest_line > MINIFIED_LINE_THRESHOLD && whitespace * 100 < sample.len().max(1)
1705}
1706
1707fn is_binary(bytes: &[u8]) -> bool {
1708    if bytes.starts_with(&[0xEF, 0xBB, 0xBF])
1709        || bytes.starts_with(&[0xFF, 0xFE])
1710        || bytes.starts_with(&[0xFE, 0xFF])
1711    {
1712        return false;
1713    }
1714
1715    let sample = &bytes[..bytes.len().min(BINARY_SAMPLE_BYTES)];
1716    sample.contains(&0)
1717}
1718
1719/// Decode a BOM-stripped UTF-16 byte slice using the given encoding.
1720/// Returns `(text, encoding_label, warnings)`.
1721fn decode_utf16_bom(
1722    bom_stripped: &[u8],
1723    encoding: &'static encoding_rs::Encoding,
1724    label: &str,
1725) -> (String, String, Vec<String>) {
1726    let (cow, _, had_errors) = encoding.decode(bom_stripped);
1727    let mut warnings = Vec::new();
1728    if had_errors {
1729        warnings.push(format!("{label} decode contained replacement characters"));
1730    }
1731    (cow.into_owned(), label.into(), warnings)
1732}
1733
1734fn decode_bytes(bytes: &[u8]) -> std::result::Result<(String, String, Vec<String>), String> {
1735    if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
1736        let text = String::from_utf8(bytes[3..].to_vec()).map_err(|err| err.to_string())?;
1737        return Ok((text, "utf-8-bom".into(), vec![]));
1738    }
1739    if bytes.starts_with(&[0xFF, 0xFE]) {
1740        return Ok(decode_utf16_bom(&bytes[2..], UTF_16LE, "utf-16le"));
1741    }
1742    if bytes.starts_with(&[0xFE, 0xFF]) {
1743        return Ok(decode_utf16_bom(&bytes[2..], UTF_16BE, "utf-16be"));
1744    }
1745
1746    // Multiple statements in the else branch make map_or_else awkward here.
1747    #[allow(clippy::option_if_let_else)]
1748    if let Ok(text) = String::from_utf8(bytes.to_vec()) {
1749        Ok((text, "utf-8".into(), vec![]))
1750    } else {
1751        let (cow, _, had_errors) = WINDOWS_1252.decode(bytes);
1752        let mut warnings = vec!["decoded using windows-1252 fallback".into()];
1753        if had_errors {
1754            warnings.push("fallback decode contained replacement characters".into());
1755        }
1756        Ok((cow.into_owned(), "windows-1252".into(), warnings))
1757    }
1758}
1759
1760fn compile_globset(patterns: &[String]) -> Result<Option<GlobSet>> {
1761    if patterns.is_empty() {
1762        return Ok(None);
1763    }
1764
1765    let mut builder = GlobSetBuilder::new();
1766    for pattern in patterns {
1767        builder
1768            .add(Glob::new(pattern).with_context(|| format!("invalid glob pattern: {pattern}"))?);
1769    }
1770    Ok(Some(
1771        builder.build().context("failed to compile glob filters")?,
1772    ))
1773}
1774
1775fn parse_enabled_languages(enabled: &[String]) -> Result<Option<BTreeSet<Language>>> {
1776    if enabled.is_empty() {
1777        return Ok(None);
1778    }
1779
1780    let supported = supported_languages();
1781    let mut set = BTreeSet::new();
1782    for name in enabled {
1783        let language = Language::from_name(name)
1784            .with_context(|| format!("unsupported language in config: {name}"))?;
1785        if !supported.contains(&language) {
1786            anyhow::bail!("language {name} is not supported in this build");
1787        }
1788        set.insert(language);
1789    }
1790    Ok(Some(set))
1791}
1792
1793/// # Errors
1794///
1795/// Returns an error if serialization fails or the output file cannot be written.
1796pub fn write_json(run: &AnalysisRun, output_path: &Path) -> Result<()> {
1797    let json = serde_json::to_string_pretty(run).context("failed to serialize analysis run")?;
1798    fs::write(output_path, json)
1799        .with_context(|| format!("failed to write JSON output to {}", output_path.display()))
1800}
1801
1802/// # Errors
1803///
1804/// Returns an error if the file cannot be read or the JSON cannot be parsed.
1805pub fn read_json(path: &Path) -> Result<AnalysisRun> {
1806    let contents = fs::read_to_string(path)
1807        .with_context(|| format!("failed to read result file {}", path.display()))?;
1808    serde_json::from_str(&contents)
1809        .with_context(|| format!("failed to parse JSON result {}", path.display()))
1810}
1811
#[cfg(test)]
mod tests {
    use super::*;

    /// Under `MixedLinePolicy::CodeOnly` the expected totals are 5 code lines
    /// and 3 comment lines for the raw counts below.
    #[test]
    fn effective_counts_respect_code_only_policy() {
        let raw_counts = RawLineCounts {
            code_only_lines: 2,
            single_comment_only_lines: 1,
            mixed_code_single_comment_lines: 3,
            docstring_comment_lines: 2,
            ..RawLineCounts::default()
        };

        let effective = compute_effective_counts(&raw_counts, MixedLinePolicy::CodeOnly, true, true);

        assert_eq!(effective.code_lines, 5);
        assert_eq!(effective.comment_lines, 3);
    }

    /// Under `MixedLinePolicy::SeparateMixedCategory` both kinds of mixed
    /// lines land in the separate bucket and neither code nor comment totals
    /// are touched.
    #[test]
    fn effective_counts_can_separate_mixed() {
        let raw_counts = RawLineCounts {
            mixed_code_single_comment_lines: 2,
            mixed_code_multi_comment_lines: 1,
            ..RawLineCounts::default()
        };

        let effective = compute_effective_counts(
            &raw_counts,
            MixedLinePolicy::SeparateMixedCategory,
            true,
            true,
        );

        assert_eq!(effective.mixed_lines_separate, 3);
        assert_eq!(effective.code_lines, 0);
        assert_eq!(effective.comment_lines, 0);
    }

    /// Bytes that are not valid UTF-8 fall back to windows-1252, where 0x96
    /// maps to an en dash, and the fallback is reported through a warning.
    #[test]
    fn windows_1252_fallback_decodes() {
        let input: &[u8] = b"Hello\x96W";

        let (decoded, label, notes) = decode_bytes(input).unwrap();

        assert_eq!(label, "windows-1252");
        assert!(decoded.contains('\u{2013}'));
        assert!(!notes.is_empty());
    }
}