Skip to main content

sloc_core/
lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright (C) 2026 Nima Shafie <nimzshafie@gmail.com>
3#![allow(clippy::multiple_crate_versions)]
4
5pub mod baseline;
6pub mod coverage;
7pub mod delta;
8pub mod history;
9pub use baseline::{check_against_baseline, resolve_baselines_path, BaselineEntry, BaselineStore};
10pub use coverage::{aggregate_line_coverage, lookup_coverage, parse_lcov, FileCoverage};
11pub use delta::{compute_delta, FileChangeStatus, FileDelta, ScanComparison, SummaryDelta};
12pub use history::{RegistryEntry, ScanRegistry, ScanSummarySnapshot, WatchedDirsStore};
13
14use std::collections::{BTreeMap, BTreeSet, HashSet};
15use std::fs;
16use std::path::{Path, PathBuf};
17use std::sync::atomic::{AtomicBool, Ordering};
18
19use anyhow::{Context, Result};
20use chrono::{DateTime, Utc};
21use encoding_rs::{UTF_16BE, UTF_16LE, WINDOWS_1252};
22use globset::{Glob, GlobSet, GlobSetBuilder};
23use ignore::WalkBuilder;
24use serde::{Deserialize, Serialize};
25use uuid::Uuid;
26
27use sloc_config::{
28    AppConfig, BinaryFileBehavior, BlankInBlockCommentPolicy, ContinuationLinePolicy,
29    FailureBehavior, MixedLinePolicy,
30};
31use sloc_languages::{
32    analyze_text, detect_language, supported_languages, AnalysisOptions, Language, ParseMode,
33    RawLineCounts,
34};
35
36// ── Detection sample sizes and thresholds ────────────────────────────────────
37
/// Maximum number of worker threads used for parallel file analysis.
/// The detected parallelism is clamped to this value.
const MAX_ANALYSIS_THREADS: usize = 16;
/// Fallback thread count when `std::thread::available_parallelism` returns an error.
const DEFAULT_ANALYSIS_THREADS: usize = 4;
/// Byte sample used to detect `@generated` markers.
const GENERATED_SAMPLE_BYTES: usize = 1024;
/// Byte sample used to detect minified files via line-length heuristic.
const MINIFIED_SAMPLE_BYTES: usize = 4096;
/// Longest line length above which a file is considered minified.
const MINIFIED_LINE_THRESHOLD: usize = 2000;
/// Byte sample used to detect binary files via null-byte scan.
const BINARY_SAMPLE_BYTES: usize = 8192;
50
/// Three-way outcome for metadata-level policy checks.
///
/// Returned by `check_metadata_policy`; the `Skip` variant is built through
/// `skip_with_reason`.
enum MetadataPolicyOutcome {
    /// Skip this file — include the record in output.
    /// The record is boxed (NOTE(review): presumably to keep the enum small — confirm).
    Skip(Box<FileRecord>),
    /// Exclude this file entirely — no record in output (include-glob miss).
    Exclude,
    /// Continue to content checks.
    Continue,
}
60
/// Outcome recorded for a file that the scan considered.
///
/// Serialized in `snake_case`. Records with `AnalyzedExact` or
/// `AnalyzedBestEffort` are collected as analyzed files; records with any other
/// status are collected as skipped files.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FileStatus {
    AnalyzedExact,
    AnalyzedBestEffort,
    SkippedBinary,
    SkippedDecodeError,
    SkippedUnsupported,
    /// Assigned when a metadata- or content-level policy check skips the file.
    SkippedByPolicy,
    ErrorInternal,
}
72
/// Per-file line counts stored alongside the raw line categories.
///
/// NOTE(review): presumably these are the counts after the configured
/// mixed-line / continuation policies have been applied — confirm against the
/// code that fills this struct. All fields default to zero.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct EffectiveCounts {
    pub code_lines: u64,
    pub comment_lines: u64,
    pub blank_lines: u64,
    pub mixed_lines_separate: u64,
}
80
/// Identifies the tool build and the individual run that produced a report.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolMetadata {
    /// Tool name; `assemble_run` sets this to `"sloc"`.
    pub name: String,
    /// Crate version taken from `CARGO_PKG_VERSION` at compile time.
    pub version: String,
    /// `<YYYYMMDD-HHMM>-<uuid v4 in simple form>`, built from the run timestamp.
    pub run_id: String,
    /// UTC time captured when the run was assembled.
    pub timestamp_utc: DateTime<Utc>,
}
88
/// Describes the machine and invocation context of a run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnvironmentMetadata {
    /// Value of `std::env::consts::OS`.
    pub operating_system: String,
    /// Value of `std::env::consts::ARCH`.
    pub architecture: String,
    /// Caller-supplied label passed through from `analyze`.
    pub runtime_mode: String,
    /// From the `USERNAME` / `USER` environment variables, or `"unknown"`.
    pub initiator_username: String,
    /// From `COMPUTERNAME` / `HOSTNAME` / `/etc/hostname`, or `"unknown"`.
    pub initiator_hostname: String,
}
97
/// Whole-run totals stored in `AnalysisRun::summary_totals`.
///
/// Fields marked `#[serde(default)]` fall back to zero when they are absent from
/// the data being deserialized.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SummaryTotals {
    pub files_considered: u64,
    pub files_analyzed: u64,
    pub files_skipped: u64,
    pub total_physical_lines: u64,
    pub code_lines: u64,
    pub comment_lines: u64,
    pub blank_lines: u64,
    pub mixed_lines_separate: u64,
    #[serde(default)]
    pub functions: u64,
    #[serde(default)]
    pub classes: u64,
    #[serde(default)]
    pub variables: u64,
    #[serde(default)]
    pub imports: u64,
    #[serde(default)]
    pub test_count: u64,
    /// Lexically detected test assertion call lines across all analyzed files.
    #[serde(default)]
    pub test_assertion_count: u64,
    /// Lexically detected test suite / fixture / group declaration lines across all analyzed files.
    #[serde(default)]
    pub test_suite_count: u64,
    /// Aggregated from LCOV data when provided.
    #[serde(default)]
    pub coverage_lines_found: u64,
    #[serde(default)]
    pub coverage_lines_hit: u64,
    #[serde(default)]
    pub coverage_functions_found: u64,
    #[serde(default)]
    pub coverage_functions_hit: u64,
    #[serde(default)]
    pub coverage_branches_found: u64,
    #[serde(default)]
    pub coverage_branches_hit: u64,
}
138
/// Totals for a single language.
///
/// Used both for the run-wide `AnalysisRun::totals_by_language` list and inside
/// each `SubmoduleSummary`. Fields marked `#[serde(default)]` fall back to zero
/// when they are absent from the data being deserialized.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageSummary {
    pub language: Language,
    pub files: u64,
    pub total_physical_lines: u64,
    pub code_lines: u64,
    pub comment_lines: u64,
    pub blank_lines: u64,
    pub mixed_lines_separate: u64,
    #[serde(default)]
    pub functions: u64,
    #[serde(default)]
    pub classes: u64,
    #[serde(default)]
    pub variables: u64,
    #[serde(default)]
    pub imports: u64,
    #[serde(default)]
    pub test_count: u64,
    #[serde(default)]
    pub test_assertion_count: u64,
    #[serde(default)]
    pub test_suite_count: u64,
    #[serde(default)]
    pub coverage_lines_found: u64,
    #[serde(default)]
    pub coverage_lines_hit: u64,
    #[serde(default)]
    pub coverage_functions_found: u64,
    #[serde(default)]
    pub coverage_functions_hit: u64,
    #[serde(default)]
    pub coverage_branches_found: u64,
    #[serde(default)]
    pub coverage_branches_hit: u64,
}
175
/// Everything recorded about one file, whether it was analyzed or skipped.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileRecord {
    pub path: String,
    /// Sort key for the output lists; also used as the prefix for per-file
    /// warnings, for submodule matching, and for the coverage lookup.
    pub relative_path: String,
    pub language: Option<Language>,
    pub size_bytes: u64,
    pub detected_encoding: Option<String>,
    pub raw_line_categories: RawLineCounts,
    pub effective_counts: EffectiveCounts,
    /// Decides whether the record lands in the analyzed or the skipped list.
    pub status: FileStatus,
    /// Per-file warnings; `push_record` copies them into the run-level warning list.
    pub warnings: Vec<String>,
    pub generated: bool,
    pub minified: bool,
    pub vendor: bool,
    pub parse_mode: Option<ParseMode>,
    /// Name of the containing submodule; set by `process_submodules` when a
    /// submodule path prefixes `relative_path`. Omitted from output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub submodule: Option<String>,
    /// Line/function/branch coverage from an external LCOV file, when provided.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub coverage: Option<FileCoverage>,
}
197
/// Per-submodule aggregated stats produced when `submodule_breakdown` is enabled.
///
/// Built by `build_submodule_summaries` and stored in
/// `AnalysisRun::submodule_summaries`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SubmoduleSummary {
    pub name: String,
    pub relative_path: String,
    pub files_analyzed: u64,
    pub total_physical_lines: u64,
    pub code_lines: u64,
    pub comment_lines: u64,
    pub blank_lines: u64,
    /// Per-language breakdown for this submodule.
    pub language_summaries: Vec<LanguageSummary>,
}
210
/// Complete, serializable result of one scan: tool and environment metadata, the
/// configuration that was used, per-file records, aggregates and git context.
///
/// Assembled by `assemble_run` and returned from `analyze`. The optional git
/// fields and `submodule_summaries` are omitted from serialized output when
/// they are empty.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnalysisRun {
    pub tool: ToolMetadata,
    pub environment: EnvironmentMetadata,
    /// Clone of the configuration the run was executed with.
    pub effective_configuration: AppConfig,
    /// String form of every configured root path.
    pub input_roots: Vec<String>,
    pub summary_totals: SummaryTotals,
    pub totals_by_language: Vec<LanguageSummary>,
    /// Analyzed files, sorted by `relative_path`.
    pub per_file_records: Vec<FileRecord>,
    /// Skipped files, sorted by `relative_path`.
    pub skipped_file_records: Vec<FileRecord>,
    /// Run-level warnings, including per-file warnings prefixed with the file's relative path.
    pub warnings: Vec<String>,
    /// Non-empty only when `discovery.submodule_breakdown` is enabled.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub submodule_summaries: Vec<SubmoduleSummary>,
    /// Short git commit SHA (7 chars) at scan time, if the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_short: Option<String>,
    /// Full git commit SHA at scan time, if the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_long: Option<String>,
    /// Git branch active at scan time, if the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_branch: Option<String>,
    /// Author of the last git commit at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_author: Option<String>,
    /// Comma-separated git tags pointing at HEAD at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_tags: Option<String>,
    /// Nearest ancestor release tag (output of `git describe --tags --abbrev=0`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_nearest_tag: Option<String>,
    /// ISO 8601 author-date of the last git commit at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_date: Option<String>,
}
247
/// Git metadata gathered by `detect_git_for_run`; each field is copied into the
/// matching `git_*` field of `AnalysisRun`. Every field defaults to `None`.
#[derive(Default)]
struct GitInfo {
    /// First seven characters of `commit_long`.
    commit_short: Option<String>,
    /// Commit id that HEAD resolves to.
    commit_long: Option<String>,
    /// Branch name with the `refs/heads/` prefix removed; `None` for a detached HEAD.
    branch: Option<String>,
    /// Identity name parsed from the last `logs/HEAD` entry.
    author: Option<String>,
    /// Output of `git tag --points-at HEAD`, joined with `", "`.
    tags: Option<String>,
    /// Output of `git describe --tags --abbrev=0 HEAD`.
    nearest_tag: Option<String>,
    /// Timestamp parsed from the last `logs/HEAD` entry.
    commit_date: Option<String>,
}
258
259/// Locate the `.git` directory by walking up from `start`.
260/// Handles plain repos, worktrees (`.git` is a file with `gitdir:` pointer), and
261/// submodules. Returns `None` if no git repo is found.
262fn find_git_dir(start: &Path) -> Option<PathBuf> {
263    let mut current = Some(start);
264    while let Some(dir) = current {
265        let candidate = dir.join(".git");
266        if candidate.is_dir() {
267            return Some(candidate);
268        }
269        if candidate.is_file() {
270            if let Some(resolved) = resolve_git_file_pointer(&candidate, dir) {
271                return Some(resolved);
272            }
273        }
274        current = dir.parent();
275    }
276    None
277}
278
/// Follow a `.git` pointer file to the git directory it names.
///
/// `file` is expected to contain `gitdir: <path>`; a relative `<path>` is
/// interpreted against `base_dir`. Yields `None` when the file cannot be read,
/// does not start with the `gitdir: ` prefix, or names something that is not an
/// existing directory.
fn resolve_git_file_pointer(file: &Path, base_dir: &Path) -> Option<PathBuf> {
    let raw = fs::read_to_string(file).ok()?;
    let target = raw.trim().strip_prefix("gitdir: ")?;

    // Git writes forward slashes; convert them to the platform separator so the
    // path operations below behave the same on Windows.
    let native = target.replace('/', std::path::MAIN_SEPARATOR_STR);

    let mut candidate = PathBuf::from(&native);
    if !candidate.is_absolute() {
        candidate = base_dir.join(&native);
    }

    // Prefer the canonical form (".." and symlinks resolved); keep the plain
    // path when canonicalization itself fails.
    let candidate = match candidate.canonicalize() {
        Ok(canonical) => canonical,
        Err(_) => candidate,
    };

    candidate.is_dir().then_some(candidate)
}
303
/// Resolve a git ref name (e.g. `refs/heads/main`) to its full commit id.
///
/// The loose ref file under `git_dir` is consulted first; if it is missing or
/// does not contain an object id, `git_dir/packed-refs` is searched. An object
/// id is accepted when it is at least 40 characters long and consists solely
/// of hexadecimal digits.
///
/// Returns `None` when the ref is found in neither place.
fn resolve_ref(git_dir: &Path, refname: &str) -> Option<String> {
    /// `true` when `s` looks like a full object id (>= 40 hex digits).
    fn is_object_id(s: &str) -> bool {
        s.len() >= 40 && s.chars().all(|c| c.is_ascii_hexdigit())
    }

    // Join each forward-slash component individually so the loose ref path uses
    // the correct separator on every platform.
    let ref_path = refname
        .split('/')
        .fold(git_dir.to_path_buf(), |p, c| p.join(c));
    if let Ok(raw) = fs::read_to_string(&ref_path) {
        let sha = raw.trim();
        if is_object_id(sha) {
            return Some(sha.to_string());
        }
    }

    // Packed refs: each entry is "<sha> <refname>". Lines starting with '#' are
    // comments and lines starting with '^' are peeled tag objects. A line that
    // has no space separator (blank or malformed) is skipped rather than
    // aborting the whole lookup. `str::lines` handles both \n and \r\n.
    let packed = fs::read_to_string(git_dir.join("packed-refs")).ok()?;
    packed
        .lines()
        .filter(|line| !line.starts_with('#') && !line.starts_with('^'))
        .filter_map(|line| line.split_once(' '))
        .find(|(sha, name)| name.trim() == refname && is_object_id(sha))
        .map(|(sha, _)| sha.to_string())
}
339
340/// Parse the last entry of `.git/logs/HEAD` to get the commit author name and
341/// author-date in ISO 8601 format.
342///
343/// Reflog line format:
344/// `<old-sha> <new-sha> Author Name <email> <unix-ts> <tz-offset>\t<message>`
345fn parse_last_reflog_entry(git_dir: &Path) -> (Option<String>, Option<String>) {
346    let log_path = git_dir.join("logs").join("HEAD");
347    let Ok(content) = fs::read_to_string(&log_path) else {
348        return (None, None);
349    };
350    let Some(last) = content.lines().rfind(|l| !l.trim().is_empty()) else {
351        return (None, None);
352    };
353
354    // Skip the two 40-char SHAs + their separating spaces
355    // (an initial commit shows 0000... as old-sha, still 40 chars)
356    let Some(after_shas) = last.splitn(3, ' ').nth(2) else {
357        return (None, None);
358    };
359
360    // Author name ends just before " <email>"
361    let author = after_shas.find(" <").map(|i| after_shas[..i].to_string());
362
363    // Timestamp is the number after the closing ">"
364    let date = (|| {
365        use chrono::TimeZone as _;
366        let close = after_shas.find("> ")?;
367        let rest = after_shas[close + 2..].trim_start();
368        let mut tokens = rest.splitn(3, ' ');
369        let unix_str = tokens.next()?;
370        let offset_str = tokens.next().map(|s| s.split('\t').next().unwrap_or(s))?;
371        let ts: i64 = unix_str.parse().ok()?;
372        let dt = chrono::Utc.timestamp_opt(ts, 0).single()?;
373        // Format as ISO 8601 with timezone offset, e.g. 2026-05-17T12:51:54-07:00
374        let tz_display = if offset_str.len() == 5 {
375            format!("{}:{}", &offset_str[..3], &offset_str[3..])
376        } else {
377            offset_str.to_string()
378        };
379        Some(format!("{}{}", dt.format("%Y-%m-%dT%H:%M:%S"), tz_display))
380    })();
381
382    (author, date)
383}
384
385/// Detect git metadata by reading `.git/` files directly — no `git` executable
386/// needed. Falls back gracefully for detached HEADs, shallow clones, and missing
387/// reflogs.
388fn detect_git_for_run(project_path: &Path) -> GitInfo {
389    let Some(git_dir) = find_git_dir(project_path) else {
390        return GitInfo::default();
391    };
392
393    let head_raw = match fs::read_to_string(git_dir.join("HEAD")) {
394        Ok(s) => s.trim().to_string(),
395        Err(_) => return GitInfo::default(),
396    };
397
398    let (branch, commit_long) = head_raw.strip_prefix("ref: ").map_or_else(
399        || {
400            if head_raw.len() >= 40 && head_raw.chars().all(|c| c.is_ascii_hexdigit()) {
401                // Detached HEAD — the HEAD file itself is the commit SHA
402                (None, Some(head_raw[..40].to_string()))
403            } else {
404                (None, None)
405            }
406        },
407        |refname| {
408            let branch = refname
409                .strip_prefix("refs/heads/")
410                .map(|b| b.trim().to_string());
411            let sha = resolve_ref(&git_dir, refname.trim());
412            (branch, sha)
413        },
414    );
415
416    let commit_short = commit_long
417        .as_deref()
418        .map(|s| s.chars().take(7).collect::<String>());
419
420    let (author, commit_date) = parse_last_reflog_entry(&git_dir);
421
422    // Tags and nearest-tag still require git CLI — try it as a best-effort bonus
423    // but don't block on it. If git isn't available these will simply be None.
424    let tags = run_git_cmd(project_path, &["tag", "--points-at", "HEAD"]).map(|t| {
425        t.lines()
426            .filter(|l| !l.is_empty())
427            .collect::<Vec<_>>()
428            .join(", ")
429    });
430    let nearest_tag = run_git_cmd(project_path, &["describe", "--tags", "--abbrev=0", "HEAD"]);
431
432    GitInfo {
433        commit_short,
434        commit_long,
435        branch,
436        author,
437        tags,
438        nearest_tag,
439        commit_date,
440    }
441}
442
/// Invoke `git` with `args` inside `dir` and return its trimmed stdout.
///
/// This is a best-effort supplement; the core commit/branch/author fields are
/// read straight from the git directory instead. Each known executable
/// location is tried in order and the first one that exits successfully with
/// non-empty UTF-8 output wins. Returns `None` when none does.
///
/// NOTE(review): every invocation passes `-c safe.directory=*`, which is meant
/// to bypass git's repository-ownership check — confirm this is intended for
/// scans of untrusted checkouts.
fn run_git_cmd(dir: &Path, args: &[&str]) -> Option<String> {
    // The bare name works whenever git is on PATH; the absolute paths cover
    // service accounts with a stripped PATH. Paths for the other platform
    // simply fail to spawn and are skipped.
    const GIT_CANDIDATES: [&str; 7] = [
        "git",
        // Common Linux / macOS install locations
        "/usr/bin/git",
        "/usr/local/bin/git",
        "/opt/homebrew/bin/git",
        // Git for Windows default installation paths
        r"C:\Program Files\Git\cmd\git.exe",
        r"C:\Program Files\Git\bin\git.exe",
        r"C:\Program Files (x86)\Git\cmd\git.exe",
    ];

    GIT_CANDIDATES.iter().find_map(|&exe| {
        let output = std::process::Command::new(exe)
            .args(["-c", "safe.directory=*"])
            .args(args)
            .current_dir(dir)
            .output()
            .ok()?;
        if !output.status.success() {
            return None;
        }
        let text = String::from_utf8(output.stdout).ok()?;
        let trimmed = text.trim();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed.to_string())
        }
    })
}
478
/// Name of the user running the scan: the `USERNAME` environment variable if
/// set, otherwise `USER`, otherwise the literal `"unknown"`.
fn get_current_username() -> String {
    if let Ok(name) = std::env::var("USERNAME") {
        return name;
    }
    match std::env::var("USER") {
        Ok(name) => name,
        Err(_) => "unknown".to_string(),
    }
}
484
/// Host name of the machine running the scan.
///
/// Sources are tried in order: the `COMPUTERNAME` environment variable, the
/// `HOSTNAME` environment variable, then the trimmed contents of
/// `/etc/hostname`. Falls back to the literal `"unknown"`.
fn get_hostname() -> String {
    if let Ok(name) = std::env::var("COMPUTERNAME") {
        return name;
    }
    if let Ok(name) = std::env::var("HOSTNAME") {
        return name;
    }
    match std::fs::read_to_string("/etc/hostname") {
        Ok(contents) => contents.trim().to_string(),
        Err(_) => "unknown".to_string(),
    }
}
491
492/// Walk a single directory root and collect file records into the output vectors.
493#[allow(clippy::too_many_arguments)]
494fn walk_root(
495    root: &Path,
496    config: &AppConfig,
497    include_globs: Option<&GlobSet>,
498    exclude_globs: Option<&GlobSet>,
499    enabled_languages: Option<&BTreeSet<Language>>,
500    seen_paths: &mut HashSet<PathBuf>,
501    analyzed: &mut Vec<FileRecord>,
502    skipped: &mut Vec<FileRecord>,
503    warnings: &mut Vec<String>,
504    cancel: Option<&AtomicBool>,
505) -> Result<()> {
506    let mut builder = WalkBuilder::new(root);
507    builder
508        .follow_links(config.discovery.follow_symlinks)
509        .hidden(config.discovery.ignore_hidden_files)
510        .ignore(config.discovery.honor_ignore_files)
511        .parents(config.discovery.honor_ignore_files)
512        .git_ignore(config.discovery.honor_ignore_files)
513        .git_global(config.discovery.honor_ignore_files)
514        .git_exclude(config.discovery.honor_ignore_files);
515
516    let paths = collect_walk_paths(&builder, seen_paths, warnings);
517    if paths.is_empty() {
518        return Ok(());
519    }
520
521    let chunk_results = run_parallel_analysis(
522        &paths,
523        root,
524        config,
525        include_globs,
526        exclude_globs,
527        enabled_languages,
528        cancel,
529    )?;
530    merge_chunk_results(chunk_results, analyzed, skipped, warnings)
531}
532
533fn collect_walk_paths(
534    builder: &WalkBuilder,
535    seen_paths: &mut HashSet<PathBuf>,
536    warnings: &mut Vec<String>,
537) -> Vec<PathBuf> {
538    let mut paths = Vec::new();
539    for entry in builder.build() {
540        let entry = match entry {
541            Ok(e) => e,
542            Err(err) => {
543                warnings.push(format!("discovery warning: {err}"));
544                continue;
545            }
546        };
547        let path = entry.into_path();
548        if path.is_dir() || !seen_paths.insert(path.clone()) {
549            continue;
550        }
551        paths.push(path);
552    }
553    paths
554}
555
556#[allow(clippy::too_many_arguments)]
557fn run_parallel_analysis(
558    paths: &[PathBuf],
559    root: &Path,
560    config: &AppConfig,
561    include_globs: Option<&GlobSet>,
562    exclude_globs: Option<&GlobSet>,
563    enabled_languages: Option<&BTreeSet<Language>>,
564    cancel: Option<&AtomicBool>,
565) -> Result<Vec<Vec<Result<Option<FileRecord>>>>> {
566    let thread_count = std::thread::available_parallelism().map_or(DEFAULT_ANALYSIS_THREADS, |n| {
567        n.get().min(MAX_ANALYSIS_THREADS)
568    });
569    let chunk_size = paths.len().div_ceil(thread_count);
570    std::thread::scope(|s| -> Result<Vec<Vec<Result<Option<FileRecord>>>>> {
571        paths
572            .chunks(chunk_size)
573            .map(|chunk| {
574                s.spawn(move || -> Vec<Result<Option<FileRecord>>> {
575                    let mut results = Vec::with_capacity(chunk.len());
576                    for path in chunk {
577                        if cancel.is_some_and(|c| c.load(Ordering::Relaxed)) {
578                            results.push(Err(anyhow::anyhow!("analysis cancelled")));
579                            break;
580                        }
581                        results.push(analyze_candidate_file(
582                            path,
583                            root,
584                            config,
585                            include_globs,
586                            exclude_globs,
587                            enabled_languages,
588                        ));
589                    }
590                    results
591                })
592            })
593            .map(|h| {
594                h.join()
595                    .map_err(|_| anyhow::anyhow!("analysis thread panicked"))
596            })
597            .collect()
598    })
599}
600
601fn merge_chunk_results(
602    chunk_results: Vec<Vec<Result<Option<FileRecord>>>>,
603    analyzed: &mut Vec<FileRecord>,
604    skipped: &mut Vec<FileRecord>,
605    warnings: &mut Vec<String>,
606) -> Result<()> {
607    for chunk in chunk_results {
608        for result in chunk {
609            if let Some(record) = result? {
610                push_record(record, analyzed, skipped, warnings);
611            }
612        }
613    }
614    Ok(())
615}
616
617/// Label each analyzed file with its submodule and build per-submodule summaries.
618fn process_submodules(config: &AppConfig, analyzed: &mut [FileRecord]) -> Vec<SubmoduleSummary> {
619    let root = config.discovery.root_paths[0]
620        .canonicalize()
621        .unwrap_or_else(|_| config.discovery.root_paths[0].clone());
622    let submodules = detect_submodules(&root);
623    if submodules.is_empty() {
624        return Vec::new();
625    }
626
627    for file in analyzed.iter_mut() {
628        for (name, sub_path) in &submodules {
629            let prefix = sub_path.to_string_lossy().replace('\\', "/");
630            let rel = &file.relative_path;
631            if rel == &prefix || rel.starts_with(&format!("{prefix}/")) {
632                file.submodule = Some(name.clone());
633                break;
634            }
635        }
636    }
637
638    build_submodule_summaries(analyzed, &submodules)
639}
640
641/// Assemble the final `AnalysisRun` from collected records and metadata.
642fn assemble_run(
643    config: &AppConfig,
644    runtime_mode: &str,
645    analyzed: Vec<FileRecord>,
646    skipped: Vec<FileRecord>,
647    warnings: Vec<String>,
648    submodule_summaries: Vec<SubmoduleSummary>,
649) -> AnalysisRun {
650    let summary = build_summary(&analyzed, &skipped);
651    let language_summaries = build_language_summaries(&analyzed);
652
653    let first_root = config
654        .discovery
655        .root_paths
656        .first()
657        .map(|p| p.canonicalize().unwrap_or_else(|_| p.clone()));
658    let git = first_root
659        .as_deref()
660        .map(detect_git_for_run)
661        .unwrap_or_default();
662
663    let now = Utc::now();
664    let run_id = {
665        let uuid_suffix = Uuid::new_v4().simple().to_string();
666        format!("{}-{}", now.format("%Y%m%d-%H%M"), uuid_suffix)
667    };
668
669    AnalysisRun {
670        tool: ToolMetadata {
671            name: "sloc".into(),
672            version: env!("CARGO_PKG_VERSION").into(),
673            run_id,
674            timestamp_utc: now,
675        },
676        environment: EnvironmentMetadata {
677            operating_system: std::env::consts::OS.into(),
678            architecture: std::env::consts::ARCH.into(),
679            runtime_mode: runtime_mode.into(),
680            initiator_username: get_current_username(),
681            initiator_hostname: get_hostname(),
682        },
683        effective_configuration: config.clone(),
684        input_roots: config
685            .discovery
686            .root_paths
687            .iter()
688            .map(|p| path_to_string(p))
689            .collect(),
690        summary_totals: summary,
691        totals_by_language: language_summaries,
692        per_file_records: analyzed,
693        skipped_file_records: skipped,
694        warnings,
695        submodule_summaries,
696        git_commit_short: git.commit_short,
697        git_commit_long: git.commit_long,
698        git_branch: git.branch,
699        git_commit_author: git.author,
700        git_tags: git.tags,
701        git_nearest_tag: git.nearest_tag,
702        git_commit_date: git.commit_date,
703    }
704}
705
706/// # Errors
707///
708/// Returns an error if the config is invalid, root paths cannot be walked, or any file
709/// analysis step fails in a way that cannot be recovered from.
710#[allow(clippy::too_many_lines)]
711pub fn analyze(
712    config: &AppConfig,
713    runtime_mode: &str,
714    cancel: Option<&AtomicBool>,
715) -> Result<AnalysisRun> {
716    config.validate()?;
717
718    if config.discovery.root_paths.is_empty() {
719        anyhow::bail!("no input paths were provided");
720    }
721
722    let include_globs = compile_globset(&config.discovery.include_globs)?;
723    let exclude_globs = compile_globset(&config.discovery.exclude_globs)?;
724    let enabled_languages = parse_enabled_languages(&config.analysis.enabled_languages)?;
725
726    let mut analyzed = Vec::new();
727    let mut skipped = Vec::new();
728    let mut warnings = Vec::new();
729    let mut seen_paths = HashSet::new();
730
731    for root in &config.discovery.root_paths {
732        if cancel.is_some_and(|c| c.load(Ordering::Relaxed)) {
733            anyhow::bail!("analysis cancelled");
734        }
735
736        let root = root.canonicalize().unwrap_or_else(|_| root.clone());
737
738        if root.is_file() {
739            if let Some(record) = analyze_candidate_file(
740                &root,
741                root.parent().unwrap_or_else(|| Path::new(".")),
742                config,
743                include_globs.as_ref(),
744                exclude_globs.as_ref(),
745                enabled_languages.as_ref(),
746            )? {
747                push_record(record, &mut analyzed, &mut skipped, &mut warnings);
748            }
749            continue;
750        }
751
752        walk_root(
753            &root,
754            config,
755            include_globs.as_ref(),
756            exclude_globs.as_ref(),
757            enabled_languages.as_ref(),
758            &mut seen_paths,
759            &mut analyzed,
760            &mut skipped,
761            &mut warnings,
762            cancel,
763        )?;
764    }
765
766    analyzed.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
767    skipped.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
768
769    // Submodule detection: label each file with its submodule and build per-submodule summaries.
770    let submodule_summaries = if config.discovery.submodule_breakdown {
771        process_submodules(config, &mut analyzed)
772    } else {
773        Vec::new()
774    };
775
776    attach_coverage(config, &mut analyzed, &mut warnings);
777
778    Ok(assemble_run(
779        config,
780        runtime_mode,
781        analyzed,
782        skipped,
783        warnings,
784        submodule_summaries,
785    ))
786}
787
788fn attach_coverage(config: &AppConfig, analyzed: &mut [FileRecord], warnings: &mut Vec<String>) {
789    let Some(cov_path) = coverage::resolve_coverage_file(config.analysis.coverage_file.as_deref())
790    else {
791        return;
792    };
793    match fs::read_to_string(&cov_path) {
794        Ok(content) => {
795            let cov_map = coverage::parse_coverage_auto(&cov_path, &content);
796            for record in analyzed.iter_mut() {
797                record.coverage =
798                    coverage::lookup_coverage(&cov_map, &record.relative_path).cloned();
799            }
800        }
801        Err(e) => {
802            warnings.push(format!(
803                "coverage file '{}' could not be read: {e}",
804                cov_path.display()
805            ));
806        }
807    }
808}
809
810fn push_record(
811    record: FileRecord,
812    analyzed: &mut Vec<FileRecord>,
813    skipped: &mut Vec<FileRecord>,
814    warnings: &mut Vec<String>,
815) {
816    warnings.extend(
817        record
818            .warnings
819            .iter()
820            .map(|warning| format!("{}: {warning}", record.relative_path)),
821    );
822
823    match record.status {
824        FileStatus::AnalyzedExact | FileStatus::AnalyzedBestEffort => analyzed.push(record),
825        _ => skipped.push(record),
826    }
827}
828
829/// Convenience wrapper: build a boxed `Skip` outcome with a single-item warning message.
830#[inline]
831fn skip_with_reason(
832    path: &Path,
833    root: &Path,
834    size: u64,
835    reason: impl Into<String>,
836) -> MetadataPolicyOutcome {
837    MetadataPolicyOutcome::Skip(Box::new(skipped_record(
838        path,
839        root,
840        size,
841        FileStatus::SkippedByPolicy,
842        vec![reason.into()],
843    )))
844}
845
846/// Apply metadata-level policy checks (symlink, name, dir exclusion, size, globs, lockfile).
847/// Returns `Skip(record)` to skip, `Exclude` to omit from output entirely (include-glob miss),
848/// or `Continue` to proceed to content checks.
849#[allow(clippy::too_many_arguments)]
850fn check_metadata_policy(
851    path: &Path,
852    root: &Path,
853    relative_path: &str,
854    metadata: &fs::Metadata,
855    config: &AppConfig,
856    include_globs: Option<&GlobSet>,
857    exclude_globs: Option<&GlobSet>,
858) -> MetadataPolicyOutcome {
859    let size = metadata.len();
860
861    if metadata.file_type().is_symlink() && !config.discovery.follow_symlinks {
862        return skip_with_reason(path, root, size, "symlink skipped by policy");
863    }
864    if file_name_eq(path, ".gitignore") {
865        return skip_with_reason(path, root, size, ".gitignore is always excluded");
866    }
867    if is_excluded_dir_path(path, &config.discovery.excluded_directories) {
868        return skip_with_reason(path, root, size, "path matched excluded directory setting");
869    }
870    if size > config.discovery.max_file_size_bytes {
871        return skip_with_reason(
872            path,
873            root,
874            size,
875            format!(
876                "file exceeded max_file_size_bytes ({})",
877                config.discovery.max_file_size_bytes
878            ),
879        );
880    }
881    if let Some(globs) = include_globs {
882        if !globs.is_match(Path::new(relative_path)) && !globs.is_match(path) {
883            return MetadataPolicyOutcome::Exclude;
884        }
885    }
886    if let Some(globs) = exclude_globs {
887        if globs.is_match(Path::new(relative_path)) || globs.is_match(path) {
888            return skip_with_reason(path, root, size, "path matched exclude glob");
889        }
890    }
891    if is_known_lockfile(path) && !config.analysis.include_lockfiles {
892        return skip_with_reason(path, root, size, "lockfile skipped by default policy");
893    }
894
895    MetadataPolicyOutcome::Continue
896}
897
/// Result of the content-level policy checks performed by `check_content_policy`.
struct ContentPolicyResult {
    /// Whether `is_vendor_path` matched the file's path.
    vendor: bool,
    /// Whether generated-file detection is enabled and flagged the file.
    generated: bool,
    /// Whether minified-file detection is enabled and flagged the file.
    minified: bool,
    /// `Some(record)` when the file should be skipped; `None` when analysis should continue.
    skip_record: Option<FileRecord>,
}
904
905/// Apply content-level policy checks (vendor, generated, minified).
906/// `skip_record` is `Some` when the file should be skipped.
907fn check_content_policy(
908    path: &Path,
909    root: &Path,
910    size_bytes: u64,
911    bytes: &[u8],
912    config: &AppConfig,
913) -> ContentPolicyResult {
914    let vendor = is_vendor_path(path);
915    if vendor && config.analysis.vendor_directory_detection {
916        return ContentPolicyResult {
917            vendor,
918            generated: false,
919            minified: false,
920            skip_record: Some(skipped_record(
921                path,
922                root,
923                size_bytes,
924                FileStatus::SkippedByPolicy,
925                vec!["vendor file skipped by policy".into()],
926            )),
927        };
928    }
929
930    let generated = config.analysis.generated_file_detection && looks_generated(path, bytes);
931    if generated {
932        return ContentPolicyResult {
933            vendor,
934            generated,
935            minified: false,
936            skip_record: Some(skipped_record(
937                path,
938                root,
939                size_bytes,
940                FileStatus::SkippedByPolicy,
941                vec!["generated file skipped by policy".into()],
942            )),
943        };
944    }
945
946    let minified = config.analysis.minified_file_detection && looks_minified(path, bytes);
947    if minified {
948        return ContentPolicyResult {
949            vendor,
950            generated,
951            minified,
952            skip_record: Some(skipped_record(
953                path,
954                root,
955                size_bytes,
956                FileStatus::SkippedByPolicy,
957                vec!["minified file skipped by policy".into()],
958            )),
959        };
960    }
961
962    ContentPolicyResult {
963        vendor,
964        generated,
965        minified,
966        skip_record: None,
967    }
968}
969
970/// Decode file bytes to a UTF-8 string, handling binary detection and decode failures.
971fn decode_file_contents(
972    path: &Path,
973    root: &Path,
974    size_bytes: u64,
975    bytes: &[u8],
976    config: &AppConfig,
977) -> Result<Option<(String, String, Vec<String>)>> {
978    if is_binary(bytes) {
979        return match config.analysis.binary_file_behavior {
980            BinaryFileBehavior::Skip => Ok(None),
981            BinaryFileBehavior::Fail => {
982                anyhow::bail!("binary file encountered: {}", path.display())
983            }
984        };
985    }
986
987    match decode_bytes(bytes) {
988        Ok(result) => Ok(Some(result)),
989        Err(err) => match config.analysis.decode_failure_behavior {
990            FailureBehavior::WarnSkip => {
991                // Caller will handle the None as a SkippedDecodeError record.
992                // We use a sentinel: return Ok(None) but encode the error into a field.
993                // Instead, propagate as a skipped record via the caller.
994                let _ = (path, root, size_bytes); // suppress unused warnings
995                Err(anyhow::anyhow!("__decode_warn__: {err}"))
996            }
997            FailureBehavior::Fail => {
998                anyhow::bail!("decode failure for {}: {err}", path.display())
999            }
1000        },
1001    }
1002}
1003
1004#[allow(clippy::too_many_lines)]
1005fn analyze_candidate_file(
1006    path: &Path,
1007    root: &Path,
1008    config: &AppConfig,
1009    include_globs: Option<&GlobSet>,
1010    exclude_globs: Option<&GlobSet>,
1011    enabled_languages: Option<&BTreeSet<Language>>,
1012) -> Result<Option<FileRecord>> {
1013    let metadata = match fs::symlink_metadata(path) {
1014        Ok(metadata) => metadata,
1015        Err(err) => {
1016            return Ok(Some(skipped_record(
1017                path,
1018                root,
1019                0,
1020                FileStatus::ErrorInternal,
1021                vec![format!("failed to read metadata: {err}")],
1022            )));
1023        }
1024    };
1025
1026    let relative_path = relative_path_string(path, root);
1027
1028    // Metadata-level policy checks.
1029    match check_metadata_policy(
1030        path,
1031        root,
1032        &relative_path,
1033        &metadata,
1034        config,
1035        include_globs,
1036        exclude_globs,
1037    ) {
1038        MetadataPolicyOutcome::Skip(record) => return Ok(Some(*record)),
1039        MetadataPolicyOutcome::Exclude => return Ok(None),
1040        MetadataPolicyOutcome::Continue => {}
1041    }
1042
1043    let bytes = match fs::read(path) {
1044        Ok(bytes) => bytes,
1045        Err(err) => {
1046            return Ok(Some(skipped_record(
1047                path,
1048                root,
1049                metadata.len(),
1050                FileStatus::ErrorInternal,
1051                vec![format!("failed to read file: {err}")],
1052            )));
1053        }
1054    };
1055
1056    // Content-level policy checks (vendor, generated, minified).
1057    let content_policy = check_content_policy(path, root, metadata.len(), &bytes, config);
1058    if let Some(record) = content_policy.skip_record {
1059        return Ok(Some(record));
1060    }
1061    let (vendor, generated, minified) = (
1062        content_policy.vendor,
1063        content_policy.generated,
1064        content_policy.minified,
1065    );
1066
1067    // Decode content, handling binary and decode failures.
1068    let (text, encoding, decode_warnings) =
1069        match decode_file_contents(path, root, metadata.len(), &bytes, config) {
1070            Ok(Some(result)) => result,
1071            Ok(None) => {
1072                return Ok(Some(skipped_record(
1073                    path,
1074                    root,
1075                    metadata.len(),
1076                    FileStatus::SkippedBinary,
1077                    vec!["binary file skipped by default".into()],
1078                )));
1079            }
1080            Err(err) => {
1081                let msg = err.to_string();
1082                if let Some(warn_msg) = msg.strip_prefix("__decode_warn__: ") {
1083                    return Ok(Some(skipped_record(
1084                        path,
1085                        root,
1086                        metadata.len(),
1087                        FileStatus::SkippedDecodeError,
1088                        vec![warn_msg.to_string()],
1089                    )));
1090                }
1091                return Err(err);
1092            }
1093        };
1094
1095    let first_line = text.lines().next();
1096    let language = detect_language(
1097        path,
1098        first_line,
1099        &config.analysis.extension_overrides,
1100        config.analysis.shebang_detection,
1101    );
1102
1103    let Some(language) = language else {
1104        return Ok(Some(skipped_record(
1105            path,
1106            root,
1107            metadata.len(),
1108            FileStatus::SkippedUnsupported,
1109            vec!["unsupported or undetected language".into()],
1110        )));
1111    };
1112
1113    if let Some(enabled) = enabled_languages {
1114        if !enabled.contains(&language) {
1115            return Ok(Some(skipped_record(
1116                path,
1117                root,
1118                metadata.len(),
1119                FileStatus::SkippedByPolicy,
1120                vec![format!(
1121                    "language {} disabled by configuration",
1122                    language.display_name()
1123                )],
1124            )));
1125        }
1126    }
1127
1128    let ieee_opts = AnalysisOptions {
1129        blank_in_block_comment_as_comment: config.analysis.blank_in_block_comment_policy
1130            == BlankInBlockCommentPolicy::CountAsComment,
1131        collapse_continuation_lines: config.analysis.continuation_line_policy
1132            == ContinuationLinePolicy::CollapseToLogical,
1133    };
1134    let analysis = analyze_text(language, &text, ieee_opts);
1135    let effective_counts = compute_effective_counts(
1136        &analysis.raw,
1137        config.analysis.mixed_line_policy,
1138        config.analysis.python_docstrings_as_comments,
1139        config.analysis.count_compiler_directives,
1140    );
1141
1142    let mut warnings = decode_warnings;
1143    warnings.extend(analysis.warnings.clone());
1144
1145    Ok(Some(FileRecord {
1146        path: path_to_string(path),
1147        relative_path,
1148        language: Some(language),
1149        size_bytes: metadata.len(),
1150        detected_encoding: Some(encoding),
1151        raw_line_categories: analysis.raw,
1152        effective_counts,
1153        status: match analysis.parse_mode {
1154            ParseMode::Lexical | ParseMode::TreeSitter => FileStatus::AnalyzedExact,
1155            ParseMode::LexicalBestEffort => FileStatus::AnalyzedBestEffort,
1156        },
1157        warnings,
1158        generated,
1159        minified,
1160        vendor,
1161        parse_mode: Some(analysis.parse_mode),
1162        submodule: None,
1163        coverage: None,
1164    }))
1165}
1166
1167const fn compute_effective_counts(
1168    raw: &RawLineCounts,
1169    mixed_line_policy: MixedLinePolicy,
1170    python_docstrings_as_comments: bool,
1171    count_compiler_directives: bool,
1172) -> EffectiveCounts {
1173    let mut effective = EffectiveCounts {
1174        code_lines: raw.code_only_lines,
1175        comment_lines: raw.single_comment_only_lines + raw.multi_comment_only_lines,
1176        blank_lines: raw.blank_only_lines,
1177        mixed_lines_separate: 0,
1178    };
1179
1180    if python_docstrings_as_comments {
1181        effective.comment_lines += raw.docstring_comment_lines;
1182    } else {
1183        effective.code_lines += raw.docstring_comment_lines;
1184    }
1185
1186    let mixed_total = raw.mixed_code_single_comment_lines + raw.mixed_code_multi_comment_lines;
1187    match mixed_line_policy {
1188        MixedLinePolicy::CodeOnly => effective.code_lines += mixed_total,
1189        MixedLinePolicy::CodeAndComment => {
1190            effective.code_lines += mixed_total;
1191            effective.comment_lines += mixed_total;
1192        }
1193        MixedLinePolicy::CommentOnly => effective.comment_lines += mixed_total,
1194        MixedLinePolicy::SeparateMixedCategory => effective.mixed_lines_separate += mixed_total,
1195    }
1196
1197    // IEEE 1045-1992 §4.2: optionally exclude preprocessor/compiler directives from code SLOC.
1198    // compiler_directive_lines is a subset of code_only_lines, so subtract it directly.
1199    if !count_compiler_directives {
1200        effective.code_lines = effective
1201            .code_lines
1202            .saturating_sub(raw.compiler_directive_lines);
1203    }
1204
1205    effective
1206}
1207
1208fn build_summary(analyzed: &[FileRecord], skipped: &[FileRecord]) -> SummaryTotals {
1209    let mut summary = SummaryTotals {
1210        files_considered: (analyzed.len() + skipped.len()) as u64,
1211        files_analyzed: analyzed.len() as u64,
1212        files_skipped: skipped.len() as u64,
1213        ..Default::default()
1214    };
1215
1216    for record in analyzed {
1217        summary.total_physical_lines += record.raw_line_categories.total_physical_lines;
1218        summary.code_lines += record.effective_counts.code_lines;
1219        summary.comment_lines += record.effective_counts.comment_lines;
1220        summary.blank_lines += record.effective_counts.blank_lines;
1221        summary.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
1222        summary.functions += record.raw_line_categories.functions;
1223        summary.classes += record.raw_line_categories.classes;
1224        summary.variables += record.raw_line_categories.variables;
1225        summary.imports += record.raw_line_categories.imports;
1226        summary.test_count += record.raw_line_categories.test_count;
1227        summary.test_assertion_count += record.raw_line_categories.test_assertion_count;
1228        summary.test_suite_count += record.raw_line_categories.test_suite_count;
1229        if let Some(cov) = &record.coverage {
1230            summary.coverage_lines_found += u64::from(cov.lines_found);
1231            summary.coverage_lines_hit += u64::from(cov.lines_hit);
1232            summary.coverage_functions_found += u64::from(cov.functions_found);
1233            summary.coverage_functions_hit += u64::from(cov.functions_hit);
1234            summary.coverage_branches_found += u64::from(cov.branches_found);
1235            summary.coverage_branches_hit += u64::from(cov.branches_hit);
1236        }
1237    }
1238
1239    summary
1240}
1241
1242/// Construct a zero-filled `LanguageSummary` for the given language.
1243const fn zeroed_summary(language: Language) -> LanguageSummary {
1244    LanguageSummary {
1245        language,
1246        files: 0,
1247        total_physical_lines: 0,
1248        code_lines: 0,
1249        comment_lines: 0,
1250        blank_lines: 0,
1251        mixed_lines_separate: 0,
1252        functions: 0,
1253        classes: 0,
1254        variables: 0,
1255        imports: 0,
1256        test_count: 0,
1257        test_assertion_count: 0,
1258        test_suite_count: 0,
1259        coverage_lines_found: 0,
1260        coverage_lines_hit: 0,
1261        coverage_functions_found: 0,
1262        coverage_functions_hit: 0,
1263        coverage_branches_found: 0,
1264        coverage_branches_hit: 0,
1265    }
1266}
1267
1268/// Accumulate all per-file counters from `record` into an existing `LanguageSummary`.
1269fn accumulate_record_into_summary(entry: &mut LanguageSummary, record: &FileRecord) {
1270    entry.files += 1;
1271    let r = &record.raw_line_categories;
1272    entry.total_physical_lines += r.total_physical_lines;
1273    entry.code_lines += record.effective_counts.code_lines;
1274    entry.comment_lines += record.effective_counts.comment_lines;
1275    entry.blank_lines += record.effective_counts.blank_lines;
1276    entry.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
1277    entry.functions += r.functions;
1278    entry.classes += r.classes;
1279    entry.variables += r.variables;
1280    entry.imports += r.imports;
1281    entry.test_count += r.test_count;
1282    entry.test_assertion_count += r.test_assertion_count;
1283    entry.test_suite_count += r.test_suite_count;
1284    if let Some(cov) = &record.coverage {
1285        entry.coverage_lines_found += u64::from(cov.lines_found);
1286        entry.coverage_lines_hit += u64::from(cov.lines_hit);
1287        entry.coverage_functions_found += u64::from(cov.functions_found);
1288        entry.coverage_functions_hit += u64::from(cov.functions_hit);
1289        entry.coverage_branches_found += u64::from(cov.branches_found);
1290        entry.coverage_branches_hit += u64::from(cov.branches_hit);
1291    }
1292}
1293
1294fn build_language_summaries(analyzed: &[FileRecord]) -> Vec<LanguageSummary> {
1295    let mut by_language: BTreeMap<Language, LanguageSummary> = BTreeMap::new();
1296    for record in analyzed {
1297        let Some(language) = record.language else {
1298            continue;
1299        };
1300        let entry = by_language
1301            .entry(language)
1302            .or_insert_with(|| zeroed_summary(language));
1303        accumulate_record_into_summary(entry, record);
1304    }
1305    by_language.into_values().collect()
1306}
1307
1308fn skipped_record(
1309    path: &Path,
1310    root: &Path,
1311    size_bytes: u64,
1312    status: FileStatus,
1313    warnings: Vec<String>,
1314) -> FileRecord {
1315    FileRecord {
1316        path: path_to_string(path),
1317        relative_path: relative_path_string(path, root),
1318        language: None,
1319        size_bytes,
1320        detected_encoding: None,
1321        raw_line_categories: RawLineCounts::default(),
1322        effective_counts: EffectiveCounts::default(),
1323        status,
1324        warnings,
1325        generated: false,
1326        minified: false,
1327        vendor: false,
1328        parse_mode: None,
1329        submodule: None,
1330        coverage: None,
1331    }
1332}
1333
/// Render `path` relative to `root` with forward slashes.
///
/// When `path` does not start with `root`, the full `path` is rendered instead.
fn relative_path_string(path: &Path, root: &Path) -> String {
    let relative = match path.strip_prefix(root) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };
    let lossy = relative.to_string_lossy();
    lossy.replace('\\', "/")
}
1340
/// Render `path` as a (lossy) string with every backslash replaced by a forward slash.
fn path_to_string(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
1344
/// Parse `.gitmodules` in `root` and return `(name, relative_path)` for each submodule found.
///
/// Returns an empty list when the file is missing, is not a regular file, or cannot be
/// read as UTF-8 text. A submodule is reported only when its section has both a
/// `[submodule "<name>"]` header and a `path = <value>` entry; if a section has several
/// `path` entries, the last one wins.
///
/// Lines are matched after trimming surrounding whitespace:
/// * a header must start with `[submodule "` and end with `"]` without the two overlapping,
///   so a malformed `[submodule "]` is ignored rather than sliced out of bounds;
/// * only the exact key `path` is accepted — keys that merely begin with `path`
///   (for example `pathspec`) are ignored.
#[must_use]
pub fn detect_submodules(root: &Path) -> Vec<(String, PathBuf)> {
    let gitmodules = root.join(".gitmodules");
    if !gitmodules.is_file() {
        return Vec::new();
    }
    let Ok(content) = fs::read_to_string(&gitmodules) else {
        return Vec::new();
    };

    let mut result = Vec::new();
    let mut current_name: Option<String> = None;
    let mut current_path: Option<PathBuf> = None;

    for line in content.lines() {
        let trimmed = line.trim();

        // Stripping prefix then suffix guarantees the quotes do not overlap.
        let header = trimmed
            .strip_prefix("[submodule \"")
            .and_then(|rest| rest.strip_suffix("\"]"));

        if let Some(section_name) = header {
            // A new section starts: flush the previous one if it was complete.
            if let (Some(name), Some(path)) = (current_name.take(), current_path.take()) {
                result.push((name, path));
            }
            current_name = Some(section_name.to_string());
        } else if let Some(value) = trimmed
            .strip_prefix("path")
            .and_then(|rest| rest.trim_start().strip_prefix('='))
        {
            // Only whitespace may separate the key `path` from `=`.
            current_path = Some(PathBuf::from(value.trim()));
        }
    }
    if let (Some(name), Some(path)) = (current_name, current_path) {
        result.push((name, path));
    }

    result
}
1381
1382fn build_submodule_summaries(
1383    analyzed: &[FileRecord],
1384    submodules: &[(String, PathBuf)],
1385) -> Vec<SubmoduleSummary> {
1386    submodules
1387        .iter()
1388        .map(|(name, path)| {
1389            let files: Vec<&FileRecord> = analyzed
1390                .iter()
1391                .filter(|f| f.submodule.as_deref() == Some(name.as_str()))
1392                .collect();
1393
1394            let files_analyzed = files.len() as u64;
1395            let total_physical_lines = files
1396                .iter()
1397                .map(|f| f.raw_line_categories.total_physical_lines)
1398                .sum();
1399            let code_lines = files.iter().map(|f| f.effective_counts.code_lines).sum();
1400            let comment_lines = files.iter().map(|f| f.effective_counts.comment_lines).sum();
1401            let blank_lines = files.iter().map(|f| f.effective_counts.blank_lines).sum();
1402            let language_summaries = build_language_summaries_from_slice(&files);
1403
1404            SubmoduleSummary {
1405                name: name.clone(),
1406                relative_path: path.to_string_lossy().replace('\\', "/"),
1407                files_analyzed,
1408                total_physical_lines,
1409                code_lines,
1410                comment_lines,
1411                blank_lines,
1412                language_summaries,
1413            }
1414        })
1415        .filter(|s| s.files_analyzed > 0)
1416        .collect()
1417}
1418
1419fn build_language_summaries_from_slice(files: &[&FileRecord]) -> Vec<LanguageSummary> {
1420    let mut map: BTreeMap<String, LanguageSummary> = BTreeMap::new();
1421    for file in files {
1422        let Some(lang) = file.language else { continue };
1423        let entry = map
1424            .entry(lang.display_name().to_string())
1425            .or_insert_with(|| zeroed_summary(lang));
1426        accumulate_record_into_summary(entry, file);
1427    }
1428    map.into_values().collect()
1429}
1430
/// Report whether the final component of `path` is valid UTF-8 and equals `expected`.
fn file_name_eq(path: &Path, expected: &str) -> bool {
    let final_component = path.file_name().and_then(|name| name.to_str());
    final_component == Some(expected)
}
1436
/// Report whether any UTF-8 component of `path` is exactly equal to one of `excluded_dirs`.
///
/// Every component is considered, including the final one.
fn is_excluded_dir_path(path: &Path, excluded_dirs: &[String]) -> bool {
    for component in path.components() {
        let Some(part) = component.as_os_str().to_str() else {
            continue;
        };
        if excluded_dirs.iter().any(|excluded| excluded.as_str() == part) {
            return true;
        }
    }
    false
}
1445
/// Report whether any UTF-8 component of `path` is exactly `vendor`, `node_modules`
/// or `packages`.
fn is_vendor_path(path: &Path) -> bool {
    const VENDOR_DIRS: [&str; 3] = ["vendor", "node_modules", "packages"];

    for component in path.components() {
        if let Some(part) = component.as_os_str().to_str() {
            if VENDOR_DIRS.contains(&part) {
                return true;
            }
        }
    }
    false
}
1454
/// Report whether the file name of `path` is one of the recognised dependency lockfiles.
///
/// The comparison is exact and case-sensitive.
fn is_known_lockfile(path: &Path) -> bool {
    const LOCKFILES: [&str; 7] = [
        "Cargo.lock",
        "package-lock.json",
        "yarn.lock",
        "pnpm-lock.yaml",
        "Pipfile.lock",
        "poetry.lock",
        "composer.lock",
    ];

    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => LOCKFILES.contains(&name),
        None => false,
    }
}
1471
1472fn looks_generated(path: &Path, bytes: &[u8]) -> bool {
1473    let file_name = path
1474        .file_name()
1475        .and_then(|name| name.to_str())
1476        .unwrap_or_default();
1477    if file_name.contains(".generated.") || file_name.contains(".g.") {
1478        return true;
1479    }
1480
1481    let sample = String::from_utf8_lossy(&bytes[..bytes.len().min(GENERATED_SAMPLE_BYTES)])
1482        .to_ascii_lowercase();
1483    sample.contains("@generated") || sample.contains("generated by")
1484}
1485
1486fn looks_minified(path: &Path, bytes: &[u8]) -> bool {
1487    let file_name = path
1488        .file_name()
1489        .and_then(|name| name.to_str())
1490        .unwrap_or_default();
1491    if file_name.contains(".min.") {
1492        return true;
1493    }
1494
1495    let sample = String::from_utf8_lossy(&bytes[..bytes.len().min(MINIFIED_SAMPLE_BYTES)]);
1496    let longest_line = sample.lines().map(str::len).max().unwrap_or(0);
1497    let whitespace = sample.chars().filter(|c| c.is_whitespace()).count();
1498    longest_line > MINIFIED_LINE_THRESHOLD && whitespace * 100 < sample.len().max(1)
1499}
1500
1501fn is_binary(bytes: &[u8]) -> bool {
1502    if bytes.starts_with(&[0xEF, 0xBB, 0xBF])
1503        || bytes.starts_with(&[0xFF, 0xFE])
1504        || bytes.starts_with(&[0xFE, 0xFF])
1505    {
1506        return false;
1507    }
1508
1509    let sample = &bytes[..bytes.len().min(BINARY_SAMPLE_BYTES)];
1510    sample.contains(&0)
1511}
1512
1513/// Decode a BOM-stripped UTF-16 byte slice using the given encoding.
1514/// Returns `(text, encoding_label, warnings)`.
1515fn decode_utf16_bom(
1516    bom_stripped: &[u8],
1517    encoding: &'static encoding_rs::Encoding,
1518    label: &str,
1519) -> (String, String, Vec<String>) {
1520    let (cow, _, had_errors) = encoding.decode(bom_stripped);
1521    let mut warnings = Vec::new();
1522    if had_errors {
1523        warnings.push(format!("{label} decode contained replacement characters"));
1524    }
1525    (cow.into_owned(), label.into(), warnings)
1526}
1527
1528fn decode_bytes(bytes: &[u8]) -> std::result::Result<(String, String, Vec<String>), String> {
1529    if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
1530        let text = String::from_utf8(bytes[3..].to_vec()).map_err(|err| err.to_string())?;
1531        return Ok((text, "utf-8-bom".into(), vec![]));
1532    }
1533    if bytes.starts_with(&[0xFF, 0xFE]) {
1534        return Ok(decode_utf16_bom(&bytes[2..], UTF_16LE, "utf-16le"));
1535    }
1536    if bytes.starts_with(&[0xFE, 0xFF]) {
1537        return Ok(decode_utf16_bom(&bytes[2..], UTF_16BE, "utf-16be"));
1538    }
1539
1540    // Multiple statements in the else branch make map_or_else awkward here.
1541    #[allow(clippy::option_if_let_else)]
1542    if let Ok(text) = String::from_utf8(bytes.to_vec()) {
1543        Ok((text, "utf-8".into(), vec![]))
1544    } else {
1545        let (cow, _, had_errors) = WINDOWS_1252.decode(bytes);
1546        let mut warnings = vec!["decoded using windows-1252 fallback".into()];
1547        if had_errors {
1548            warnings.push("fallback decode contained replacement characters".into());
1549        }
1550        Ok((cow.into_owned(), "windows-1252".into(), warnings))
1551    }
1552}
1553
1554fn compile_globset(patterns: &[String]) -> Result<Option<GlobSet>> {
1555    if patterns.is_empty() {
1556        return Ok(None);
1557    }
1558
1559    let mut builder = GlobSetBuilder::new();
1560    for pattern in patterns {
1561        builder
1562            .add(Glob::new(pattern).with_context(|| format!("invalid glob pattern: {pattern}"))?);
1563    }
1564    Ok(Some(
1565        builder.build().context("failed to compile glob filters")?,
1566    ))
1567}
1568
1569fn parse_enabled_languages(enabled: &[String]) -> Result<Option<BTreeSet<Language>>> {
1570    if enabled.is_empty() {
1571        return Ok(None);
1572    }
1573
1574    let supported = supported_languages();
1575    let mut set = BTreeSet::new();
1576    for name in enabled {
1577        let language = Language::from_name(name)
1578            .with_context(|| format!("unsupported language in config: {name}"))?;
1579        if !supported.contains(&language) {
1580            anyhow::bail!("language {name} is not supported in this build");
1581        }
1582        set.insert(language);
1583    }
1584    Ok(Some(set))
1585}
1586
1587/// # Errors
1588///
1589/// Returns an error if serialization fails or the output file cannot be written.
1590pub fn write_json(run: &AnalysisRun, output_path: &Path) -> Result<()> {
1591    let json = serde_json::to_string_pretty(run).context("failed to serialize analysis run")?;
1592    fs::write(output_path, json)
1593        .with_context(|| format!("failed to write JSON output to {}", output_path.display()))
1594}
1595
1596/// # Errors
1597///
1598/// Returns an error if the file cannot be read or the JSON cannot be parsed.
1599pub fn read_json(path: &Path) -> Result<AnalysisRun> {
1600    let contents = fs::read_to_string(path)
1601        .with_context(|| format!("failed to read result file {}", path.display()))?;
1602    serde_json::from_str(&contents)
1603        .with_context(|| format!("failed to parse JSON result {}", path.display()))
1604}
1605
#[cfg(test)]
mod tests {
    use super::*;

    /// With `CodeOnly`, mixed lines count as code; docstrings count as comments here
    /// because `python_docstrings_as_comments` is `true`.
    #[test]
    fn effective_counts_respect_code_only_policy() {
        let mut raw = RawLineCounts::default();
        raw.code_only_lines = 2;
        raw.single_comment_only_lines = 1;
        raw.mixed_code_single_comment_lines = 3;
        raw.docstring_comment_lines = 2;

        let effective = compute_effective_counts(&raw, MixedLinePolicy::CodeOnly, true, true);

        assert_eq!(effective.code_lines, 5);
        assert_eq!(effective.comment_lines, 3);
    }

    /// With `SeparateMixedCategory`, both kinds of mixed line land in their own bucket and
    /// leave the code and comment totals untouched.
    #[test]
    fn effective_counts_can_separate_mixed() {
        let mut raw = RawLineCounts::default();
        raw.mixed_code_single_comment_lines = 2;
        raw.mixed_code_multi_comment_lines = 1;

        let effective =
            compute_effective_counts(&raw, MixedLinePolicy::SeparateMixedCategory, true, true);

        assert_eq!(effective.mixed_lines_separate, 3);
        assert_eq!(effective.code_lines, 0);
        assert_eq!(effective.comment_lines, 0);
    }

    /// Byte 0x96 is not valid UTF-8 on its own, so decoding falls back to windows-1252,
    /// where it is an en dash, and a warning is recorded.
    #[test]
    fn windows_1252_fallback_decodes() {
        let input: [u8; 7] = [0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x96, 0x57];

        let (text, encoding, warnings) = decode_bytes(&input).unwrap();

        assert_eq!(encoding, "windows-1252");
        assert!(text.contains('–'));
        assert!(!warnings.is_empty());
    }
}