sloc_core/
lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright (C) 2026 Nima Shafie <nimzshafie@gmail.com>
3#![allow(clippy::multiple_crate_versions)]
4
5pub mod baseline;
6pub mod coverage;
7pub mod delta;
8pub mod history;
9pub use baseline::{check_against_baseline, resolve_baselines_path, BaselineEntry, BaselineStore};
10pub use coverage::{aggregate_line_coverage, lookup_coverage, parse_lcov, FileCoverage};
11pub use delta::{
12    compute_delta, compute_multi_delta, FileChangeStatus, FileDelta, MultiFileDelta,
13    MultiScanComparison, MultiScanPoint, ScanComparison, SummaryDelta,
14};
15pub use history::{
16    CleanupPolicy, CleanupPolicyStore, RegistryEntry, ScanRegistry, ScanSummarySnapshot,
17    WatchedDirsStore,
18};
19
20use std::collections::{BTreeMap, BTreeSet, HashSet};
21use std::fs;
22use std::path::{Path, PathBuf};
23use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
24use std::sync::Arc;
25
26use anyhow::{Context, Result};
27use chrono::{DateTime, Utc};
28use encoding_rs::{UTF_16BE, UTF_16LE, WINDOWS_1252};
29use globset::{Glob, GlobSet, GlobSetBuilder};
30use ignore::WalkBuilder;
31use serde::{Deserialize, Serialize};
32use uuid::Uuid;
33
34use sloc_config::{
35    AppConfig, BinaryFileBehavior, BlankInBlockCommentPolicy, ContinuationLinePolicy,
36    FailureBehavior, MixedLinePolicy,
37};
38use sloc_languages::style::IndentStyle;
39use sloc_languages::{
40    analyze_text, detect_language, supported_languages, AnalysisOptions, Language, ParseMode,
41    RawLineCounts, StyleAnalysis, StyleLangScope,
42};
43
// ── Parallelism limits, detection sample sizes and thresholds ────────────────

/// Maximum number of worker threads used for parallel file analysis.
const MAX_ANALYSIS_THREADS: usize = 16;
/// Fallback thread count when `available_parallelism` is unavailable.
const DEFAULT_ANALYSIS_THREADS: usize = 4;
/// Byte sample used to detect `@generated` markers (1 KiB).
const GENERATED_SAMPLE_BYTES: usize = 1024;
/// Byte sample used to detect minified files via line-length heuristic (4 KiB).
const MINIFIED_SAMPLE_BYTES: usize = 4096;
/// Longest line length above which a file is considered minified.
const MINIFIED_LINE_THRESHOLD: usize = 2000;
/// Byte sample used to detect binary files via null-byte scan (8 KiB).
const BINARY_SAMPLE_BYTES: usize = 8192;
58
/// Atomics shared between `analyze()` and the caller so the caller can poll scan progress.
///
/// Both counters are `Arc`-wrapped, so the caller can keep its own clones of the
/// handles and read them while the scan is running.
pub struct ProgressCounters {
    /// Number of candidate files processed so far (incremented per file, across all threads).
    pub files_done: Arc<AtomicUsize>,
    /// Total candidate files discovered. `walk_root` adds a root's candidate
    /// count to this value (via `fetch_add`) before that root's parallel
    /// analysis begins, so it is accumulated rather than overwritten.
    pub files_total: Arc<AtomicUsize>,
}
66
/// Three-way outcome for metadata-level policy checks.
///
/// Private to this module; the `Skip` payload is boxed, so the variant stores a
/// pointer rather than a whole `FileRecord` inline.
enum MetadataPolicyOutcome {
    /// Skip this file — include the record in output.
    Skip(Box<FileRecord>),
    /// Exclude this file entirely — no record in output (include-glob miss).
    Exclude,
    /// Continue to content checks.
    Continue,
}
76
/// Outcome recorded for a file in `FileRecord::status`.
///
/// Serialized in `snake_case`, e.g. `AnalyzedExact` becomes `"analyzed_exact"`.
// NOTE(review): the variant names suggest two "analyzed" outcomes, four
// "skipped" reasons and one internal-error outcome; the exact conditions that
// select each variant are not visible in this part of the file.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FileStatus {
    AnalyzedExact,
    AnalyzedBestEffort,
    SkippedBinary,
    SkippedDecodeError,
    SkippedUnsupported,
    SkippedByPolicy,
    ErrorInternal,
}
88
/// COCOMO I (Basic) project mode — determines the a/b/c/d exponent coefficients.
///
/// Serialized in `snake_case` (`"organic"`, `"semi_detached"`, `"embedded"`).
/// `Organic` is the `Default` variant.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum CocomoMode {
    /// Small team, familiar domain. Effort = 2.4 × KSLOC^1.05.
    #[default]
    Organic,
    /// Mixed constraints. Effort = 3.0 × KSLOC^1.12.
    SemiDetached,
    /// Tight hardware/OS constraints. Effort = 3.6 × KSLOC^1.20.
    Embedded,
}
101
/// COCOMO I (Basic) cost-estimation result derived from total code SLOC.
///
/// Stored in `AnalysisRun::cocomo`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CocomoEstimate {
    /// Project mode whose coefficients were used for this estimate.
    pub mode: CocomoMode,
    /// Input: code lines in thousands (KSLOC).
    pub ksloc: f64,
    /// Estimated development effort in person-months.
    pub effort_person_months: f64,
    /// Estimated schedule duration in months.
    pub duration_months: f64,
    /// Average team size (effort ÷ duration).
    pub avg_staff: f64,
}
115
/// Per-file line counts stored in `FileRecord::effective_counts`, next to the
/// raw categories kept in `FileRecord::raw_line_categories`.
///
/// `Default` yields all-zero counts.
// NOTE(review): presumably these are the raw counts after the configured
// mixed-line / continuation / block-comment policies have been applied —
// confirm against the code that fills this struct.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct EffectiveCounts {
    pub code_lines: u64,
    pub comment_lines: u64,
    pub blank_lines: u64,
    pub mixed_lines_separate: u64,
}
123
/// Identifies the tool and the individual run; stored in `AnalysisRun::tool`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolMetadata {
    pub name: String,
    pub version: String,
    /// Identifier of this run, kept as a string.
    // NOTE(review): presumably a UUID (the file imports `uuid::Uuid`) — confirm.
    pub run_id: String,
    /// UTC timestamp associated with the run.
    pub timestamp_utc: DateTime<Utc>,
}
131
/// Description of the machine and user that ran the scan; stored in
/// `AnalysisRun::environment`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnvironmentMetadata {
    pub operating_system: String,
    pub architecture: String,
    pub runtime_mode: String,
    // NOTE(review): presumably filled from `get_current_username()` — confirm.
    pub initiator_username: String,
    // NOTE(review): presumably filled from `get_hostname()` — confirm.
    pub initiator_hostname: String,
    /// CI system name when the scan runs inside a known CI environment (Jenkins,
    /// GitHub Actions, GitLab CI, …). `None` for interactive / local runs.
    /// Omitted from serialized output when `None`; absent input deserializes to `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ci_name: Option<String>,
}
144
/// Run-wide totals; stored in `AnalysisRun::summary_totals`.
///
/// Every field marked `#[serde(default)]` deserializes to zero (or `None`) when
/// it is missing from the input, and `Default` yields an all-zero value.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SummaryTotals {
    pub files_considered: u64,
    pub files_analyzed: u64,
    pub files_skipped: u64,
    pub total_physical_lines: u64,
    pub code_lines: u64,
    pub comment_lines: u64,
    pub blank_lines: u64,
    pub mixed_lines_separate: u64,
    #[serde(default)]
    pub functions: u64,
    #[serde(default)]
    pub classes: u64,
    #[serde(default)]
    pub variables: u64,
    #[serde(default)]
    pub imports: u64,
    #[serde(default)]
    pub test_count: u64,
    /// Lexically detected test assertion call lines across all analyzed files.
    #[serde(default)]
    pub test_assertion_count: u64,
    /// Lexically detected test suite / fixture / group declaration lines across all analyzed files.
    #[serde(default)]
    pub test_suite_count: u64,
    /// Aggregated from LCOV data when provided (applies to all six `coverage_*` fields).
    #[serde(default)]
    pub coverage_lines_found: u64,
    #[serde(default)]
    pub coverage_lines_hit: u64,
    #[serde(default)]
    pub coverage_functions_found: u64,
    #[serde(default)]
    pub coverage_functions_hit: u64,
    #[serde(default)]
    pub coverage_branches_found: u64,
    #[serde(default)]
    pub coverage_branches_hit: u64,
    /// Sum of per-file cyclomatic complexity scores across all analyzed files.
    #[serde(default)]
    pub cyclomatic_complexity: u64,
    /// Total logical SLOC across files that support it; `None` if no files produced LSLOC.
    /// Omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub lsloc: Option<u64>,
}
191
/// Totals for a single language; used in `AnalysisRun::totals_by_language` and
/// `SubmoduleSummary::language_summaries`.
///
/// The counters carry the same names as those in `SummaryTotals`. Fields marked
/// `#[serde(default)]` deserialize to zero (or `None`) when absent from the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageSummary {
    /// Language these totals belong to.
    pub language: Language,
    pub files: u64,
    pub total_physical_lines: u64,
    pub code_lines: u64,
    pub comment_lines: u64,
    pub blank_lines: u64,
    pub mixed_lines_separate: u64,
    #[serde(default)]
    pub functions: u64,
    #[serde(default)]
    pub classes: u64,
    #[serde(default)]
    pub variables: u64,
    #[serde(default)]
    pub imports: u64,
    #[serde(default)]
    pub test_count: u64,
    #[serde(default)]
    pub test_assertion_count: u64,
    #[serde(default)]
    pub test_suite_count: u64,
    #[serde(default)]
    pub coverage_lines_found: u64,
    #[serde(default)]
    pub coverage_lines_hit: u64,
    #[serde(default)]
    pub coverage_functions_found: u64,
    #[serde(default)]
    pub coverage_functions_hit: u64,
    #[serde(default)]
    pub coverage_branches_found: u64,
    #[serde(default)]
    pub coverage_branches_hit: u64,
    #[serde(default)]
    pub cyclomatic_complexity: u64,
    /// Omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub lsloc: Option<u64>,
}
232
/// Record for one file; used for both `AnalysisRun::per_file_records` and
/// `AnalysisRun::skipped_file_records`.
///
/// Optional fields tagged `skip_serializing_if = "Option::is_none"` are left out
/// of serialized output when they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileRecord {
    pub path: String,
    pub relative_path: String,
    /// Detected language, if any.
    pub language: Option<Language>,
    pub size_bytes: u64,
    pub detected_encoding: Option<String>,
    pub raw_line_categories: RawLineCounts,
    pub effective_counts: EffectiveCounts,
    /// Outcome for this file (analyzed / skipped / error).
    pub status: FileStatus,
    pub warnings: Vec<String>,
    pub generated: bool,
    pub minified: bool,
    pub vendor: bool,
    pub parse_mode: Option<ParseMode>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub submodule: Option<String>,
    /// Line/function/branch coverage from an external LCOV file, when provided.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub coverage: Option<FileCoverage>,
    /// Lexical style-guide adherence analysis; `None` for unsupported languages.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub style_analysis: Option<StyleAnalysis>,
    /// Cyclomatic complexity approximation for this file (sum of branch decision keywords).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cyclomatic_complexity: Option<u32>,
    /// Logical SLOC estimate; `None` when the language does not support lexical LSLOC.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub lsloc: Option<u32>,
    /// SHA-256 (first 8 bytes as u64) of raw file bytes — used for duplicate detection.
    /// Not serialized; consumed in-process during `assemble_run`.
    /// Because of `#[serde(skip)]`, a deserialized record always carries `0` here.
    #[serde(skip)]
    pub content_hash: u64,
}
267
/// Per-language-family style aggregation within a `StyleSummary`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageStyleGroup {
    /// Display label, e.g. `"C / C++"`, `"Python"`, `"JavaScript"`.
    pub language_family: String,
    /// Number of files in this group.
    pub files_count: u32,
    /// Name of the guide with the highest average adherence.
    pub dominant_guide: String,
    /// Average adherence of the dominant guide (0–100).
    pub dominant_score_pct: u8,
    /// Most common indent style across the group.
    pub common_indent_style: String,
    /// Average guide adherence scores (guide name, 0–100) sorted descending.
    pub guide_avg_scores: Vec<(String, u8)>,
    /// Percentage of files (0–100) where ≤ 5 % of lines exceed the configured column threshold.
    // NOTE(review): `StyleSummary::line80_compliant_pct` is documented as a
    // legacy value that always uses 80 columns. Confirm which threshold this
    // per-group field really uses and align the two descriptions.
    pub line80_compliant_pct: u8,
    /// Same as `line80_compliant_pct` but named for the actual configured threshold.
    pub line_col_compliant_pct: u8,
}
288
/// Aggregate multi-language style-guide adherence across all analysed files.
///
/// Stored in `AnalysisRun::style_summary`; also reachable through the
/// `CppStyleSummary` alias.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StyleSummary {
    /// Total files for which style data was produced.
    pub files_analyzed: u32,
    /// Most common indent style across *all* analysed files.
    pub common_indent_style: String,
    /// Percentage of all analysed files (0–100) with ≤ 5 % of lines over 80 chars (legacy, always 80).
    pub line80_compliant_pct: u8,
    /// Percentage of all analysed files (0–100) with ≤ 5 % of lines over `col_threshold` chars.
    pub line_col_compliant_pct: u8,
    /// Column-width threshold used for `line_col_compliant_pct` (from `analysis.style_col_threshold`).
    pub col_threshold: u16,
    /// Per-language-family breakdown, sorted by `files_count` descending.
    pub by_language: Vec<LanguageStyleGroup>,
}
305
/// Backward-compatible alias kept so that `sloc-report` and `sloc-web` can migrate
/// incrementally without a breaking change on the same release.
///
/// This is a plain `type` alias, so `CppStyleSummary` and `StyleSummary` name the
/// same type and are interchangeable.
pub type CppStyleSummary = StyleSummary;
309
/// Per-submodule aggregated stats produced when `submodule_breakdown` is enabled.
///
/// All `git_*` fields are optional, are left out of serialized output when
/// `None`, and deserialize to `None` when absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SubmoduleSummary {
    pub name: String,
    pub relative_path: String,
    pub files_analyzed: u64,
    pub total_physical_lines: u64,
    pub code_lines: u64,
    pub comment_lines: u64,
    pub blank_lines: u64,
    /// Per-language totals for this submodule.
    pub language_summaries: Vec<LanguageSummary>,
    /// Short commit SHA (7 chars) of the submodule's own HEAD at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_short: Option<String>,
    /// Full commit SHA of the submodule's own HEAD at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_long: Option<String>,
    /// Branch name active in the submodule at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_branch: Option<String>,
    /// Author of the submodule's most recent commit at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_author: Option<String>,
    /// ISO 8601 author-date of the submodule's most recent commit.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_date: Option<String>,
    /// URL of the submodule's `origin` remote as recorded in its `.git/config`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_remote_url: Option<String>,
}
340
/// Top-level, serializable result of one scan: tool and environment metadata,
/// the configuration used, totals, per-file records and optional git / style /
/// COCOMO / duplication data.
///
/// Fields tagged `#[serde(default)]` may be missing from deserialized input;
/// those also tagged `skip_serializing_if` are omitted from output when empty
/// or `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnalysisRun {
    pub tool: ToolMetadata,
    pub environment: EnvironmentMetadata,
    /// Configuration the run was executed with.
    pub effective_configuration: AppConfig,
    pub input_roots: Vec<String>,
    pub summary_totals: SummaryTotals,
    pub totals_by_language: Vec<LanguageSummary>,
    pub per_file_records: Vec<FileRecord>,
    pub skipped_file_records: Vec<FileRecord>,
    pub warnings: Vec<String>,
    /// Non-empty only when `discovery.submodule_breakdown` is enabled.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub submodule_summaries: Vec<SubmoduleSummary>,
    /// Short git commit SHA (7 chars) at scan time, if the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_short: Option<String>,
    /// Full git commit SHA at scan time, if the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_long: Option<String>,
    /// Git branch active at scan time, if the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_branch: Option<String>,
    /// Author of the last git commit at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_author: Option<String>,
    /// Comma-separated git tags pointing at HEAD at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_tags: Option<String>,
    /// Nearest ancestor release tag (output of `git describe --tags --abbrev=0`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_nearest_tag: Option<String>,
    /// ISO 8601 author-date of the last git commit at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_date: Option<String>,
    /// URL of the `origin` remote as recorded in `.git/config` at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_remote_url: Option<String>,
    /// Multi-language style-guide adherence; `None` when no supported files were analysed.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub style_summary: Option<StyleSummary>,
    /// COCOMO I (Basic) effort/schedule estimate derived from total code SLOC.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cocomo: Option<CocomoEstimate>,
    /// Unique Lines of Code: count of distinct non-blank code lines across all analyzed files.
    #[serde(default)]
    pub uloc: u64,
    /// `DRYness` percentage: `uloc / total_code_lines × 100`. `None` when code lines = 0.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub dryness_pct: Option<f32>,
    /// Groups of files with identical content (relative paths). Only non-singleton groups included.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub duplicate_groups: Vec<Vec<String>>,
    /// Number of duplicate files excluded from SLOC totals (when `exclude_duplicates` is set).
    #[serde(default)]
    pub duplicates_excluded: usize,
}
398
/// Git metadata gathered by `detect_git_for_run`.
///
/// Every field is optional; `GitInfo::default()` has all of them set to `None`.
#[derive(Default)]
struct GitInfo {
    /// First 7 characters of `commit_long`.
    commit_short: Option<String>,
    /// Commit SHA taken from a detached `HEAD` or resolved from the ref it names.
    commit_long: Option<String>,
    /// Branch from a `refs/heads/…` HEAD, otherwise from CI environment variables.
    branch: Option<String>,
    /// Output of `git log -1 --format=%an HEAD`.
    author: Option<String>,
    /// Output lines of `git tag --points-at HEAD`, joined with `", "`.
    tags: Option<String>,
    /// Output of `git describe --tags --abbrev=0 HEAD`.
    nearest_tag: Option<String>,
    /// Output of `git log -1 --format=%aI HEAD`.
    commit_date: Option<String>,
    /// `url` of the `[remote "origin"]` section in the git directory's `config`.
    remote_url: Option<String>,
}
410
411/// Locate the `.git` directory by walking up from `start`.
412/// Handles plain repos, worktrees (`.git` is a file with `gitdir:` pointer), and
413/// submodules. Returns `None` if no git repo is found.
414fn find_git_dir(start: &Path) -> Option<PathBuf> {
415    let mut current = Some(start);
416    while let Some(dir) = current {
417        let candidate = dir.join(".git");
418        if candidate.is_dir() {
419            return Some(candidate);
420        }
421        if candidate.is_file() {
422            if let Some(resolved) = resolve_git_file_pointer(&candidate, dir) {
423                return Some(resolved);
424            }
425        }
426        current = dir.parent();
427    }
428    None
429}
430
/// Follow a `.git` pointer *file* (as used by worktrees and submodules).
///
/// The file must contain `gitdir: <path>` once surrounding whitespace is
/// trimmed. A relative `<path>` is interpreted relative to `base_dir`.
///
/// Returns the target directory, or `None` when the file cannot be read, does
/// not start with `gitdir: `, or the target is not an existing directory.
fn resolve_git_file_pointer(file: &Path, base_dir: &Path) -> Option<PathBuf> {
    let text = fs::read_to_string(file).ok()?;
    let target = text.trim().strip_prefix("gitdir: ")?;

    // Git writes forward slashes; convert them to the platform separator so
    // the path helpers below behave the same on Windows.
    let native = target.replace('/', std::path::MAIN_SEPARATOR_STR);
    let native = Path::new(&native);
    let joined = if native.is_absolute() {
        native.to_path_buf()
    } else {
        base_dir.join(native)
    };

    // Prefer the canonical form (".." and symlinks resolved) but keep the
    // plain joined path when canonicalisation is not possible.
    let located = joined.canonicalize().unwrap_or(joined);
    located.is_dir().then_some(located)
}
455
/// Resolve a git ref name (e.g. `refs/heads/main`) to a commit SHA string.
///
/// Lookup order:
/// 1. the loose ref file `<git_dir>/<refname>`;
/// 2. the `packed-refs` file in `git_dir`.
///
/// A value is accepted only when it consists solely of hex digits and is at
/// least 40 characters long. Lines in `packed-refs` that are comments (`#`),
/// peeled-tag entries (`^`), blank, or otherwise malformed are skipped rather
/// than aborting the search.
///
/// Returns `None` when the ref cannot be found in either place.
fn resolve_ref(git_dir: &Path, refname: &str) -> Option<String> {
    fn looks_like_sha(s: &str) -> bool {
        s.len() >= 40 && s.chars().all(|c| c.is_ascii_hexdigit())
    }

    // Join each '/'-separated component individually so the loose-ref path
    // uses the platform's separator without manual replacement.
    let ref_path = refname
        .split('/')
        .fold(git_dir.to_path_buf(), |p, c| p.join(c));
    // A missing or unreadable file just falls through to packed-refs.
    let loose = fs::read_to_string(&ref_path)
        .ok()
        .map(|s| s.trim().to_string())
        .filter(|s| looks_like_sha(s));
    if loose.is_some() {
        return loose;
    }

    // Packed refs: each entry is "<sha> <refname>". `str::lines` copes with
    // both "\n" and "\r\n" endings.
    let packed = fs::read_to_string(git_dir.join("packed-refs")).ok()?;
    packed
        .lines()
        .filter(|line| !line.starts_with('#') && !line.starts_with('^'))
        // Lines without a space (blank or malformed) yield `None` here and
        // are skipped instead of ending the whole lookup.
        .filter_map(|line| line.split_once(' '))
        .find(|(sha, name)| name.trim() == refname && looks_like_sha(sha))
        .map(|(sha, _)| sha.to_string())
}
491
/// Pull the value out of a git-config `url = <value>` line.
///
/// The line must begin with `url`, optionally followed by spaces/tabs, then
/// `=`. The value is returned with surrounding whitespace trimmed; `None` is
/// returned when the line has a different shape or the value is empty.
fn parse_url_line(line: &str) -> Option<&str> {
    let after_key = line.strip_prefix("url")?;
    let after_gap = after_key.trim_start_matches(|c: char| c == ' ' || c == '\t');
    let value = after_gap.strip_prefix('=')?.trim();
    (!value.is_empty()).then_some(value)
}
503
504/// Parse `.git/config` and return the URL of the `origin` remote, if present.
505fn read_git_remote_url(git_dir: &Path) -> Option<String> {
506    let config = fs::read_to_string(git_dir.join("config")).ok()?;
507    let mut in_origin = false;
508    for line in config.lines() {
509        let trimmed = line.trim();
510        if trimmed.starts_with('[') {
511            in_origin = trimmed == r#"[remote "origin"]"#;
512        } else if in_origin {
513            if let Some(url) = parse_url_line(trimmed) {
514                return Some(url.to_owned());
515            }
516        }
517    }
518    None
519}
520
521/// Detect git metadata by reading `.git/` files directly — no `git` executable
522/// needed. Falls back gracefully for detached HEADs, shallow clones, and missing
523/// reflogs.
524fn detect_git_for_run(project_path: &Path) -> GitInfo {
525    // Resolve the CI branch early so it can fill in any gap in git metadata.
526    let ci_branch = ci_branch_from_env();
527
528    let Some(git_dir) = find_git_dir(project_path) else {
529        // No .git directory (e.g. scanning a non-repo path in CI). Use whatever
530        // the CI system tells us about the branch.
531        return GitInfo {
532            branch: ci_branch,
533            ..GitInfo::default()
534        };
535    };
536
537    let head_raw = match fs::read_to_string(git_dir.join("HEAD")) {
538        Ok(s) => s.trim().to_string(),
539        Err(_) => {
540            return GitInfo {
541                branch: ci_branch,
542                ..GitInfo::default()
543            }
544        }
545    };
546
547    let (branch_from_head, commit_long) = head_raw.strip_prefix("ref: ").map_or_else(
548        || {
549            if head_raw.len() >= 40 && head_raw.chars().all(|c| c.is_ascii_hexdigit()) {
550                // Detached HEAD — HEAD file is the commit SHA (common in CI checkouts).
551                (None, Some(head_raw[..40].to_string()))
552            } else {
553                (None, None)
554            }
555        },
556        |refname| {
557            let branch = refname
558                .strip_prefix("refs/heads/")
559                .map(|b| b.trim().to_string());
560            let sha = resolve_ref(&git_dir, refname.trim());
561            (branch, sha)
562        },
563    );
564    // Prefer the branch name derived from the HEAD ref; fall back to the CI
565    // env var (covers detached-HEAD checkouts done by Jenkins, GitHub Actions, etc.).
566    let branch = branch_from_head.or(ci_branch);
567
568    let commit_short = commit_long
569        .as_deref()
570        .map(|s| s.chars().take(7).collect::<String>());
571
572    let author = run_git_cmd(project_path, &["log", "-1", "--format=%an", "HEAD"]);
573    let commit_date = run_git_cmd(project_path, &["log", "-1", "--format=%aI", "HEAD"]);
574    let remote_url = read_git_remote_url(&git_dir);
575
576    // Tags and nearest-tag still require git CLI — try it as a best-effort bonus
577    // but don't block on it. If git isn't available these will simply be None.
578    let tags = run_git_cmd(project_path, &["tag", "--points-at", "HEAD"]).map(|t| {
579        t.lines()
580            .filter(|l| !l.is_empty())
581            .collect::<Vec<_>>()
582            .join(", ")
583    });
584    let nearest_tag = run_git_cmd(project_path, &["describe", "--tags", "--abbrev=0", "HEAD"]);
585
586    GitInfo {
587        commit_short,
588        commit_long,
589        branch,
590        author,
591        tags,
592        nearest_tag,
593        commit_date,
594        remote_url,
595    }
596}
597
/// Run a git command as a best-effort supplemental source.
///
/// `args` are passed to git after `-c safe.directory=*`, with `dir` as the
/// working directory. Returns the trimmed stdout when the command exits
/// successfully with non-empty UTF-8 output, otherwise `None`.
///
/// Several executable locations are tried, but only until one can actually be
/// started: once a git binary has run, its result is final. Falling through to
/// the next candidate is reserved for "executable could not be spawned", so a
/// command that legitimately fails or prints nothing (e.g. no tags at HEAD) is
/// not re-run against every other install location.
fn run_git_cmd(dir: &Path, args: &[&str]) -> Option<String> {
    // Bare name first (works when git is on PATH), then absolute paths for
    // service accounts that run with a stripped PATH. Paths for the "other"
    // platform simply fail to spawn and are skipped.
    const CANDIDATES: &[&str] = &[
        // Works on all platforms when git is on PATH
        "git",
        // Common Linux / macOS install locations
        "/usr/bin/git",
        "/usr/local/bin/git",
        "/opt/homebrew/bin/git",
        // Git for Windows default installation paths
        r"C:\Program Files\Git\cmd\git.exe",
        r"C:\Program Files\Git\bin\git.exe",
        r"C:\Program Files (x86)\Git\cmd\git.exe",
    ];

    for &exe in CANDIDATES {
        let spawned = std::process::Command::new(exe)
            .args(["-c", "safe.directory=*"])
            .args(args)
            .current_dir(dir)
            .output();
        let Ok(output) = spawned else {
            // This candidate could not be started; try the next location.
            continue;
        };
        if !output.status.success() {
            return None;
        }
        return String::from_utf8(output.stdout)
            .ok()
            .map(|s| s.trim().to_string())
            .filter(|s| !s.is_empty());
    }
    None
}
632
/// Return the name of the CI system if the process is running inside one.
///
/// Detection is based on well-known environment variables, checked in this
/// order: Jenkins, GitHub Actions, GitLab CI, CircleCI, Travis CI,
/// Azure DevOps, TeamCity. Returns `None` when none of them match.
///
/// Boolean-style flags are compared case-insensitively against `true`,
/// because Azure Pipelines sets `TF_BUILD` to `True` (capitalised) while the
/// other systems use lowercase `true`.
fn detect_ci_system() -> Option<&'static str> {
    /// Variables whose mere presence indicates Jenkins.
    const JENKINS_VARS: [&str; 3] = ["JENKINS_URL", "JENKINS_HOME", "BUILD_URL"];
    /// `(flag variable, CI name)` pairs; the flag must hold a "true" value.
    const TRUE_FLAGS: [(&str, &str); 5] = [
        ("GITHUB_ACTIONS", "GitHub Actions"),
        ("GITLAB_CI", "GitLab CI"),
        ("CIRCLECI", "CircleCI"),
        ("TRAVIS", "Travis CI"),
        ("TF_BUILD", "Azure DevOps"),
    ];

    let is_set = |key: &str| std::env::var(key).is_ok();
    let is_true = |key: &str| {
        std::env::var(key)
            .map(|value| value.eq_ignore_ascii_case("true"))
            .unwrap_or(false)
    };

    if JENKINS_VARS.iter().any(|&key| is_set(key)) {
        return Some("Jenkins");
    }
    if let Some(&(_, name)) = TRUE_FLAGS.iter().find(|&&(flag, _)| is_true(flag)) {
        return Some(name);
    }
    if is_set("TEAMCITY_VERSION") {
        return Some("TeamCity");
    }
    None
}
660
/// Look up the branch name in well-known CI environment variables.
///
/// Variables are consulted in a fixed order and the first usable value wins.
/// A value is trimmed, then a leading `refs/heads/` (or, failing that, a
/// leading `origin/`) is removed. Empty results and the literal `HEAD` are
/// ignored. Returns `None` when no variable provides a usable name.
fn ci_branch_from_env() -> Option<String> {
    const VARS: [&str; 7] = [
        "BRANCH_NAME",        // Jenkins Pipeline
        "GIT_BRANCH",         // Jenkins Freestyle (may carry "origin/<branch>")
        "GITHUB_REF_NAME",    // GitHub Actions
        "CI_COMMIT_BRANCH",   // GitLab CI
        "CIRCLE_BRANCH",      // CircleCI
        "TRAVIS_BRANCH",      // Travis CI
        "BUILD_SOURCEBRANCH", // Azure DevOps (may carry "refs/heads/<branch>")
    ];

    VARS.iter().find_map(|&name| {
        let raw = std::env::var(name).ok()?;
        let trimmed = raw.trim();
        let branch = match trimmed.strip_prefix("refs/heads/") {
            Some(rest) => rest,
            None => trimmed.strip_prefix("origin/").unwrap_or(trimmed),
        };
        (!branch.is_empty() && branch != "HEAD").then(|| branch.to_string())
    })
}
687
/// Name of the user running the process, taken from the `USERNAME` environment
/// variable, then `USER`; `"unknown"` when neither can be read.
fn get_current_username() -> String {
    for key in ["USERNAME", "USER"].iter() {
        if let Ok(name) = std::env::var(key) {
            return name;
        }
    }
    "unknown".to_string()
}
693
/// Value of environment variable `var`, or `None` when it is unset, not valid
/// Unicode, or the empty string.
fn non_empty_env(var: &str) -> Option<String> {
    std::env::var(var).ok().filter(|value| !value.is_empty())
}
702
/// `true` when any of `JENKINS_URL`, `JENKINS_HOME` or `BUILD_URL` is set in
/// the environment (with any value).
fn is_jenkins_env() -> bool {
    const JENKINS_VARS: [&str; 3] = ["JENKINS_URL", "JENKINS_HOME", "BUILD_URL"];
    JENKINS_VARS.iter().any(|&key| std::env::var(key).is_ok())
}
708
709fn get_hostname() -> String {
710    // In CI environments prefer a human-readable agent/runner identifier over
711    // whatever hostname the container was assigned.
712    if is_jenkins_env() {
713        if let Some(n) = non_empty_env("NODE_NAME") {
714            return n;
715        }
716    }
717    if std::env::var("GITHUB_ACTIONS").as_deref() == Ok("true") {
718        if let Some(r) = non_empty_env("RUNNER_NAME") {
719            return r;
720        }
721    }
722    if std::env::var("GITLAB_CI").as_deref() == Ok("true") {
723        if let Some(r) = non_empty_env("CI_RUNNER_DESCRIPTION") {
724            return r;
725        }
726    }
727    std::env::var("COMPUTERNAME")
728        .or_else(|_| std::env::var("HOSTNAME"))
729        .or_else(|_| std::fs::read_to_string("/etc/hostname").map(|s| s.trim().to_string()))
730        .unwrap_or_else(|_| "unknown".to_string())
731}
732
733/// Walk a single directory root and collect file records into the output vectors.
734#[allow(clippy::too_many_arguments)]
735fn walk_root(
736    root: &Path,
737    config: &AppConfig,
738    include_globs: Option<&GlobSet>,
739    exclude_globs: Option<&GlobSet>,
740    enabled_languages: Option<&BTreeSet<Language>>,
741    seen_paths: &mut HashSet<PathBuf>,
742    analyzed: &mut Vec<FileRecord>,
743    skipped: &mut Vec<FileRecord>,
744    warnings: &mut Vec<String>,
745    cancel: Option<&AtomicBool>,
746    progress: Option<&ProgressCounters>,
747) -> Result<()> {
748    let mut builder = WalkBuilder::new(root);
749    builder
750        .follow_links(config.discovery.follow_symlinks)
751        .hidden(config.discovery.ignore_hidden_files)
752        .ignore(config.discovery.honor_ignore_files)
753        .parents(config.discovery.honor_ignore_files)
754        .git_ignore(config.discovery.honor_ignore_files)
755        .git_global(config.discovery.honor_ignore_files)
756        .git_exclude(config.discovery.honor_ignore_files);
757
758    let paths = collect_walk_paths(&builder, seen_paths, warnings);
759    if paths.is_empty() {
760        return Ok(());
761    }
762
763    if let Some(p) = progress {
764        p.files_total.fetch_add(paths.len(), Ordering::Relaxed);
765    }
766
767    let chunk_results = run_parallel_analysis(
768        &paths,
769        root,
770        config,
771        include_globs,
772        exclude_globs,
773        enabled_languages,
774        cancel,
775        progress,
776    )?;
777    merge_chunk_results(chunk_results, analyzed, skipped, warnings)
778}
779
780fn collect_walk_paths(
781    builder: &WalkBuilder,
782    seen_paths: &mut HashSet<PathBuf>,
783    warnings: &mut Vec<String>,
784) -> Vec<PathBuf> {
785    // build_parallel() walks the directory tree across multiple threads (work-stealing
786    // internally), which is meaningfully faster for deeply nested repos with many directories.
787    // We collect results via an MPSC channel so each walker thread sends without contention.
788    let (tx, rx) = std::sync::mpsc::channel::<std::result::Result<PathBuf, String>>();
789
790    builder.build_parallel().run(|| {
791        let tx = tx.clone();
792        Box::new(move |entry| {
793            match entry {
794                Err(e) => {
795                    let _ = tx.send(Err(format!("discovery warning: {e}")));
796                }
797                Ok(e) => {
798                    let path = e.into_path();
799                    if !path.is_dir() {
800                        let _ = tx.send(Ok(path));
801                    }
802                }
803            }
804            ignore::WalkState::Continue
805        })
806    });
807
808    // Drop the sender that the outer scope holds; the per-thread clones were dropped when
809    // run() returned (all threads finished). Dropping this last sender closes the channel.
810    drop(tx);
811
812    rx.into_iter()
813        .filter_map(|msg| match msg {
814            Ok(path) => {
815                if seen_paths.insert(path.clone()) {
816                    Some(path)
817                } else {
818                    None
819                }
820            }
821            Err(warn) => {
822                warnings.push(warn);
823                None
824            }
825        })
826        .collect()
827}
828
829/// Inner work loop executed by each analysis thread.
830#[allow(clippy::too_many_arguments)]
831fn worker_loop(
832    paths: &[PathBuf],
833    root: &Path,
834    config: &AppConfig,
835    include_globs: Option<&GlobSet>,
836    exclude_globs: Option<&GlobSet>,
837    enabled_languages: Option<&BTreeSet<Language>>,
838    cancel: Option<&AtomicBool>,
839    next_index: &AtomicUsize,
840    files_done: Option<&AtomicUsize>,
841) -> Vec<Result<Option<FileRecord>>> {
842    let mut results = Vec::new();
843    loop {
844        if cancel.is_some_and(|c| c.load(Ordering::Relaxed)) {
845            results.push(Err(anyhow::anyhow!("analysis cancelled")));
846            break;
847        }
848        let i = next_index.fetch_add(1, Ordering::Relaxed);
849        if i >= paths.len() {
850            break;
851        }
852        results.push(analyze_candidate_file(
853            &paths[i],
854            root,
855            config,
856            include_globs,
857            exclude_globs,
858            enabled_languages,
859        ));
860        if let Some(fd) = files_done {
861            fd.fetch_add(1, Ordering::Relaxed);
862        }
863    }
864    results
865}
866
867#[allow(clippy::too_many_arguments)]
868fn run_parallel_analysis(
869    paths: &[PathBuf],
870    root: &Path,
871    config: &AppConfig,
872    include_globs: Option<&GlobSet>,
873    exclude_globs: Option<&GlobSet>,
874    enabled_languages: Option<&BTreeSet<Language>>,
875    cancel: Option<&AtomicBool>,
876    progress: Option<&ProgressCounters>,
877) -> Result<Vec<Vec<Result<Option<FileRecord>>>>> {
878    let thread_count = std::thread::available_parallelism().map_or(DEFAULT_ANALYSIS_THREADS, |n| {
879        n.get().min(MAX_ANALYSIS_THREADS)
880    });
881    // Shared work-queue index: each thread atomically claims the next path to process.
882    // This eliminates static-chunk load imbalance — threads that finish early immediately
883    // pick up more work instead of sitting idle while one overloaded chunk finishes.
884    let next_index = AtomicUsize::new(0);
885    let files_done: Option<&AtomicUsize> = progress.map(|p| p.files_done.as_ref());
886
887    std::thread::scope(|s| -> Result<Vec<Vec<Result<Option<FileRecord>>>>> {
888        // IMPORTANT: collect ALL handles before joining any of them.
889        // A lazy spawn-then-join chain would serialize threads one at a time.
890        let mut handles = Vec::with_capacity(thread_count);
891        for _ in 0..thread_count {
892            handles.push(s.spawn(|| {
893                worker_loop(
894                    paths,
895                    root,
896                    config,
897                    include_globs,
898                    exclude_globs,
899                    enabled_languages,
900                    cancel,
901                    &next_index,
902                    files_done,
903                )
904            }));
905        }
906        handles
907            .into_iter()
908            .map(|h| {
909                h.join()
910                    .map_err(|_| anyhow::anyhow!("analysis thread panicked"))
911            })
912            .collect()
913    })
914}
915
916fn merge_chunk_results(
917    chunk_results: Vec<Vec<Result<Option<FileRecord>>>>,
918    analyzed: &mut Vec<FileRecord>,
919    skipped: &mut Vec<FileRecord>,
920    warnings: &mut Vec<String>,
921) -> Result<()> {
922    for chunk in chunk_results {
923        for result in chunk {
924            if let Some(record) = result? {
925                push_record(record, analyzed, skipped, warnings);
926            }
927        }
928    }
929    Ok(())
930}
931
932/// Label each analyzed file with its submodule and build per-submodule summaries.
933fn process_submodules(config: &AppConfig, analyzed: &mut [FileRecord]) -> Vec<SubmoduleSummary> {
934    let root = config.discovery.root_paths[0]
935        .canonicalize()
936        .unwrap_or_else(|_| config.discovery.root_paths[0].clone());
937    let submodules = detect_submodules(&root);
938    if submodules.is_empty() {
939        return Vec::new();
940    }
941
942    for file in analyzed.iter_mut() {
943        for (name, sub_path) in &submodules {
944            let prefix = sub_path.to_string_lossy().replace('\\', "/");
945            let rel = &file.relative_path;
946            if rel == &prefix || rel.starts_with(&format!("{prefix}/")) {
947                file.submodule = Some(name.clone());
948                break;
949            }
950        }
951    }
952
953    build_submodule_summaries(analyzed, &submodules, &root)
954}
955
956/// Compute Basic COCOMO I cost estimate from total code SLOC.
957#[allow(clippy::cast_precision_loss)] // COCOMO formula: line counts at f64 precision are sufficient
958fn compute_cocomo(code_lines: u64, mode: CocomoMode) -> CocomoEstimate {
959    let ksloc = code_lines as f64 / 1_000.0;
960    let (a, b, c, d): (f64, f64, f64, f64) = match mode {
961        CocomoMode::Organic => (2.4, 1.05, 2.5, 0.38),
962        CocomoMode::SemiDetached => (3.0, 1.12, 2.5, 0.35),
963        CocomoMode::Embedded => (3.6, 1.20, 2.5, 0.32),
964    };
965    let effort = a * ksloc.powf(b);
966    let duration = c * effort.powf(d);
967    let avg_staff = if duration > 0.0 {
968        effort / duration
969    } else {
970        0.0
971    };
972    // Round to 2 decimal places for readability.
973    CocomoEstimate {
974        mode,
975        ksloc: (ksloc * 100.0).round() / 100.0,
976        effort_person_months: (effort * 100.0).round() / 100.0,
977        duration_months: (duration * 100.0).round() / 100.0,
978        avg_staff: (avg_staff * 100.0).round() / 100.0,
979    }
980}
981
982/// Collect ULOC hashes across all analyzed files, compute ULOC and `DRYness`.
983#[allow(clippy::cast_precision_loss)] // DRYness is a display percentage; f32 precision is adequate
984fn compute_uloc(analyzed: &[FileRecord]) -> (u64, Option<f32>) {
985    use std::collections::HashSet as StdHashSet;
986    let mut unique: StdHashSet<u64> = StdHashSet::new();
987    let mut total_code: u64 = 0;
988    for record in analyzed {
989        total_code += record.effective_counts.code_lines;
990        for &hash in &record.raw_line_categories.code_line_hashes {
991            unique.insert(hash);
992        }
993    }
994    let uloc = unique.len() as u64;
995    let dryness = if total_code > 0 {
996        Some((uloc as f32 / total_code as f32) * 100.0)
997    } else {
998        None
999    };
1000    (uloc, dryness)
1001}
1002
1003/// Group files by content hash and return groups of duplicates (relative paths).
1004/// Only groups with ≥ 2 files are returned.
1005fn find_duplicate_groups(analyzed: &[FileRecord]) -> Vec<Vec<String>> {
1006    let mut by_hash: std::collections::HashMap<u64, Vec<&str>> = std::collections::HashMap::new();
1007    for record in analyzed {
1008        if record.content_hash != 0 {
1009            by_hash
1010                .entry(record.content_hash)
1011                .or_default()
1012                .push(&record.relative_path);
1013        }
1014    }
1015    let mut groups: Vec<Vec<String>> = by_hash
1016        .into_values()
1017        .filter(|v| v.len() >= 2)
1018        .map(|v| {
1019            let mut paths: Vec<String> = v.into_iter().map(str::to_owned).collect();
1020            paths.sort();
1021            paths
1022        })
1023        .collect();
1024    groups.sort_by(|a, b| a[0].cmp(&b[0]));
1025    groups
1026}
1027
1028/// Assemble the final `AnalysisRun` from collected records and metadata.
1029fn assemble_run(
1030    config: &AppConfig,
1031    runtime_mode: &str,
1032    analyzed: Vec<FileRecord>,
1033    skipped: Vec<FileRecord>,
1034    warnings: Vec<String>,
1035    submodule_summaries: Vec<SubmoduleSummary>,
1036) -> AnalysisRun {
1037    let summary = build_summary(&analyzed, &skipped);
1038    let language_summaries = build_language_summaries(&analyzed);
1039    let col_threshold = config.analysis.style_col_threshold;
1040    let style_summary = build_style_summary(&analyzed, col_threshold);
1041
1042    // Compute ULOC, DRYness, duplicates, and COCOMO from the aggregated records.
1043    let (uloc, dryness_pct) = compute_uloc(&analyzed);
1044    let duplicate_groups = find_duplicate_groups(&analyzed);
1045    let cocomo = if summary.code_lines > 0 {
1046        Some(compute_cocomo(summary.code_lines, CocomoMode::Organic))
1047    } else {
1048        None
1049    };
1050
1051    let first_root = config
1052        .discovery
1053        .root_paths
1054        .first()
1055        .map(|p| p.canonicalize().unwrap_or_else(|_| p.clone()));
1056    let git = first_root
1057        .as_deref()
1058        .map(detect_git_for_run)
1059        .unwrap_or_default();
1060
1061    let now = Utc::now();
1062    let run_id = {
1063        let uuid_suffix = Uuid::new_v4().simple().to_string();
1064        format!("{}-{}", now.format("%Y%m%d-%H%M"), uuid_suffix)
1065    };
1066
1067    AnalysisRun {
1068        tool: ToolMetadata {
1069            name: "sloc".into(),
1070            version: env!("CARGO_PKG_VERSION").into(),
1071            run_id,
1072            timestamp_utc: now,
1073        },
1074        environment: EnvironmentMetadata {
1075            operating_system: std::env::consts::OS.into(),
1076            architecture: std::env::consts::ARCH.into(),
1077            runtime_mode: runtime_mode.into(),
1078            initiator_username: get_current_username(),
1079            initiator_hostname: get_hostname(),
1080            ci_name: if is_jenkins_env() {
1081                Some(format!("Jenkins\t{}", get_hostname()))
1082            } else {
1083                detect_ci_system().map(str::to_string)
1084            },
1085        },
1086        effective_configuration: config.clone(),
1087        input_roots: config
1088            .discovery
1089            .root_paths
1090            .iter()
1091            .map(|p| path_to_string(p))
1092            .collect(),
1093        summary_totals: summary,
1094        totals_by_language: language_summaries,
1095        per_file_records: analyzed,
1096        skipped_file_records: skipped,
1097        warnings,
1098        submodule_summaries,
1099        git_commit_short: git.commit_short,
1100        git_commit_long: git.commit_long,
1101        git_branch: git.branch,
1102        git_commit_author: git.author,
1103        git_tags: git.tags,
1104        git_nearest_tag: git.nearest_tag,
1105        git_commit_date: git.commit_date,
1106        git_remote_url: git.remote_url,
1107        style_summary,
1108        cocomo,
1109        uloc,
1110        dryness_pct,
1111        duplicate_groups,
1112        duplicates_excluded: 0,
1113    }
1114}
1115
1116/// # Errors
1117///
1118/// Returns an error if the config is invalid, root paths cannot be walked, or any file
1119/// analysis step fails in a way that cannot be recovered from.
1120#[allow(clippy::too_many_lines)]
1121pub fn analyze(
1122    config: &AppConfig,
1123    runtime_mode: &str,
1124    cancel: Option<&AtomicBool>,
1125    progress: Option<&ProgressCounters>,
1126) -> Result<AnalysisRun> {
1127    config.validate()?;
1128
1129    if config.discovery.root_paths.is_empty() {
1130        anyhow::bail!("no input paths were provided");
1131    }
1132
1133    let include_globs = compile_globset(&config.discovery.include_globs)?;
1134    let exclude_globs = compile_globset(&config.discovery.exclude_globs)?;
1135    let enabled_languages = parse_enabled_languages(&config.analysis.enabled_languages)?;
1136
1137    let mut analyzed = Vec::new();
1138    let mut skipped = Vec::new();
1139    let mut warnings = Vec::new();
1140    let mut seen_paths = HashSet::new();
1141
1142    for root in &config.discovery.root_paths {
1143        if cancel.is_some_and(|c| c.load(Ordering::Relaxed)) {
1144            anyhow::bail!("analysis cancelled");
1145        }
1146
1147        let root = root.canonicalize().unwrap_or_else(|_| root.clone());
1148
1149        if root.is_file() {
1150            if let Some(record) = analyze_candidate_file(
1151                &root,
1152                root.parent().unwrap_or_else(|| Path::new(".")),
1153                config,
1154                include_globs.as_ref(),
1155                exclude_globs.as_ref(),
1156                enabled_languages.as_ref(),
1157            )? {
1158                push_record(record, &mut analyzed, &mut skipped, &mut warnings);
1159            }
1160            continue;
1161        }
1162
1163        walk_root(
1164            &root,
1165            config,
1166            include_globs.as_ref(),
1167            exclude_globs.as_ref(),
1168            enabled_languages.as_ref(),
1169            &mut seen_paths,
1170            &mut analyzed,
1171            &mut skipped,
1172            &mut warnings,
1173            cancel,
1174            progress,
1175        )?;
1176    }
1177
1178    analyzed.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
1179    skipped.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
1180
1181    // Submodule detection: label each file with its submodule and build per-submodule summaries.
1182    let submodule_summaries = if config.discovery.submodule_breakdown {
1183        process_submodules(config, &mut analyzed)
1184    } else {
1185        Vec::new()
1186    };
1187
1188    attach_coverage(config, &mut analyzed, &mut warnings);
1189
1190    Ok(assemble_run(
1191        config,
1192        runtime_mode,
1193        analyzed,
1194        skipped,
1195        warnings,
1196        submodule_summaries,
1197    ))
1198}
1199
1200fn attach_coverage(config: &AppConfig, analyzed: &mut [FileRecord], warnings: &mut Vec<String>) {
1201    let Some(cov_path) = coverage::resolve_coverage_file(config.analysis.coverage_file.as_deref())
1202    else {
1203        return;
1204    };
1205    tracing::debug!(path = %cov_path.display(), "loading coverage file");
1206    match fs::read_to_string(&cov_path) {
1207        Ok(content) => {
1208            let cov_map = coverage::parse_coverage_auto(&cov_path, &content);
1209            let mut matched: u32 = 0;
1210            let mut unmatched: u32 = 0;
1211            for record in analyzed.iter_mut() {
1212                record.coverage =
1213                    coverage::lookup_coverage(&cov_map, &record.relative_path).cloned();
1214                if record.coverage.is_some() {
1215                    matched += 1;
1216                } else {
1217                    unmatched += 1;
1218                }
1219            }
1220            tracing::debug!(
1221                path = %cov_path.display(),
1222                coverage_entries = cov_map.len(),
1223                files_matched = matched,
1224                files_unmatched = unmatched,
1225                "coverage attached"
1226            );
1227            if unmatched > 0 && matched == 0 {
1228                tracing::warn!(
1229                    path = %cov_path.display(),
1230                    "coverage file loaded but no source files could be matched — check that paths in the coverage report match the scanned directory"
1231                );
1232            }
1233        }
1234        Err(e) => {
1235            tracing::warn!(path = %cov_path.display(), error = %e, "coverage file could not be read");
1236            warnings.push(format!(
1237                "coverage file '{}' could not be read: {e}",
1238                cov_path.display()
1239            ));
1240        }
1241    }
1242}
1243
1244fn push_record(
1245    record: FileRecord,
1246    analyzed: &mut Vec<FileRecord>,
1247    skipped: &mut Vec<FileRecord>,
1248    warnings: &mut Vec<String>,
1249) {
1250    warnings.extend(
1251        record
1252            .warnings
1253            .iter()
1254            .map(|warning| format!("{}: {warning}", record.relative_path)),
1255    );
1256
1257    match record.status {
1258        FileStatus::AnalyzedExact | FileStatus::AnalyzedBestEffort => analyzed.push(record),
1259        _ => skipped.push(record),
1260    }
1261}
1262
1263/// Convenience wrapper: build a boxed `Skip` outcome with a single-item warning message.
1264#[inline]
1265fn skip_with_reason(
1266    path: &Path,
1267    root: &Path,
1268    size: u64,
1269    reason: impl Into<String>,
1270) -> MetadataPolicyOutcome {
1271    MetadataPolicyOutcome::Skip(Box::new(skipped_record(
1272        path,
1273        root,
1274        size,
1275        FileStatus::SkippedByPolicy,
1276        vec![reason.into()],
1277    )))
1278}
1279
1280/// Apply metadata-level policy checks (symlink, name, dir exclusion, size, globs, lockfile).
1281/// Returns `Skip(record)` to skip, `Exclude` to omit from output entirely (include-glob miss),
1282/// or `Continue` to proceed to content checks.
1283#[allow(clippy::too_many_arguments)]
1284fn check_metadata_policy(
1285    path: &Path,
1286    root: &Path,
1287    relative_path: &str,
1288    metadata: &fs::Metadata,
1289    config: &AppConfig,
1290    include_globs: Option<&GlobSet>,
1291    exclude_globs: Option<&GlobSet>,
1292) -> MetadataPolicyOutcome {
1293    let size = metadata.len();
1294
1295    if metadata.file_type().is_symlink() && !config.discovery.follow_symlinks {
1296        return skip_with_reason(path, root, size, "symlink skipped by policy");
1297    }
1298    if file_name_eq(path, ".gitignore") {
1299        return skip_with_reason(path, root, size, ".gitignore is always excluded");
1300    }
1301    if is_excluded_dir_path(path, &config.discovery.excluded_directories) {
1302        return skip_with_reason(path, root, size, "path matched excluded directory setting");
1303    }
1304    if size > config.discovery.max_file_size_bytes {
1305        return skip_with_reason(
1306            path,
1307            root,
1308            size,
1309            format!(
1310                "file exceeded max_file_size_bytes ({})",
1311                config.discovery.max_file_size_bytes
1312            ),
1313        );
1314    }
1315    if let Some(globs) = include_globs {
1316        if !globs.is_match(Path::new(relative_path)) && !globs.is_match(path) {
1317            return MetadataPolicyOutcome::Exclude;
1318        }
1319    }
1320    if let Some(globs) = exclude_globs {
1321        if globs.is_match(Path::new(relative_path)) || globs.is_match(path) {
1322            return skip_with_reason(path, root, size, "path matched exclude glob");
1323        }
1324    }
1325    if is_known_lockfile(path) && !config.analysis.include_lockfiles {
1326        return skip_with_reason(path, root, size, "lockfile skipped by default policy");
1327    }
1328
1329    MetadataPolicyOutcome::Continue
1330}
1331
/// Outcome of the content-level policy checks for a single file.
struct ContentPolicyResult {
    /// Whether the path was classified as vendored.
    vendor: bool,
    /// Whether the file was detected as generated.
    generated: bool,
    /// Whether the file was detected as minified.
    minified: bool,
    /// Pre-built skipped record; `Some` when the file should be skipped.
    skip_record: Option<FileRecord>,
}
1338
1339/// Apply content-level policy checks (vendor, generated, minified).
1340/// `skip_record` is `Some` when the file should be skipped.
1341fn check_content_policy(
1342    path: &Path,
1343    root: &Path,
1344    size_bytes: u64,
1345    bytes: &[u8],
1346    config: &AppConfig,
1347) -> ContentPolicyResult {
1348    let vendor = is_vendor_path(path);
1349    if vendor && config.analysis.vendor_directory_detection {
1350        return ContentPolicyResult {
1351            vendor,
1352            generated: false,
1353            minified: false,
1354            skip_record: Some(skipped_record(
1355                path,
1356                root,
1357                size_bytes,
1358                FileStatus::SkippedByPolicy,
1359                vec!["vendor file skipped by policy".into()],
1360            )),
1361        };
1362    }
1363
1364    let generated = config.analysis.generated_file_detection && looks_generated(path, bytes);
1365    if generated {
1366        return ContentPolicyResult {
1367            vendor,
1368            generated,
1369            minified: false,
1370            skip_record: Some(skipped_record(
1371                path,
1372                root,
1373                size_bytes,
1374                FileStatus::SkippedByPolicy,
1375                vec!["generated file skipped by policy".into()],
1376            )),
1377        };
1378    }
1379
1380    let minified = config.analysis.minified_file_detection && looks_minified(path, bytes);
1381    if minified {
1382        return ContentPolicyResult {
1383            vendor,
1384            generated,
1385            minified,
1386            skip_record: Some(skipped_record(
1387                path,
1388                root,
1389                size_bytes,
1390                FileStatus::SkippedByPolicy,
1391                vec!["minified file skipped by policy".into()],
1392            )),
1393        };
1394    }
1395
1396    ContentPolicyResult {
1397        vendor,
1398        generated,
1399        minified,
1400        skip_record: None,
1401    }
1402}
1403
1404/// Decode file bytes to a UTF-8 string, handling binary detection and decode failures.
1405fn decode_file_contents(
1406    path: &Path,
1407    root: &Path,
1408    size_bytes: u64,
1409    bytes: &[u8],
1410    config: &AppConfig,
1411) -> Result<Option<(String, String, Vec<String>)>> {
1412    if is_binary(bytes) {
1413        return match config.analysis.binary_file_behavior {
1414            BinaryFileBehavior::Skip => Ok(None),
1415            BinaryFileBehavior::Fail => {
1416                anyhow::bail!("binary file encountered: {}", path.display())
1417            }
1418        };
1419    }
1420
1421    match decode_bytes(bytes) {
1422        Ok(result) => Ok(Some(result)),
1423        Err(err) => match config.analysis.decode_failure_behavior {
1424            FailureBehavior::WarnSkip => {
1425                // Caller will handle the None as a SkippedDecodeError record.
1426                // We use a sentinel: return Ok(None) but encode the error into a field.
1427                // Instead, propagate as a skipped record via the caller.
1428                let _ = (path, root, size_bytes); // suppress unused warnings
1429                Err(anyhow::anyhow!("__decode_warn__: {err}"))
1430            }
1431            FailureBehavior::Fail => {
1432                anyhow::bail!("decode failure for {}: {err}", path.display())
1433            }
1434        },
1435    }
1436}
1437
1438#[allow(clippy::too_many_lines)]
1439fn analyze_candidate_file(
1440    path: &Path,
1441    root: &Path,
1442    config: &AppConfig,
1443    include_globs: Option<&GlobSet>,
1444    exclude_globs: Option<&GlobSet>,
1445    enabled_languages: Option<&BTreeSet<Language>>,
1446) -> Result<Option<FileRecord>> {
1447    let metadata = match fs::symlink_metadata(path) {
1448        Ok(metadata) => metadata,
1449        Err(err) => {
1450            return Ok(Some(skipped_record(
1451                path,
1452                root,
1453                0,
1454                FileStatus::ErrorInternal,
1455                vec![format!("failed to read metadata: {err}")],
1456            )));
1457        }
1458    };
1459
1460    let relative_path = relative_path_string(path, root);
1461
1462    // Metadata-level policy checks.
1463    match check_metadata_policy(
1464        path,
1465        root,
1466        &relative_path,
1467        &metadata,
1468        config,
1469        include_globs,
1470        exclude_globs,
1471    ) {
1472        MetadataPolicyOutcome::Skip(record) => return Ok(Some(*record)),
1473        MetadataPolicyOutcome::Exclude => return Ok(None),
1474        MetadataPolicyOutcome::Continue => {}
1475    }
1476
1477    let bytes = match fs::read(path) {
1478        Ok(bytes) => bytes,
1479        Err(err) => {
1480            return Ok(Some(skipped_record(
1481                path,
1482                root,
1483                metadata.len(),
1484                FileStatus::ErrorInternal,
1485                vec![format!("failed to read file: {err}")],
1486            )));
1487        }
1488    };
1489
1490    // Content-level policy checks (vendor, generated, minified).
1491    let content_policy = check_content_policy(path, root, metadata.len(), &bytes, config);
1492    if let Some(record) = content_policy.skip_record {
1493        return Ok(Some(record));
1494    }
1495    let (vendor, generated, minified) = (
1496        content_policy.vendor,
1497        content_policy.generated,
1498        content_policy.minified,
1499    );
1500
1501    // Decode content, handling binary and decode failures.
1502    let (text, encoding, decode_warnings) =
1503        match decode_file_contents(path, root, metadata.len(), &bytes, config) {
1504            Ok(Some(result)) => result,
1505            Ok(None) => {
1506                return Ok(Some(skipped_record(
1507                    path,
1508                    root,
1509                    metadata.len(),
1510                    FileStatus::SkippedBinary,
1511                    vec!["binary file skipped by default".into()],
1512                )));
1513            }
1514            Err(err) => {
1515                let msg = err.to_string();
1516                if let Some(warn_msg) = msg.strip_prefix("__decode_warn__: ") {
1517                    return Ok(Some(skipped_record(
1518                        path,
1519                        root,
1520                        metadata.len(),
1521                        FileStatus::SkippedDecodeError,
1522                        vec![warn_msg.to_string()],
1523                    )));
1524                }
1525                return Err(err);
1526            }
1527        };
1528
1529    let first_line = text.lines().next();
1530    let language = detect_language(
1531        path,
1532        first_line,
1533        &config.analysis.extension_overrides,
1534        config.analysis.shebang_detection,
1535    );
1536
1537    let Some(language) = language else {
1538        return Ok(Some(skipped_record(
1539            path,
1540            root,
1541            metadata.len(),
1542            FileStatus::SkippedUnsupported,
1543            vec!["unsupported or undetected language".into()],
1544        )));
1545    };
1546
1547    if let Some(enabled) = enabled_languages {
1548        if !enabled.contains(&language) {
1549            return Ok(Some(skipped_record(
1550                path,
1551                root,
1552                metadata.len(),
1553                FileStatus::SkippedByPolicy,
1554                vec![format!(
1555                    "language {} disabled by configuration",
1556                    language.display_name()
1557                )],
1558            )));
1559        }
1560    }
1561
1562    let style_scope = match config.analysis.style_lang_scope.as_str() {
1563        "c_family" => StyleLangScope::CFamilyOnly,
1564        _ => StyleLangScope::All,
1565    };
1566    let ieee_opts = AnalysisOptions {
1567        blank_in_block_comment_as_comment: config.analysis.blank_in_block_comment_policy
1568            == BlankInBlockCommentPolicy::CountAsComment,
1569        collapse_continuation_lines: config.analysis.continuation_line_policy
1570            == ContinuationLinePolicy::CollapseToLogical,
1571        enable_style: config.analysis.style_analysis_enabled,
1572        style_lang_scope: style_scope,
1573    };
1574    let analysis = analyze_text(language, &text, ieee_opts);
1575    let effective_counts = compute_effective_counts(
1576        &analysis.raw,
1577        config.analysis.mixed_line_policy,
1578        config.analysis.python_docstrings_as_comments,
1579        config.analysis.count_compiler_directives,
1580    );
1581
1582    let mut warnings = decode_warnings;
1583    warnings.extend(analysis.warnings.clone());
1584
1585    // Compute a fast 64-bit content fingerprint for duplicate-file detection.
1586    let content_hash = {
1587        use std::hash::{DefaultHasher, Hash, Hasher};
1588        let mut h = DefaultHasher::new();
1589        bytes.hash(&mut h);
1590        h.finish()
1591    };
1592
1593    // Extract fields from analysis.raw before it is moved into FileRecord.
1594    let cyclomatic_complexity = if analysis.raw.cyclomatic_complexity > 0 {
1595        Some(analysis.raw.cyclomatic_complexity)
1596    } else {
1597        None
1598    };
1599    let lsloc = analysis.raw.lsloc;
1600
1601    Ok(Some(FileRecord {
1602        path: path_to_string(path),
1603        relative_path,
1604        language: Some(language),
1605        size_bytes: metadata.len(),
1606        detected_encoding: Some(encoding),
1607        raw_line_categories: analysis.raw,
1608        effective_counts,
1609        status: match analysis.parse_mode {
1610            ParseMode::Lexical | ParseMode::TreeSitter => FileStatus::AnalyzedExact,
1611            ParseMode::LexicalBestEffort => FileStatus::AnalyzedBestEffort,
1612        },
1613        warnings,
1614        generated,
1615        minified,
1616        vendor,
1617        parse_mode: Some(analysis.parse_mode),
1618        submodule: None,
1619        coverage: None,
1620        style_analysis: analysis.style_analysis,
1621        cyclomatic_complexity,
1622        lsloc,
1623        content_hash,
1624    }))
1625}
1626
1627const fn compute_effective_counts(
1628    raw: &RawLineCounts,
1629    mixed_line_policy: MixedLinePolicy,
1630    python_docstrings_as_comments: bool,
1631    count_compiler_directives: bool,
1632) -> EffectiveCounts {
1633    let mut effective = EffectiveCounts {
1634        code_lines: raw.code_only_lines,
1635        comment_lines: raw.single_comment_only_lines + raw.multi_comment_only_lines,
1636        blank_lines: raw.blank_only_lines,
1637        mixed_lines_separate: 0,
1638    };
1639
1640    if python_docstrings_as_comments {
1641        effective.comment_lines += raw.docstring_comment_lines;
1642    } else {
1643        effective.code_lines += raw.docstring_comment_lines;
1644    }
1645
1646    let mixed_total = raw.mixed_code_single_comment_lines + raw.mixed_code_multi_comment_lines;
1647    match mixed_line_policy {
1648        MixedLinePolicy::CodeOnly => effective.code_lines += mixed_total,
1649        MixedLinePolicy::CodeAndComment => {
1650            effective.code_lines += mixed_total;
1651            effective.comment_lines += mixed_total;
1652        }
1653        MixedLinePolicy::CommentOnly => effective.comment_lines += mixed_total,
1654        MixedLinePolicy::SeparateMixedCategory => effective.mixed_lines_separate += mixed_total,
1655    }
1656
1657    // IEEE 1045-1992 §4.2: optionally exclude preprocessor/compiler directives from code SLOC.
1658    // compiler_directive_lines is a subset of code_only_lines, so subtract it directly.
1659    if !count_compiler_directives {
1660        effective.code_lines = effective
1661            .code_lines
1662            .saturating_sub(raw.compiler_directive_lines);
1663    }
1664
1665    effective
1666}
1667
1668fn build_summary(analyzed: &[FileRecord], skipped: &[FileRecord]) -> SummaryTotals {
1669    let mut summary = SummaryTotals {
1670        files_considered: (analyzed.len() + skipped.len()) as u64,
1671        files_analyzed: analyzed.len() as u64,
1672        files_skipped: skipped.len() as u64,
1673        ..Default::default()
1674    };
1675
1676    for record in analyzed {
1677        summary.total_physical_lines += record.raw_line_categories.total_physical_lines;
1678        summary.code_lines += record.effective_counts.code_lines;
1679        summary.comment_lines += record.effective_counts.comment_lines;
1680        summary.blank_lines += record.effective_counts.blank_lines;
1681        summary.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
1682        summary.functions += record.raw_line_categories.functions;
1683        summary.classes += record.raw_line_categories.classes;
1684        summary.variables += record.raw_line_categories.variables;
1685        summary.imports += record.raw_line_categories.imports;
1686        summary.test_count += record.raw_line_categories.test_count;
1687        summary.test_assertion_count += record.raw_line_categories.test_assertion_count;
1688        summary.test_suite_count += record.raw_line_categories.test_suite_count;
1689        summary.cyclomatic_complexity +=
1690            u64::from(record.raw_line_categories.cyclomatic_complexity);
1691        if let Some(lsloc) = record.raw_line_categories.lsloc {
1692            *summary.lsloc.get_or_insert(0) += u64::from(lsloc);
1693        }
1694        if let Some(cov) = &record.coverage {
1695            summary.coverage_lines_found += u64::from(cov.lines_found);
1696            summary.coverage_lines_hit += u64::from(cov.lines_hit);
1697            summary.coverage_functions_found += u64::from(cov.functions_found);
1698            summary.coverage_functions_hit += u64::from(cov.functions_hit);
1699            summary.coverage_branches_found += u64::from(cov.branches_found);
1700            summary.coverage_branches_hit += u64::from(cov.branches_hit);
1701        }
1702    }
1703
1704    summary
1705}
1706
1707/// Construct a zero-filled `LanguageSummary` for the given language.
1708const fn zeroed_summary(language: Language) -> LanguageSummary {
1709    LanguageSummary {
1710        language,
1711        files: 0,
1712        total_physical_lines: 0,
1713        code_lines: 0,
1714        comment_lines: 0,
1715        blank_lines: 0,
1716        mixed_lines_separate: 0,
1717        functions: 0,
1718        classes: 0,
1719        variables: 0,
1720        imports: 0,
1721        test_count: 0,
1722        test_assertion_count: 0,
1723        test_suite_count: 0,
1724        coverage_lines_found: 0,
1725        coverage_lines_hit: 0,
1726        coverage_functions_found: 0,
1727        coverage_functions_hit: 0,
1728        coverage_branches_found: 0,
1729        coverage_branches_hit: 0,
1730        cyclomatic_complexity: 0,
1731        lsloc: None,
1732    }
1733}
1734
1735/// Accumulate all per-file counters from `record` into an existing `LanguageSummary`.
1736fn accumulate_record_into_summary(entry: &mut LanguageSummary, record: &FileRecord) {
1737    entry.files += 1;
1738    let r = &record.raw_line_categories;
1739    entry.total_physical_lines += r.total_physical_lines;
1740    entry.code_lines += record.effective_counts.code_lines;
1741    entry.comment_lines += record.effective_counts.comment_lines;
1742    entry.blank_lines += record.effective_counts.blank_lines;
1743    entry.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
1744    entry.functions += r.functions;
1745    entry.classes += r.classes;
1746    entry.variables += r.variables;
1747    entry.imports += r.imports;
1748    entry.test_count += r.test_count;
1749    entry.test_assertion_count += r.test_assertion_count;
1750    entry.test_suite_count += r.test_suite_count;
1751    entry.cyclomatic_complexity += u64::from(r.cyclomatic_complexity);
1752    if let Some(lsloc) = r.lsloc {
1753        *entry.lsloc.get_or_insert(0) += u64::from(lsloc);
1754    }
1755    if let Some(cov) = &record.coverage {
1756        entry.coverage_lines_found += u64::from(cov.lines_found);
1757        entry.coverage_lines_hit += u64::from(cov.lines_hit);
1758        entry.coverage_functions_found += u64::from(cov.functions_found);
1759        entry.coverage_functions_hit += u64::from(cov.functions_hit);
1760        entry.coverage_branches_found += u64::from(cov.branches_found);
1761        entry.coverage_branches_hit += u64::from(cov.branches_hit);
1762    }
1763}
1764
1765fn build_language_summaries(analyzed: &[FileRecord]) -> Vec<LanguageSummary> {
1766    let mut by_language: BTreeMap<Language, LanguageSummary> = BTreeMap::new();
1767    for record in analyzed {
1768        let Some(language) = record.language else {
1769            continue;
1770        };
1771        let entry = by_language
1772            .entry(language)
1773            .or_insert_with(|| zeroed_summary(language));
1774        accumulate_record_into_summary(entry, record);
1775    }
1776    by_language.into_values().collect()
1777}
1778
1779fn skipped_record(
1780    path: &Path,
1781    root: &Path,
1782    size_bytes: u64,
1783    status: FileStatus,
1784    warnings: Vec<String>,
1785) -> FileRecord {
1786    FileRecord {
1787        path: path_to_string(path),
1788        relative_path: relative_path_string(path, root),
1789        language: None,
1790        size_bytes,
1791        detected_encoding: None,
1792        raw_line_categories: RawLineCounts::default(),
1793        effective_counts: EffectiveCounts::default(),
1794        status,
1795        warnings,
1796        generated: false,
1797        minified: false,
1798        vendor: false,
1799        parse_mode: None,
1800        submodule: None,
1801        coverage: None,
1802        style_analysis: None,
1803        cyclomatic_complexity: None,
1804        lsloc: None,
1805        content_hash: 0,
1806    }
1807}
1808
/// Render `path` relative to `root` with forward slashes.
///
/// When `path` does not start with `root`, the whole `path` is rendered instead.
fn relative_path_string(path: &Path, root: &Path) -> String {
    let shown = match path.strip_prefix(root) {
        Ok(inside_root) => inside_root,
        Err(_) => path,
    };
    shown.to_string_lossy().replace('\\', "/")
}
1815
/// Render `path` as a (lossy) string with every backslash turned into `/`.
fn path_to_string(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
1819
/// Parse `.gitmodules` in `root` and return `(name, relative_path)` for each submodule found.
///
/// Returns an empty list when `.gitmodules` is missing, is not a regular file,
/// or cannot be read as UTF-8 text. A submodule section is reported only when
/// it has a `path` entry; malformed lines are ignored rather than causing a panic.
#[must_use]
pub fn detect_submodules(root: &Path) -> Vec<(String, PathBuf)> {
    let gitmodules = root.join(".gitmodules");
    if !gitmodules.is_file() {
        return Vec::new();
    }
    let Ok(content) = fs::read_to_string(&gitmodules) else {
        return Vec::new();
    };

    let mut result = Vec::new();
    let mut current_name: Option<String> = None;
    let mut current_path: Option<PathBuf> = None;

    for line in content.lines() {
        let trimmed = line.trim();

        // Strip the prefix first and the suffix from what remains, so a truncated
        // header such as `[submodule "]` (where the two overlap) is rejected
        // instead of producing an out-of-range slice.
        let header_name = trimmed
            .strip_prefix("[submodule \"")
            .and_then(|rest| rest.strip_suffix("\"]"));

        if let Some(name) = header_name {
            // A new section begins: flush the previous one if it was complete.
            if let (Some(prev_name), Some(prev_path)) = (current_name.take(), current_path.take())
            {
                result.push((prev_name, prev_path));
            }
            current_name = Some(name.to_string());
        } else if let Some((key, value)) = trimmed.split_once('=') {
            // Require the key to be exactly `path` (git config keys are
            // case-insensitive) so keys that merely start with "path" are ignored.
            if key.trim().eq_ignore_ascii_case("path") {
                current_path = Some(PathBuf::from(value.trim()));
            }
        }
    }
    if let (Some(name), Some(path)) = (current_name, current_path) {
        result.push((name, path));
    }

    result
}
1856
1857fn build_submodule_summaries(
1858    analyzed: &[FileRecord],
1859    submodules: &[(String, PathBuf)],
1860    root: &Path,
1861) -> Vec<SubmoduleSummary> {
1862    submodules
1863        .iter()
1864        .map(|(name, path)| {
1865            let files: Vec<&FileRecord> = analyzed
1866                .iter()
1867                .filter(|f| f.submodule.as_deref() == Some(name.as_str()))
1868                .collect();
1869
1870            let files_analyzed = files.len() as u64;
1871            let total_physical_lines = files
1872                .iter()
1873                .map(|f| f.raw_line_categories.total_physical_lines)
1874                .sum();
1875            let code_lines = files.iter().map(|f| f.effective_counts.code_lines).sum();
1876            let comment_lines = files.iter().map(|f| f.effective_counts.comment_lines).sum();
1877            let blank_lines = files.iter().map(|f| f.effective_counts.blank_lines).sum();
1878            let language_summaries = build_language_summaries_from_slice(&files);
1879
1880            let git = detect_git_for_run(&root.join(path));
1881
1882            SubmoduleSummary {
1883                name: name.clone(),
1884                relative_path: path.to_string_lossy().replace('\\', "/"),
1885                files_analyzed,
1886                total_physical_lines,
1887                code_lines,
1888                comment_lines,
1889                blank_lines,
1890                language_summaries,
1891                git_commit_short: git.commit_short,
1892                git_commit_long: git.commit_long,
1893                git_branch: git.branch,
1894                git_commit_author: git.author,
1895                git_commit_date: git.commit_date,
1896                git_remote_url: git.remote_url,
1897            }
1898        })
1899        .filter(|s| s.files_analyzed > 0)
1900        .collect()
1901}
1902
1903/// Dominant indent label from vote counts.
1904#[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
1905fn dominant_indent_label(files: &[&StyleAnalysis]) -> String {
1906    let mut votes = [0u32; 6];
1907    for f in files {
1908        let idx = match f.indent_style {
1909            IndentStyle::Tabs => 0,
1910            IndentStyle::Spaces2 => 1,
1911            IndentStyle::Spaces4 => 2,
1912            IndentStyle::Spaces8 => 3,
1913            IndentStyle::Mixed => 4,
1914            IndentStyle::Unknown => 5,
1915        };
1916        votes[idx] += 1;
1917    }
1918    let labels = ["Tabs", "2-Space", "4-Space", "8-Space", "Mixed", "\u{2014}"];
1919    labels[votes
1920        .iter()
1921        .enumerate()
1922        .max_by_key(|(_, v)| *v)
1923        .map_or(5, |(i, _)| i)]
1924    .to_string()
1925}
1926
1927/// Line-80 compliance percentage for a slice of style analyses.
1928#[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
1929fn line80_pct(files: &[&StyleAnalysis]) -> u8 {
1930    if files.is_empty() {
1931        return 0;
1932    }
1933    let compliant = files
1934        .iter()
1935        .filter(|f| f.total_lines == 0 || (f.lines_over_80 as f32 / f.total_lines as f32) <= 0.05)
1936        .count() as u32;
1937    ((compliant * 100) / files.len() as u32) as u8
1938}
1939
1940/// Column-N compliance percentage using the configured threshold (80, 100, or 120).
1941/// Falls back to the 80-col bucket for any threshold ≤ 80.
1942#[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
1943fn line_col_pct(files: &[&StyleAnalysis], threshold: u16) -> u8 {
1944    if files.is_empty() {
1945        return 0;
1946    }
1947    let compliant = files
1948        .iter()
1949        .filter(|f| {
1950            let over = if threshold <= 80 {
1951                f.lines_over_80
1952            } else if threshold <= 100 {
1953                f.lines_over_100
1954            } else {
1955                f.lines_over_120
1956            };
1957            f.total_lines == 0 || (over as f32 / f.total_lines as f32) <= 0.05
1958        })
1959        .count() as u32;
1960    ((compliant * 100) / files.len() as u32) as u8
1961}
1962
1963/// Build a `LanguageStyleGroup` from a non-empty slice of `StyleAnalysis` for one family.
1964#[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
1965fn build_language_group(
1966    family: &str,
1967    files: &[&StyleAnalysis],
1968    col_threshold: u16,
1969) -> LanguageStyleGroup {
1970    let count = files.len() as u32;
1971
1972    // Collect every unique guide name across all files in this group.
1973    let mut all_names: Vec<String> = Vec::new();
1974    for f in files {
1975        for g in &f.guide_scores {
1976            if !all_names.contains(&g.name) {
1977                all_names.push(g.name.clone());
1978            }
1979        }
1980    }
1981
1982    let mut guide_avg_scores: Vec<(String, u8)> = all_names
1983        .into_iter()
1984        .map(|name| {
1985            let sum: u32 = files
1986                .iter()
1987                .filter_map(|f| f.guide_scores.iter().find(|g| g.name == name))
1988                .map(|g| u32::from(g.score_pct))
1989                .sum();
1990            let avg = (sum / count) as u8;
1991            (name, avg)
1992        })
1993        .collect();
1994    guide_avg_scores.sort_by_key(|s| std::cmp::Reverse(s.1));
1995
1996    let (dominant_guide, dominant_score_pct) = guide_avg_scores
1997        .first()
1998        .map(|(n, s)| (n.clone(), *s))
1999        .unwrap_or_default();
2000
2001    let lcp = line_col_pct(files, col_threshold);
2002    LanguageStyleGroup {
2003        language_family: family.to_string(),
2004        files_count: count,
2005        dominant_guide,
2006        dominant_score_pct,
2007        common_indent_style: dominant_indent_label(files),
2008        guide_avg_scores,
2009        line80_compliant_pct: line80_pct(files),
2010        line_col_compliant_pct: lcp,
2011    }
2012}
2013
2014/// Build aggregate multi-language style-guide adherence.
2015/// Returns `None` when no files had style data.
2016#[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
2017fn build_style_summary(analyzed: &[FileRecord], col_threshold: u16) -> Option<StyleSummary> {
2018    let all_style: Vec<&StyleAnalysis> = analyzed
2019        .iter()
2020        .filter_map(|f| f.style_analysis.as_ref())
2021        .collect();
2022
2023    if all_style.is_empty() {
2024        return None;
2025    }
2026
2027    // Group by language_family.
2028    let mut families: std::collections::BTreeMap<&str, Vec<&StyleAnalysis>> =
2029        std::collections::BTreeMap::new();
2030    for sa in &all_style {
2031        families
2032            .entry(sa.language_family.as_str())
2033            .or_default()
2034            .push(sa);
2035    }
2036
2037    let mut by_language: Vec<LanguageStyleGroup> = families
2038        .iter()
2039        .map(|(family, files)| build_language_group(family, files, col_threshold))
2040        .collect();
2041    by_language.sort_by_key(|g| std::cmp::Reverse(g.files_count));
2042
2043    let files_analyzed = all_style.len() as u32;
2044    let common_indent_style = dominant_indent_label(&all_style);
2045    let line80_compliant_pct = line80_pct(&all_style);
2046    let line_col_compliant_pct = line_col_pct(&all_style, col_threshold);
2047
2048    Some(StyleSummary {
2049        files_analyzed,
2050        common_indent_style,
2051        line80_compliant_pct,
2052        line_col_compliant_pct,
2053        col_threshold,
2054        by_language,
2055    })
2056}
2057
2058fn build_language_summaries_from_slice(files: &[&FileRecord]) -> Vec<LanguageSummary> {
2059    let mut map: BTreeMap<String, LanguageSummary> = BTreeMap::new();
2060    for file in files {
2061        let Some(lang) = file.language else { continue };
2062        let entry = map
2063            .entry(lang.display_name().to_string())
2064            .or_insert_with(|| zeroed_summary(lang));
2065        accumulate_record_into_summary(entry, file);
2066    }
2067    map.into_values().collect()
2068}
2069
/// Whether the final component of `path` is exactly `expected`.
///
/// Paths with no file name, or with a non-UTF-8 file name, never match.
fn file_name_eq(path: &Path, expected: &str) -> bool {
    let actual = path.file_name().and_then(|name| name.to_str());
    actual == Some(expected)
}
2075
/// Whether any component of `path` exactly equals one of `excluded_dirs`.
///
/// Components that are not valid UTF-8 are never treated as excluded.
fn is_excluded_dir_path(path: &Path, excluded_dirs: &[String]) -> bool {
    for component in path.components() {
        let Some(part) = component.as_os_str().to_str() else {
            continue;
        };
        if excluded_dirs.iter().any(|excluded| excluded.as_str() == part) {
            return true;
        }
    }
    false
}
2084
/// Whether `path` passes through a directory named `vendor`, `node_modules`
/// or `packages`.
///
/// Only whole, UTF-8 path components are compared.
fn is_vendor_path(path: &Path) -> bool {
    const VENDOR_DIRS: [&str; 3] = ["vendor", "node_modules", "packages"];

    path.components()
        .filter_map(|component| component.as_os_str().to_str())
        .any(|part| VENDOR_DIRS.contains(&part))
}
2093
/// Whether the file name of `path` is one of the recognized dependency lockfiles.
///
/// The comparison is exact and case-sensitive, and looks only at the final
/// path component.
fn is_known_lockfile(path: &Path) -> bool {
    const LOCKFILE_NAMES: [&str; 7] = [
        "Cargo.lock",
        "package-lock.json",
        "yarn.lock",
        "pnpm-lock.yaml",
        "Pipfile.lock",
        "poetry.lock",
        "composer.lock",
    ];

    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => LOCKFILE_NAMES.contains(&name),
        None => false,
    }
}
2110
2111fn looks_generated(path: &Path, bytes: &[u8]) -> bool {
2112    let file_name = path
2113        .file_name()
2114        .and_then(|name| name.to_str())
2115        .unwrap_or_default();
2116    if file_name.contains(".generated.") || file_name.contains(".g.") {
2117        return true;
2118    }
2119
2120    let sample = String::from_utf8_lossy(&bytes[..bytes.len().min(GENERATED_SAMPLE_BYTES)])
2121        .to_ascii_lowercase();
2122    sample.contains("@generated") || sample.contains("generated by")
2123}
2124
2125fn looks_minified(path: &Path, bytes: &[u8]) -> bool {
2126    let file_name = path
2127        .file_name()
2128        .and_then(|name| name.to_str())
2129        .unwrap_or_default();
2130    if file_name.contains(".min.") {
2131        return true;
2132    }
2133
2134    let sample = String::from_utf8_lossy(&bytes[..bytes.len().min(MINIFIED_SAMPLE_BYTES)]);
2135    let longest_line = sample.lines().map(str::len).max().unwrap_or(0);
2136    let whitespace = sample.chars().filter(|c| c.is_whitespace()).count();
2137    longest_line > MINIFIED_LINE_THRESHOLD && whitespace * 100 < sample.len().max(1)
2138}
2139
2140fn is_binary(bytes: &[u8]) -> bool {
2141    if bytes.starts_with(&[0xEF, 0xBB, 0xBF])
2142        || bytes.starts_with(&[0xFF, 0xFE])
2143        || bytes.starts_with(&[0xFE, 0xFF])
2144    {
2145        return false;
2146    }
2147
2148    let sample = &bytes[..bytes.len().min(BINARY_SAMPLE_BYTES)];
2149    sample.contains(&0)
2150}
2151
2152/// Decode a BOM-stripped UTF-16 byte slice using the given encoding.
2153/// Returns `(text, encoding_label, warnings)`.
2154fn decode_utf16_bom(
2155    bom_stripped: &[u8],
2156    encoding: &'static encoding_rs::Encoding,
2157    label: &str,
2158) -> (String, String, Vec<String>) {
2159    let (cow, _, had_errors) = encoding.decode(bom_stripped);
2160    let mut warnings = Vec::new();
2161    if had_errors {
2162        warnings.push(format!("{label} decode contained replacement characters"));
2163    }
2164    (cow.into_owned(), label.into(), warnings)
2165}
2166
2167fn decode_bytes(bytes: &[u8]) -> std::result::Result<(String, String, Vec<String>), String> {
2168    if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
2169        let text = String::from_utf8(bytes[3..].to_vec()).map_err(|err| err.to_string())?;
2170        return Ok((text, "utf-8-bom".into(), vec![]));
2171    }
2172    if bytes.starts_with(&[0xFF, 0xFE]) {
2173        return Ok(decode_utf16_bom(&bytes[2..], UTF_16LE, "utf-16le"));
2174    }
2175    if bytes.starts_with(&[0xFE, 0xFF]) {
2176        return Ok(decode_utf16_bom(&bytes[2..], UTF_16BE, "utf-16be"));
2177    }
2178
2179    // Multiple statements in the else branch make map_or_else awkward here.
2180    #[allow(clippy::option_if_let_else)]
2181    if let Ok(text) = String::from_utf8(bytes.to_vec()) {
2182        Ok((text, "utf-8".into(), vec![]))
2183    } else {
2184        let (cow, _, had_errors) = WINDOWS_1252.decode(bytes);
2185        let mut warnings = vec!["decoded using windows-1252 fallback".into()];
2186        if had_errors {
2187            warnings.push("fallback decode contained replacement characters".into());
2188        }
2189        Ok((cow.into_owned(), "windows-1252".into(), warnings))
2190    }
2191}
2192
2193fn compile_globset(patterns: &[String]) -> Result<Option<GlobSet>> {
2194    if patterns.is_empty() {
2195        return Ok(None);
2196    }
2197
2198    let mut builder = GlobSetBuilder::new();
2199    for pattern in patterns {
2200        builder
2201            .add(Glob::new(pattern).with_context(|| format!("invalid glob pattern: {pattern}"))?);
2202    }
2203    Ok(Some(
2204        builder.build().context("failed to compile glob filters")?,
2205    ))
2206}
2207
2208fn parse_enabled_languages(enabled: &[String]) -> Result<Option<BTreeSet<Language>>> {
2209    if enabled.is_empty() {
2210        return Ok(None);
2211    }
2212
2213    let supported = supported_languages();
2214    let mut set = BTreeSet::new();
2215    for name in enabled {
2216        let language = Language::from_name(name)
2217            .with_context(|| format!("unsupported language in config: {name}"))?;
2218        if !supported.contains(&language) {
2219            anyhow::bail!("language {name} is not supported in this build");
2220        }
2221        set.insert(language);
2222    }
2223    Ok(Some(set))
2224}
2225
2226/// # Errors
2227///
2228/// Returns an error if serialization fails or the output file cannot be written.
2229pub fn write_json(run: &AnalysisRun, output_path: &Path) -> Result<()> {
2230    let json = serde_json::to_string_pretty(run).context("failed to serialize analysis run")?;
2231    fs::write(output_path, json)
2232        .with_context(|| format!("failed to write JSON output to {}", output_path.display()))
2233}
2234
2235/// # Errors
2236///
2237/// Returns an error if the file cannot be read or the JSON cannot be parsed.
2238pub fn read_json(path: &Path) -> Result<AnalysisRun> {
2239    let contents = fs::read_to_string(path)
2240        .with_context(|| format!("failed to read result file {}", path.display()))?;
2241    serde_json::from_str(&contents)
2242        .with_context(|| format!("failed to parse JSON result {}", path.display()))
2243}
2244
2245#[cfg(test)]
2246mod tests {
2247    use super::*;
2248
2249    #[test]
2250    fn effective_counts_respect_code_only_policy() {
2251        let raw = RawLineCounts {
2252            code_only_lines: 2,
2253            single_comment_only_lines: 1,
2254            mixed_code_single_comment_lines: 3,
2255            docstring_comment_lines: 2,
2256            ..RawLineCounts::default()
2257        };
2258        let counts = compute_effective_counts(&raw, MixedLinePolicy::CodeOnly, true, true);
2259        assert_eq!(counts.code_lines, 5);
2260        assert_eq!(counts.comment_lines, 3);
2261    }
2262
2263    #[test]
2264    fn effective_counts_can_separate_mixed() {
2265        let raw = RawLineCounts {
2266            mixed_code_single_comment_lines: 2,
2267            mixed_code_multi_comment_lines: 1,
2268            ..RawLineCounts::default()
2269        };
2270        let counts =
2271            compute_effective_counts(&raw, MixedLinePolicy::SeparateMixedCategory, true, true);
2272        assert_eq!(counts.mixed_lines_separate, 3);
2273        assert_eq!(counts.code_lines, 0);
2274        assert_eq!(counts.comment_lines, 0);
2275    }
2276
2277    #[test]
2278    fn windows_1252_fallback_decodes() {
2279        let bytes = vec![0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x96, 0x57];
2280        let (text, encoding, warnings) = decode_bytes(&bytes).unwrap();
2281        assert_eq!(encoding, "windows-1252");
2282        assert!(text.contains('–'));
2283        assert!(!warnings.is_empty());
2284    }
2285
2286    // ── Pure predicate tests ─────────────────────────────────────────────────
2287
2288    #[test]
2289    fn is_binary_detects_null_byte() {
2290        let bytes = b"hello\x00world";
2291        assert!(is_binary(bytes));
2292    }
2293
2294    #[test]
2295    fn is_binary_clean_text_is_not_binary() {
2296        let bytes = b"fn main() { println!(\"hello\"); }";
2297        assert!(!is_binary(bytes));
2298    }
2299
2300    #[test]
2301    fn is_binary_utf8_bom_not_binary() {
2302        let bytes = b"\xef\xbb\xbffn main() {}";
2303        assert!(!is_binary(bytes));
2304    }
2305
2306    #[test]
2307    fn looks_generated_at_generated_marker() {
2308        let bytes = b"// @generated by protoc-gen-rust\nfn foo() {}";
2309        assert!(looks_generated(Path::new("foo.rs"), bytes));
2310    }
2311
2312    #[test]
2313    fn looks_generated_do_not_edit_marker() {
2314        // "Code generated by" triggers detection (contains the "generated by" substring).
2315        let bytes = b"// Code generated by build.rs. DO NOT EDIT.\nuse foo;";
2316        assert!(looks_generated(Path::new("foo.rs"), bytes));
2317        // @generated also triggers detection independently.
2318        let bytes2 = b"// @generated\nuse foo;";
2319        assert!(looks_generated(Path::new("foo.rs"), bytes2));
2320    }
2321
2322    #[test]
2323    fn looks_generated_normal_file_not_generated() {
2324        let bytes = b"fn main() {\n    println!(\"hello\");\n}\n";
2325        assert!(!looks_generated(Path::new("main.rs"), bytes));
2326    }
2327
2328    #[test]
2329    fn looks_minified_dot_min_filename() {
2330        let bytes = b"function a(){return 1}";
2331        assert!(looks_minified(Path::new("bundle.min.js"), bytes));
2332    }
2333
2334    #[test]
2335    fn looks_minified_normal_file_not_minified() {
2336        let bytes = b"function hello() {\n    return 1;\n}\n";
2337        assert!(!looks_minified(Path::new("app.js"), bytes));
2338    }
2339
2340    #[test]
2341    fn looks_minified_very_long_line() {
2342        let long_line: Vec<u8> = b"x".repeat(MINIFIED_LINE_THRESHOLD + 1);
2343        assert!(looks_minified(Path::new("app.js"), &long_line));
2344    }
2345
2346    #[test]
2347    fn is_known_lockfile_cargo_lock() {
2348        assert!(is_known_lockfile(Path::new("Cargo.lock")));
2349    }
2350
2351    #[test]
2352    fn is_known_lockfile_package_lock_json() {
2353        assert!(is_known_lockfile(Path::new("package-lock.json")));
2354    }
2355
2356    #[test]
2357    fn is_known_lockfile_yarn_lock() {
2358        assert!(is_known_lockfile(Path::new("yarn.lock")));
2359    }
2360
2361    #[test]
2362    fn is_known_lockfile_normal_file_is_not_lockfile() {
2363        assert!(!is_known_lockfile(Path::new("src/lib.rs")));
2364    }
2365
2366    #[test]
2367    fn is_vendor_path_node_modules() {
2368        assert!(is_vendor_path(Path::new("node_modules/react/index.js")));
2369    }
2370
2371    #[test]
2372    fn is_vendor_path_vendor_dir() {
2373        assert!(is_vendor_path(Path::new("vendor/anyhow/src/lib.rs")));
2374    }
2375
2376    #[test]
2377    fn is_vendor_path_normal_src_is_not_vendor() {
2378        assert!(!is_vendor_path(Path::new("src/lib.rs")));
2379    }
2380
2381    #[test]
2382    fn is_excluded_dir_path_matches_excluded() {
2383        let excluded = vec![".git".into(), "target".into()];
2384        assert!(is_excluded_dir_path(Path::new(".git/config"), &excluded));
2385    }
2386
2387    #[test]
2388    fn is_excluded_dir_path_non_excluded_is_ok() {
2389        let excluded = vec![".git".into(), "target".into()];
2390        assert!(!is_excluded_dir_path(Path::new("src/main.rs"), &excluded));
2391    }
2392
2393    #[test]
2394    fn decode_bytes_utf8_bom_stripped() {
2395        let bytes = b"\xef\xbb\xbffn main() {}";
2396        let (text, encoding, _) = decode_bytes(bytes).unwrap();
2397        // BOM is detected — encoding label includes "bom" indicator
2398        assert!(
2399            encoding.contains("utf-8"),
2400            "should be utf-8 variant, got {encoding}"
2401        );
2402        assert!(text.starts_with("fn"));
2403    }
2404
2405    #[test]
2406    fn decode_bytes_plain_utf8() {
2407        let bytes = b"hello world";
2408        let (text, encoding, warnings) = decode_bytes(bytes).unwrap();
2409        assert_eq!(encoding, "utf-8");
2410        assert_eq!(text, "hello world");
2411        assert!(warnings.is_empty());
2412    }
2413
2414    // ── UTF-16 BOM decoding ──────────────────────────────────────────────────
2415
2416    #[test]
2417    fn decode_bytes_utf16le_bom() {
2418        // Encode "hi" as UTF-16 LE with BOM: FF FE 68 00 69 00
2419        let mut bytes = vec![0xFF, 0xFE];
2420        for ch in "hi\n".encode_utf16() {
2421            bytes.extend_from_slice(&ch.to_le_bytes());
2422        }
2423        let (text, encoding, _warnings) = decode_bytes(&bytes).unwrap();
2424        assert_eq!(encoding, "utf-16le");
2425        assert!(text.contains('h') && text.contains('i'));
2426    }
2427
2428    #[test]
2429    fn decode_bytes_utf16be_bom() {
2430        // Encode "ok" as UTF-16 BE with BOM: FE FF 00 6F 00 6B
2431        let mut bytes = vec![0xFE, 0xFF];
2432        for ch in "ok\n".encode_utf16() {
2433            bytes.extend_from_slice(&ch.to_be_bytes());
2434        }
2435        let (text, encoding, _warnings) = decode_bytes(&bytes).unwrap();
2436        assert_eq!(encoding, "utf-16be");
2437        assert!(text.contains('o') && text.contains('k'));
2438    }
2439
2440    #[test]
2441    fn is_binary_utf16le_bom_not_binary() {
2442        // UTF-16 LE BOM followed by null bytes — should NOT be binary
2443        let bytes = &[0xFF, 0xFE, 0x68, 0x00];
2444        assert!(!is_binary(bytes));
2445    }
2446
2447    #[test]
2448    fn is_binary_utf16be_bom_not_binary() {
2449        let bytes = &[0xFE, 0xFF, 0x00, 0x68];
2450        assert!(!is_binary(bytes));
2451    }
2452
2453    // ── MixedLinePolicy branches ─────────────────────────────────────────────
2454
2455    #[test]
2456    fn effective_counts_code_and_comment_policy() {
2457        let raw = RawLineCounts {
2458            mixed_code_single_comment_lines: 3,
2459            mixed_code_multi_comment_lines: 2,
2460            ..RawLineCounts::default()
2461        };
2462        let counts = compute_effective_counts(&raw, MixedLinePolicy::CodeAndComment, true, true);
2463        // Both code and comment incremented by mixed_total (5)
2464        assert_eq!(counts.code_lines, 5);
2465        assert_eq!(counts.comment_lines, 5);
2466        assert_eq!(counts.mixed_lines_separate, 0);
2467    }
2468
2469    #[test]
2470    fn effective_counts_comment_only_policy() {
2471        let raw = RawLineCounts {
2472            mixed_code_single_comment_lines: 4,
2473            mixed_code_multi_comment_lines: 1,
2474            ..RawLineCounts::default()
2475        };
2476        let counts = compute_effective_counts(&raw, MixedLinePolicy::CommentOnly, true, true);
2477        assert_eq!(counts.code_lines, 0);
2478        assert_eq!(counts.comment_lines, 5);
2479        assert_eq!(counts.mixed_lines_separate, 0);
2480    }
2481
2482    #[test]
2483    fn effective_counts_docstrings_as_code_when_flag_false() {
2484        let raw = RawLineCounts {
2485            code_only_lines: 10,
2486            docstring_comment_lines: 3,
2487            ..RawLineCounts::default()
2488        };
2489        // python_docstrings_as_comments = false → docstrings counted as code
2490        let counts = compute_effective_counts(&raw, MixedLinePolicy::CodeOnly, false, true);
2491        assert_eq!(counts.code_lines, 13);
2492        assert_eq!(counts.comment_lines, 0);
2493    }
2494
2495    #[test]
2496    fn effective_counts_exclude_compiler_directives() {
2497        let raw = RawLineCounts {
2498            code_only_lines: 10,
2499            compiler_directive_lines: 3,
2500            ..RawLineCounts::default()
2501        };
2502        // count_compiler_directives = false → subtract directive lines from code
2503        let counts = compute_effective_counts(&raw, MixedLinePolicy::CodeOnly, true, false);
2504        assert_eq!(counts.code_lines, 7);
2505    }
2506
2507    #[test]
2508    fn effective_counts_directives_not_subtracted_below_zero() {
2509        let raw = RawLineCounts {
2510            code_only_lines: 2,
2511            compiler_directive_lines: 5, // more than code — saturating_sub
2512            ..RawLineCounts::default()
2513        };
2514        let counts = compute_effective_counts(&raw, MixedLinePolicy::CodeOnly, true, false);
2515        assert_eq!(counts.code_lines, 0); // saturated at 0
2516    }
2517
2518    // ── COCOMO modes ─────────────────────────────────────────────────────────
2519
2520    #[test]
2521    fn cocomo_organic_computes_positive_values() {
2522        let est = compute_cocomo(5_000, CocomoMode::Organic);
2523        assert!(est.ksloc > 0.0);
2524        assert!(est.effort_person_months > 0.0);
2525        assert!(est.duration_months > 0.0);
2526        assert!(est.avg_staff > 0.0);
2527        assert_eq!(est.mode, CocomoMode::Organic);
2528    }
2529
2530    #[test]
2531    fn cocomo_semi_detached_computes_positive_values() {
2532        let est = compute_cocomo(20_000, CocomoMode::SemiDetached);
2533        assert!(est.ksloc > 0.0);
2534        assert!(est.effort_person_months > 0.0);
2535        assert!(est.duration_months > 0.0);
2536        assert_eq!(est.mode, CocomoMode::SemiDetached);
2537    }
2538
2539    #[test]
2540    fn cocomo_embedded_computes_positive_values() {
2541        let est = compute_cocomo(100_000, CocomoMode::Embedded);
2542        assert!(est.effort_person_months > 0.0);
2543        assert_eq!(est.mode, CocomoMode::Embedded);
2544    }
2545
2546    #[test]
2547    fn cocomo_zero_lines_produces_zero_effort() {
2548        let est = compute_cocomo(0, CocomoMode::Organic);
2549        assert!((est.ksloc).abs() < f64::EPSILON);
2550        // Zero KSLOC → effort = 2.4 * 0^1.05 = 0
2551        assert!((est.effort_person_months - 0.0).abs() < 0.01);
2552    }
2553
2554    // ── Path / git helpers ────────────────────────────────────────────────────
2555
2556    #[test]
2557    fn parse_url_line_extracts_url() {
2558        assert_eq!(
2559            parse_url_line("url = https://example.com/repo.git"),
2560            Some("https://example.com/repo.git")
2561        );
2562    }
2563
2564    #[test]
2565    fn parse_url_line_returns_none_for_non_url_key() {
2566        assert_eq!(
2567            parse_url_line("fetch = +refs/heads/*:refs/remotes/origin/*"),
2568            None
2569        );
2570    }
2571
2572    #[test]
2573    fn parse_url_line_returns_none_for_empty_url() {
2574        assert_eq!(parse_url_line("url = "), None);
2575    }
2576
2577    #[test]
2578    fn looks_generated_generated_filename_extension() {
2579        // Files with ".generated." in name are detected without reading bytes
2580        let bytes = b"// normal code\n";
2581        assert!(looks_generated(Path::new("schema.generated.ts"), bytes));
2582    }
2583
2584    #[test]
2585    fn looks_generated_dot_g_extension() {
2586        let bytes = b"// normal code\n";
2587        assert!(looks_generated(Path::new("parser.g.cs"), bytes));
2588    }
2589
2590    #[test]
2591    fn looks_minified_whitespace_ratio_is_ok() {
2592        // Low whitespace ratio but NOT over the line length threshold → not minified
2593        let normal = b"var x=1,y=2,z=3;\n";
2594        assert!(!looks_minified(Path::new("app.js"), normal));
2595    }
2596
2597    #[test]
2598    fn is_known_lockfile_pnpm() {
2599        assert!(is_known_lockfile(Path::new("pnpm-lock.yaml")));
2600    }
2601
2602    #[test]
2603    fn is_known_lockfile_pipfile() {
2604        assert!(is_known_lockfile(Path::new("Pipfile.lock")));
2605    }
2606
2607    #[test]
2608    fn is_known_lockfile_poetry() {
2609        assert!(is_known_lockfile(Path::new("poetry.lock")));
2610    }
2611
2612    #[test]
2613    fn is_known_lockfile_composer() {
2614        assert!(is_known_lockfile(Path::new("composer.lock")));
2615    }
2616
2617    // ── relative_path_string and path_to_string ──────────────────────────────
2618
2619    #[test]
2620    fn relative_path_string_strips_root_prefix() {
2621        let path = Path::new("/tmp/project/src/lib.rs");
2622        let root = Path::new("/tmp/project");
2623        let rel = relative_path_string(path, root);
2624        assert_eq!(rel, "src/lib.rs");
2625    }
2626
2627    #[test]
2628    fn relative_path_string_falls_back_to_full_path() {
2629        // When path is not under root, fall back to path itself
2630        let path = Path::new("/other/dir/file.rs");
2631        let root = Path::new("/tmp/project");
2632        let rel = relative_path_string(path, root);
2633        // Should not panic; returns path representation
2634        assert!(!rel.is_empty());
2635    }
2636
2637    // ── find_duplicate_groups ────────────────────────────────────────────────
2638
2639    #[test]
2640    fn find_duplicate_groups_returns_empty_for_unique_hashes() {
2641        use sloc_languages::{Language, ParseMode, RawLineCounts};
2642        let make_rec = |hash: u64, path: &str| FileRecord {
2643            path: path.into(),
2644            relative_path: path.into(),
2645            language: Some(Language::Rust),
2646            size_bytes: 10,
2647            detected_encoding: Some("utf-8".into()),
2648            raw_line_categories: RawLineCounts::default(),
2649            effective_counts: EffectiveCounts::default(),
2650            status: FileStatus::AnalyzedExact,
2651            warnings: vec![],
2652            generated: false,
2653            minified: false,
2654            vendor: false,
2655            parse_mode: Some(ParseMode::Lexical),
2656            submodule: None,
2657            coverage: None,
2658            style_analysis: None,
2659            cyclomatic_complexity: None,
2660            lsloc: None,
2661            content_hash: hash,
2662        };
2663        let analyzed = vec![make_rec(111, "a.rs"), make_rec(222, "b.rs")];
2664        let groups = find_duplicate_groups(&analyzed);
2665        assert!(groups.is_empty());
2666    }
2667
2668    #[test]
2669    fn find_duplicate_groups_returns_group_for_same_hash() {
2670        use sloc_languages::{Language, ParseMode, RawLineCounts};
2671        let make_rec = |hash: u64, path: &str| FileRecord {
2672            path: path.into(),
2673            relative_path: path.into(),
2674            language: Some(Language::Rust),
2675            size_bytes: 10,
2676            detected_encoding: Some("utf-8".into()),
2677            raw_line_categories: RawLineCounts::default(),
2678            effective_counts: EffectiveCounts::default(),
2679            status: FileStatus::AnalyzedExact,
2680            warnings: vec![],
2681            generated: false,
2682            minified: false,
2683            vendor: false,
2684            parse_mode: Some(ParseMode::Lexical),
2685            submodule: None,
2686            coverage: None,
2687            style_analysis: None,
2688            cyclomatic_complexity: None,
2689            lsloc: None,
2690            content_hash: hash,
2691        };
2692        let analyzed = vec![
2693            make_rec(999, "a.rs"),
2694            make_rec(999, "b.rs"),
2695            make_rec(123, "c.rs"),
2696        ];
2697        let groups = find_duplicate_groups(&analyzed);
2698        assert_eq!(groups.len(), 1);
2699        assert_eq!(groups[0].len(), 2);
2700    }
2701
2702    #[test]
2703    fn find_duplicate_groups_ignores_zero_hash() {
2704        use sloc_languages::{Language, ParseMode, RawLineCounts};
2705        let make_rec = |hash: u64, path: &str| FileRecord {
2706            path: path.into(),
2707            relative_path: path.into(),
2708            language: Some(Language::Rust),
2709            size_bytes: 10,
2710            detected_encoding: Some("utf-8".into()),
2711            raw_line_categories: RawLineCounts::default(),
2712            effective_counts: EffectiveCounts::default(),
2713            status: FileStatus::AnalyzedExact,
2714            warnings: vec![],
2715            generated: false,
2716            minified: false,
2717            vendor: false,
2718            parse_mode: Some(ParseMode::Lexical),
2719            submodule: None,
2720            coverage: None,
2721            style_analysis: None,
2722            cyclomatic_complexity: None,
2723            lsloc: None,
2724            content_hash: hash,
2725        };
2726        // hash=0 means "not computed" — must be excluded from duplicate detection
2727        let analyzed = vec![make_rec(0, "a.rs"), make_rec(0, "b.rs")];
2728        let groups = find_duplicate_groups(&analyzed);
2729        assert!(
2730            groups.is_empty(),
2731            "zero-hash files must not be grouped as duplicates"
2732        );
2733    }
2734
2735    // ── detect_submodules ────────────────────────────────────────────────────
2736
2737    #[test]
2738    fn detect_submodules_no_gitmodules_returns_empty() {
2739        let dir = tempfile::tempdir().unwrap();
2740        let result = detect_submodules(dir.path());
2741        assert!(result.is_empty());
2742    }
2743
2744    #[test]
2745    fn detect_submodules_parses_gitmodules_file() {
2746        let dir = tempfile::tempdir().unwrap();
2747        let content = "[submodule \"vendor/lib\"]\n\tpath = vendor/lib\n\turl = https://github.com/example/lib.git\n";
2748        std::fs::write(dir.path().join(".gitmodules"), content).unwrap();
2749        let result = detect_submodules(dir.path());
2750        assert_eq!(result.len(), 1);
2751        assert_eq!(result[0].0, "vendor/lib");
2752    }
2753
2754    // ── write_json / read_json roundtrip ─────────────────────────────────────
2755
2756    #[test]
2757    fn write_json_read_json_roundtrip() {
2758        use chrono::Utc;
2759        use sloc_config::AppConfig;
2760        use sloc_languages::{Language, ParseMode, RawLineCounts};
2761        let dir = tempfile::tempdir().unwrap();
2762        let run = AnalysisRun {
2763            tool: ToolMetadata {
2764                name: "sloc".into(),
2765                version: "0.0.1".into(),
2766                run_id: "test-roundtrip".into(),
2767                timestamp_utc: Utc::now(),
2768            },
2769            environment: EnvironmentMetadata {
2770                operating_system: "test".into(),
2771                architecture: "x86_64".into(),
2772                runtime_mode: "test".into(),
2773                initiator_username: "tester".into(),
2774                initiator_hostname: "testhost".into(),
2775                ci_name: None,
2776            },
2777            effective_configuration: AppConfig::default(),
2778            input_roots: vec!["/tmp/test".into()],
2779            summary_totals: SummaryTotals {
2780                files_analyzed: 1,
2781                code_lines: 5,
2782                ..SummaryTotals::default()
2783            },
2784            totals_by_language: vec![],
2785            per_file_records: vec![FileRecord {
2786                path: "a.rs".into(),
2787                relative_path: "a.rs".into(),
2788                language: Some(Language::Rust),
2789                size_bytes: 50,
2790                detected_encoding: Some("utf-8".into()),
2791                raw_line_categories: RawLineCounts {
2792                    code_only_lines: 5,
2793                    ..RawLineCounts::default()
2794                },
2795                effective_counts: EffectiveCounts {
2796                    code_lines: 5,
2797                    ..EffectiveCounts::default()
2798                },
2799                status: FileStatus::AnalyzedExact,
2800                warnings: vec![],
2801                generated: false,
2802                minified: false,
2803                vendor: false,
2804                parse_mode: Some(ParseMode::Lexical),
2805                submodule: None,
2806                coverage: None,
2807                style_analysis: None,
2808                cyclomatic_complexity: None,
2809                lsloc: None,
2810                content_hash: 0,
2811            }],
2812            skipped_file_records: vec![],
2813            warnings: vec![],
2814            submodule_summaries: vec![],
2815            git_commit_short: Some("abc1234".into()),
2816            git_branch: Some("main".into()),
2817            git_commit_long: None,
2818            git_commit_author: None,
2819            git_tags: None,
2820            git_nearest_tag: None,
2821            git_commit_date: None,
2822            git_remote_url: None,
2823            style_summary: None,
2824            cocomo: None,
2825            uloc: 0,
2826            dryness_pct: None,
2827            duplicate_groups: vec![],
2828            duplicates_excluded: 0,
2829        };
2830        let json_path = dir.path().join("test.json");
2831        write_json(&run, &json_path).unwrap();
2832        let loaded = read_json(&json_path).unwrap();
2833        assert_eq!(loaded.summary_totals.files_analyzed, 1);
2834        assert_eq!(loaded.summary_totals.code_lines, 5);
2835        assert_eq!(loaded.git_commit_short.as_deref(), Some("abc1234"));
2836        assert_eq!(loaded.git_branch.as_deref(), Some("main"));
2837        assert_eq!(loaded.per_file_records.len(), 1);
2838    }
2839
2840    // ── detect_ci_system ─────────────────────────────────────────────────────
2841
2842    #[test]
2843    fn detect_ci_system_returns_none_without_env_vars() {
2844        // Remove known CI env vars so detection returns None
2845        for var in &[
2846            "JENKINS_URL",
2847            "JENKINS_HOME",
2848            "BUILD_URL",
2849            "GITHUB_ACTIONS",
2850            "GITLAB_CI",
2851            "CIRCLECI",
2852            "TRAVIS",
2853            "TF_BUILD",
2854            "TEAMCITY_VERSION",
2855        ] {
2856            std::env::remove_var(var);
2857        }
2858        // Result depends on test runner env; just assert no panic
2859        let _ = detect_ci_system();
2860    }
2861
2862    // ── resolve_git_file_pointer ──────────────────────────────────────────────
2863
2864    #[test]
2865    fn resolve_git_file_pointer_valid_absolute_gitdir() {
2866        let dir = tempfile::tempdir().unwrap();
2867        // Create a real target directory (the "real" git dir)
2868        let real_git = dir.path().join("real.git");
2869        fs::create_dir_all(&real_git).unwrap();
2870        // Write a .git file pointing at the real git dir
2871        let git_file = dir.path().join(".git");
2872        fs::write(&git_file, format!("gitdir: {}\n", real_git.display())).unwrap();
2873
2874        let result = resolve_git_file_pointer(&git_file, dir.path());
2875        // Should resolve to the real git dir (or its canonicalized form)
2876        assert!(
2877            result.is_some(),
2878            "should resolve a valid absolute gitdir pointer"
2879        );
2880        assert!(result.unwrap().is_dir());
2881    }
2882
2883    #[test]
2884    fn resolve_git_file_pointer_missing_gitdir_prefix_returns_none() {
2885        let dir = tempfile::tempdir().unwrap();
2886        let git_file = dir.path().join(".git");
2887        fs::write(&git_file, "not a gitdir line\n").unwrap();
2888        assert!(resolve_git_file_pointer(&git_file, dir.path()).is_none());
2889    }
2890
2891    #[test]
2892    fn resolve_git_file_pointer_unreadable_path_returns_none() {
2893        assert!(resolve_git_file_pointer(
2894            Path::new("/nonexistent/__sloc_test_git_file__"),
2895            Path::new("/nonexistent")
2896        )
2897        .is_none());
2898    }
2899
2900    #[test]
2901    fn resolve_git_file_pointer_nonexistent_target_returns_none() {
2902        let dir = tempfile::tempdir().unwrap();
2903        let git_file = dir.path().join(".git");
2904        fs::write(&git_file, "gitdir: /nonexistent/__sloc_fake_gitdir_xyz__\n").unwrap();
2905        // Target does not exist → returns None
2906        assert!(resolve_git_file_pointer(&git_file, dir.path()).is_none());
2907    }
2908
2909    #[test]
2910    fn resolve_git_file_pointer_relative_path() {
2911        let dir = tempfile::tempdir().unwrap();
2912        let real_git = dir.path().join("real_git_dir");
2913        fs::create_dir_all(&real_git).unwrap();
2914        let git_file = dir.path().join(".git");
2915        // Relative path — should be resolved relative to base_dir
2916        fs::write(&git_file, "gitdir: real_git_dir\n").unwrap();
2917        let result = resolve_git_file_pointer(&git_file, dir.path());
2918        assert!(result.is_some());
2919    }
2920
2921    // ── resolve_ref ──────────────────────────────────────────────────────────
2922
2923    #[test]
2924    fn resolve_ref_from_loose_file() {
2925        let dir = tempfile::tempdir().unwrap();
2926        let git_dir = dir.path();
2927        fs::create_dir_all(git_dir.join("refs/heads")).unwrap();
2928        let sha = "abc1234567890abcdef1234567890abcdef123456";
2929        fs::write(git_dir.join("refs/heads/main"), format!("{sha}\n")).unwrap();
2930
2931        let result = resolve_ref(git_dir, "refs/heads/main");
2932        assert_eq!(result.as_deref(), Some(sha));
2933    }
2934
2935    #[test]
2936    fn resolve_ref_from_packed_refs() {
2937        let dir = tempfile::tempdir().unwrap();
2938        let git_dir = dir.path();
2939        let sha = "def5678def5678def5678def5678def5678def56";
2940        fs::write(
2941            git_dir.join("packed-refs"),
2942            format!("# pack-refs with: peeled fully-peeled sorted\n{sha} refs/heads/feature\n"),
2943        )
2944        .unwrap();
2945
2946        let result = resolve_ref(git_dir, "refs/heads/feature");
2947        assert_eq!(result.as_deref(), Some(sha));
2948    }
2949
2950    #[test]
2951    fn resolve_ref_not_found_returns_none() {
2952        let dir = tempfile::tempdir().unwrap();
2953        let result = resolve_ref(dir.path(), "refs/heads/nonexistent-branch-xyz");
2954        assert!(result.is_none());
2955    }
2956
2957    #[test]
2958    fn resolve_ref_packed_refs_skips_comment_and_peeled() {
2959        let dir = tempfile::tempdir().unwrap();
2960        let git_dir = dir.path();
2961        let sha = "aaa1111aaa1111aaa1111aaa1111aaa1111aaa11";
2962        fs::write(
2963            git_dir.join("packed-refs"),
2964            format!("# comment\n^peeled-object-sha\n{sha} refs/tags/v1.0\n"),
2965        )
2966        .unwrap();
2967
2968        let result = resolve_ref(git_dir, "refs/tags/v1.0");
2969        assert_eq!(result.as_deref(), Some(sha));
2970    }
2971
2972    #[test]
2973    fn resolve_ref_loose_sha_too_short_falls_through_to_packed() {
2974        let dir = tempfile::tempdir().unwrap();
2975        let git_dir = dir.path();
2976        fs::create_dir_all(git_dir.join("refs/heads")).unwrap();
2977        // Write an invalid (too short) SHA to the loose file
2978        fs::write(git_dir.join("refs/heads/main"), "short\n").unwrap();
2979        // No packed-refs → None
2980        let result = resolve_ref(git_dir, "refs/heads/main");
2981        assert!(result.is_none());
2982    }
2983
2984    // ── read_git_remote_url ───────────────────────────────────────────────────
2985
2986    #[test]
2987    fn read_git_remote_url_parses_origin_url() {
2988        let dir = tempfile::tempdir().unwrap();
2989        let git_dir = dir.path().join(".git");
2990        fs::create_dir_all(&git_dir).unwrap();
2991        fs::write(
2992            git_dir.join("config"),
2993            "[core]\n\trepositoryformatversion = 0\n[remote \"origin\"]\n\turl = https://github.com/org/repo.git\n\tfetch = +refs/heads/*:refs/remotes/origin/*\n",
2994        )
2995        .unwrap();
2996        let url = read_git_remote_url(&git_dir);
2997        assert_eq!(url.as_deref(), Some("https://github.com/org/repo.git"));
2998    }
2999
3000    #[test]
3001    fn read_git_remote_url_no_config_returns_none() {
3002        let dir = tempfile::tempdir().unwrap();
3003        let git_dir = dir.path().join(".git");
3004        fs::create_dir_all(&git_dir).unwrap();
3005        // No config file
3006        let url = read_git_remote_url(&git_dir);
3007        assert!(url.is_none());
3008    }
3009
3010    // ── detect_git_for_run — HEAD edge cases ──────────────────────────────────
3011
3012    #[test]
3013    fn detect_git_for_run_no_git_dir_returns_default() {
3014        let dir = tempfile::tempdir().unwrap();
3015        // No .git directory or file
3016        let info = detect_git_for_run(dir.path());
3017        assert!(info.commit_long.is_none());
3018    }
3019
3020    #[test]
3021    fn detect_git_for_run_unreadable_head_returns_default() {
3022        let dir = tempfile::tempdir().unwrap();
3023        let git_dir = dir.path().join(".git");
3024        fs::create_dir_all(&git_dir).unwrap();
3025        // .git directory exists but no HEAD file → read fails → early return
3026        let info = detect_git_for_run(dir.path());
3027        assert!(info.commit_long.is_none());
3028    }
3029
3030    #[test]
3031    fn detect_git_for_run_detached_head_with_sha() {
3032        let dir = tempfile::tempdir().unwrap();
3033        let git_dir = dir.path().join(".git");
3034        fs::create_dir_all(&git_dir).unwrap();
3035        // Exactly 40 hex chars — the code checks len >= 40 and takes [..40]
3036        let sha = "abc1234567890abcdef1234567890abcdef12345";
3037        fs::write(git_dir.join("HEAD"), sha).unwrap();
3038        let info = detect_git_for_run(dir.path());
3039        // Detached HEAD — commit_long should be the first 40 chars of HEAD
3040        assert_eq!(info.commit_long.as_deref(), Some(sha));
3041        assert_eq!(info.commit_short.as_deref(), Some("abc1234"));
3042    }
3043
3044    #[test]
3045    fn detect_git_for_run_with_packed_ref() {
3046        let dir = tempfile::tempdir().unwrap();
3047        let git_dir = dir.path().join(".git");
3048        fs::create_dir_all(&git_dir).unwrap();
3049        // HEAD points to a ref resolved via packed-refs
3050        fs::write(git_dir.join("HEAD"), "ref: refs/heads/main\n").unwrap();
3051        let sha = "deadbeef00000000000000000000000000000000";
3052        fs::write(
3053            git_dir.join("packed-refs"),
3054            format!("# pack-refs\n{sha} refs/heads/main\n"),
3055        )
3056        .unwrap();
3057        let info = detect_git_for_run(dir.path());
3058        assert_eq!(info.commit_long.as_deref(), Some(sha));
3059        assert_eq!(info.branch.as_deref(), Some("main"));
3060    }
3061
3062    // ── ci_branch_from_env ───────────────────────────────────────────────────
3063
3064    // Note: ci_branch_from_env env-var tests share a mutex to avoid parallel interference.
3065    use std::sync::{Mutex, OnceLock};
3066    static CI_ENV_LOCK: OnceLock<Mutex<()>> = OnceLock::new();
3067    fn ci_env_lock() -> std::sync::MutexGuard<'static, ()> {
3068        CI_ENV_LOCK.get_or_init(|| Mutex::new(())).lock().unwrap()
3069    }
3070
3071    fn clear_branch_env_vars() {
3072        for v in &[
3073            "BRANCH_NAME",
3074            "GIT_BRANCH",
3075            "GITHUB_REF_NAME",
3076            "CI_COMMIT_BRANCH",
3077            "CIRCLE_BRANCH",
3078            "TRAVIS_BRANCH",
3079            "BUILD_SOURCEBRANCH",
3080        ] {
3081            std::env::remove_var(v);
3082        }
3083    }
3084
3085    #[test]
3086    fn ci_branch_from_env_strips_refs_heads_prefix() {
3087        let _lock = ci_env_lock();
3088        clear_branch_env_vars();
3089        // Azure DevOps sets BUILD_SOURCEBRANCH = "refs/heads/main"
3090        std::env::set_var("BUILD_SOURCEBRANCH", "refs/heads/my-branch");
3091        let branch = ci_branch_from_env();
3092        clear_branch_env_vars();
3093        assert_eq!(branch.as_deref(), Some("my-branch"));
3094    }
3095
3096    #[test]
3097    fn ci_branch_from_env_strips_origin_prefix() {
3098        let _lock = ci_env_lock();
3099        clear_branch_env_vars();
3100        std::env::set_var("GIT_BRANCH", "origin/develop");
3101        let branch = ci_branch_from_env();
3102        clear_branch_env_vars();
3103        assert_eq!(branch.as_deref(), Some("develop"));
3104    }
3105
3106    #[test]
3107    fn ci_branch_from_env_returns_none_for_head() {
3108        let _lock = ci_env_lock();
3109        clear_branch_env_vars();
3110        // "HEAD" is filtered out; with no other vars, should return None
3111        std::env::set_var("BRANCH_NAME", "HEAD");
3112        let branch = ci_branch_from_env();
3113        clear_branch_env_vars();
3114        // HEAD value is filtered → None (or falls through to other vars, but all cleared)
3115        assert!(branch.is_none(), "HEAD should be filtered, got: {branch:?}");
3116    }
3117}
sloc_core/lib.rs

sloc_core/
lib.rs