sloc_core/
lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright (C) 2026 Nima Shafie <nimzshafie@gmail.com>
3#![allow(clippy::multiple_crate_versions)]
4
5pub mod baseline;
6pub mod coverage;
7pub mod delta;
8pub mod history;
9pub use baseline::{check_against_baseline, resolve_baselines_path, BaselineEntry, BaselineStore};
10pub use coverage::{aggregate_line_coverage, lookup_coverage, parse_lcov, FileCoverage};
11pub use delta::{compute_delta, FileChangeStatus, FileDelta, ScanComparison, SummaryDelta};
12pub use history::{RegistryEntry, ScanRegistry, ScanSummarySnapshot, WatchedDirsStore};
13
14use std::collections::{BTreeMap, BTreeSet, HashSet};
15use std::fs;
16use std::path::{Path, PathBuf};
17use std::sync::atomic::{AtomicBool, Ordering};
18
19use anyhow::{Context, Result};
20use chrono::{DateTime, Utc};
21use encoding_rs::{UTF_16BE, UTF_16LE, WINDOWS_1252};
22use globset::{Glob, GlobSet, GlobSetBuilder};
23use ignore::WalkBuilder;
24use serde::{Deserialize, Serialize};
25use uuid::Uuid;
26
27use sloc_config::{
28    AppConfig, BinaryFileBehavior, BlankInBlockCommentPolicy, ContinuationLinePolicy,
29    FailureBehavior, MixedLinePolicy,
30};
31use sloc_languages::{
32    analyze_text, detect_language, supported_languages, AnalysisOptions, Language, ParseMode,
33    RawLineCounts,
34};
35
// ── Analysis thread limits, detection sample sizes and thresholds ────────────

/// Maximum number of worker threads used for parallel file analysis.
/// `run_parallel_analysis` clamps `available_parallelism` to this value.
const MAX_ANALYSIS_THREADS: usize = 16;
/// Fallback thread count when `available_parallelism` is unavailable.
const DEFAULT_ANALYSIS_THREADS: usize = 4;
/// Byte sample used to detect `@generated` markers.
const GENERATED_SAMPLE_BYTES: usize = 1024;
/// Byte sample used to detect minified files via line-length heuristic.
const MINIFIED_SAMPLE_BYTES: usize = 4096;
/// Line length above which a file is considered minified.
// NOTE(review): the unit (bytes vs. chars) depends on the consumer, which is
// not visible in this part of the file — confirm.
const MINIFIED_LINE_THRESHOLD: usize = 2000;
/// Byte sample used to detect binary files via null-byte scan.
const BINARY_SAMPLE_BYTES: usize = 8192;
50
/// Three-way outcome for metadata-level policy checks.
///
/// Returned by `check_metadata_policy`; `skip_with_reason` builds the `Skip`
/// variant.
enum MetadataPolicyOutcome {
    /// Skip this file — include the record in output.
    // The record is boxed; presumably to keep this enum small — confirm.
    Skip(Box<FileRecord>),
    /// Exclude this file entirely — no record in output (include-glob miss).
    Exclude,
    /// Continue to content checks.
    Continue,
}
60
/// Final disposition of a single file, serialized in `snake_case`.
///
/// `push_record` routes the two `Analyzed*` variants to the analyzed list and
/// every other variant to the skipped list.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FileStatus {
    /// Analyzed; counted as an analyzed file.
    AnalyzedExact,
    /// Analyzed; counted as an analyzed file.
    // NOTE(review): presumably produced by a lower-fidelity parse mode — confirm.
    AnalyzedBestEffort,
    /// Not analyzed; recorded among the skipped files.
    SkippedBinary,
    /// Not analyzed; recorded among the skipped files.
    SkippedDecodeError,
    /// Not analyzed; recorded among the skipped files.
    SkippedUnsupported,
    /// Not analyzed; used by `skip_with_reason` for policy-driven skips.
    SkippedByPolicy,
    /// Not analyzed; recorded among the skipped files.
    ErrorInternal,
}
72
/// Per-file line totals stored in `FileRecord::effective_counts`.
///
/// All counters default to zero.
// NOTE(review): presumably these are the raw categories after the configured
// mixed-line / continuation policies have been applied — confirm.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct EffectiveCounts {
    /// Lines counted as code.
    pub code_lines: u64,
    /// Lines counted as comments.
    pub comment_lines: u64,
    /// Lines counted as blank.
    pub blank_lines: u64,
    /// Mixed code/comment lines that are tallied separately.
    pub mixed_lines_separate: u64,
}
80
/// Identity of the tool and of one analysis run; filled in by `assemble_run`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolMetadata {
    /// Tool name; `assemble_run` sets this to `"sloc"`.
    pub name: String,
    /// Crate version taken from `CARGO_PKG_VERSION` at build time.
    pub version: String,
    /// Run identifier of the form `<YYYYMMDD-HHMM>-<uuid v4, simple format>`.
    pub run_id: String,
    /// UTC time at which the run record was assembled.
    pub timestamp_utc: DateTime<Utc>,
}
88
/// Host environment captured by `assemble_run` when the run record is built.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnvironmentMetadata {
    /// Value of `std::env::consts::OS`.
    pub operating_system: String,
    /// Value of `std::env::consts::ARCH`.
    pub architecture: String,
    /// Caller-supplied `runtime_mode` string passed to `analyze`.
    pub runtime_mode: String,
    /// From the `USERNAME` / `USER` environment variables, else `"unknown"`.
    pub initiator_username: String,
    /// From `COMPUTERNAME` / `HOSTNAME` / `/etc/hostname`, else `"unknown"`.
    pub initiator_hostname: String,
}
97
/// Run-wide totals stored in `AnalysisRun::summary_totals`.
///
/// Fields marked `#[serde(default)]` deserialize to `0` when absent from the
/// input, so records that lack them still load.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SummaryTotals {
    pub files_considered: u64,
    pub files_analyzed: u64,
    pub files_skipped: u64,
    pub total_physical_lines: u64,
    pub code_lines: u64,
    pub comment_lines: u64,
    pub blank_lines: u64,
    pub mixed_lines_separate: u64,
    #[serde(default)]
    pub functions: u64,
    #[serde(default)]
    pub classes: u64,
    #[serde(default)]
    pub variables: u64,
    #[serde(default)]
    pub imports: u64,
    #[serde(default)]
    pub test_count: u64,
    /// Lexically detected test assertion call lines across all analyzed files.
    #[serde(default)]
    pub test_assertion_count: u64,
    /// Lexically detected test suite / fixture / group declaration lines across all analyzed files.
    #[serde(default)]
    pub test_suite_count: u64,
    /// Aggregated from LCOV data when provided.
    #[serde(default)]
    pub coverage_lines_found: u64,
    /// Aggregated from LCOV data when provided.
    #[serde(default)]
    pub coverage_lines_hit: u64,
    /// Aggregated from LCOV data when provided.
    #[serde(default)]
    pub coverage_functions_found: u64,
    /// Aggregated from LCOV data when provided.
    #[serde(default)]
    pub coverage_functions_hit: u64,
    /// Aggregated from LCOV data when provided.
    #[serde(default)]
    pub coverage_branches_found: u64,
    /// Aggregated from LCOV data when provided.
    #[serde(default)]
    pub coverage_branches_hit: u64,
}
138
/// Totals for a single language; used in `AnalysisRun::totals_by_language`
/// and `SubmoduleSummary::language_summaries`.
///
/// Fields marked `#[serde(default)]` deserialize to `0` when absent from the
/// input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageSummary {
    /// Language these totals belong to.
    pub language: Language,
    /// Number of files contributing to this summary.
    pub files: u64,
    pub total_physical_lines: u64,
    pub code_lines: u64,
    pub comment_lines: u64,
    pub blank_lines: u64,
    pub mixed_lines_separate: u64,
    #[serde(default)]
    pub functions: u64,
    #[serde(default)]
    pub classes: u64,
    #[serde(default)]
    pub variables: u64,
    #[serde(default)]
    pub imports: u64,
    #[serde(default)]
    pub test_count: u64,
    #[serde(default)]
    pub test_assertion_count: u64,
    #[serde(default)]
    pub test_suite_count: u64,
    #[serde(default)]
    pub coverage_lines_found: u64,
    #[serde(default)]
    pub coverage_lines_hit: u64,
    #[serde(default)]
    pub coverage_functions_found: u64,
    #[serde(default)]
    pub coverage_functions_hit: u64,
    #[serde(default)]
    pub coverage_branches_found: u64,
    #[serde(default)]
    pub coverage_branches_hit: u64,
}
175
/// Result for one candidate file, whether it was analyzed or skipped.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileRecord {
    pub path: String,
    /// Path relative to the scan root. Used as the sort key for output, as the
    /// prefix for per-file warnings, and for submodule / coverage matching.
    // NOTE(review): submodule matching compares against '/'-separated
    // prefixes, so this presumably uses forward slashes — confirm.
    pub relative_path: String,
    pub language: Option<Language>,
    pub size_bytes: u64,
    pub detected_encoding: Option<String>,
    pub raw_line_categories: RawLineCounts,
    pub effective_counts: EffectiveCounts,
    /// Decides whether the record lands in the analyzed or the skipped list.
    pub status: FileStatus,
    /// Per-file warnings; `push_record` copies them into the run-level list.
    pub warnings: Vec<String>,
    pub generated: bool,
    pub minified: bool,
    pub vendor: bool,
    pub parse_mode: Option<ParseMode>,
    /// Name of the containing submodule; set by `process_submodules` and
    /// omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub submodule: Option<String>,
    /// Line/function/branch coverage from an external LCOV file, when provided.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub coverage: Option<FileCoverage>,
}
197
/// Per-submodule aggregated stats produced when `submodule_breakdown` is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SubmoduleSummary {
    /// Submodule name; matches `FileRecord::submodule` of its member files.
    pub name: String,
    /// Submodule location.
    // NOTE(review): presumably relative to the first scan root — confirm.
    pub relative_path: String,
    pub files_analyzed: u64,
    pub total_physical_lines: u64,
    pub code_lines: u64,
    pub comment_lines: u64,
    pub blank_lines: u64,
    /// Per-language breakdown for the files in this submodule.
    pub language_summaries: Vec<LanguageSummary>,
}
210
/// Complete, serializable result of one `analyze` call.
///
/// The optional git fields are omitted from serialized output when `None` and
/// default to `None` when missing on input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnalysisRun {
    /// Tool identity and run id.
    pub tool: ToolMetadata,
    /// Host environment at scan time.
    pub environment: EnvironmentMetadata,
    /// Clone of the configuration the run was executed with.
    pub effective_configuration: AppConfig,
    /// String form of each configured root path.
    pub input_roots: Vec<String>,
    pub summary_totals: SummaryTotals,
    pub totals_by_language: Vec<LanguageSummary>,
    /// Records with an `Analyzed*` status, sorted by relative path.
    pub per_file_records: Vec<FileRecord>,
    /// Records with any other status, sorted by relative path.
    pub skipped_file_records: Vec<FileRecord>,
    /// Run-level warnings, including per-file warnings prefixed with the
    /// file's relative path.
    pub warnings: Vec<String>,
    /// Non-empty only when `discovery.submodule_breakdown` is enabled.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub submodule_summaries: Vec<SubmoduleSummary>,
    /// Short git commit SHA (7 chars) at scan time, if the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_short: Option<String>,
    /// Full git commit SHA at scan time, if the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_long: Option<String>,
    /// Git branch active at scan time, if the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_branch: Option<String>,
    /// Author of the last git commit at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_author: Option<String>,
    /// Comma-separated git tags pointing at HEAD at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_tags: Option<String>,
    /// Nearest ancestor release tag (output of `git describe --tags --abbrev=0`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_nearest_tag: Option<String>,
    /// ISO 8601 author-date of the last git commit at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_date: Option<String>,
    /// URL of the `origin` remote as recorded in `.git/config` at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_remote_url: Option<String>,
}
250
/// Git metadata gathered by `detect_git_for_run`; every field is `None` when
/// the value could not be determined. Copied field-by-field into the
/// `git_*` members of `AnalysisRun`.
#[derive(Default)]
struct GitInfo {
    /// First 7 characters of `commit_long`.
    commit_short: Option<String>,
    /// Commit SHA that HEAD resolves to.
    commit_long: Option<String>,
    /// Branch name when HEAD points at `refs/heads/<name>`.
    branch: Option<String>,
    /// Name taken from the last `logs/HEAD` (reflog) entry.
    author: Option<String>,
    /// Comma-separated output of `git tag --points-at HEAD`.
    tags: Option<String>,
    /// Output of `git describe --tags --abbrev=0 HEAD`.
    nearest_tag: Option<String>,
    /// Timestamp taken from the last `logs/HEAD` (reflog) entry.
    commit_date: Option<String>,
    /// `url` of the `origin` remote read from the git `config` file.
    remote_url: Option<String>,
}
262
263/// Locate the `.git` directory by walking up from `start`.
264/// Handles plain repos, worktrees (`.git` is a file with `gitdir:` pointer), and
265/// submodules. Returns `None` if no git repo is found.
266fn find_git_dir(start: &Path) -> Option<PathBuf> {
267    let mut current = Some(start);
268    while let Some(dir) = current {
269        let candidate = dir.join(".git");
270        if candidate.is_dir() {
271            return Some(candidate);
272        }
273        if candidate.is_file() {
274            if let Some(resolved) = resolve_git_file_pointer(&candidate, dir) {
275                return Some(resolved);
276            }
277        }
278        current = dir.parent();
279    }
280    None
281}
282
/// Follow a `.git` pointer *file* (as used by worktrees and submodules) to the
/// directory it names.
///
/// * `file` — the `.git` file whose content is expected to be `gitdir: <path>`.
/// * `base_dir` — directory that relative pointer paths are resolved against.
///
/// Returns `None` when the file cannot be read, does not start with
/// `gitdir: `, or the target is not an existing directory.
fn resolve_git_file_pointer(file: &Path, base_dir: &Path) -> Option<PathBuf> {
    let raw = fs::read_to_string(file).ok()?;
    let target = raw.trim().strip_prefix("gitdir: ")?;

    // Git writes forward slashes; convert to the OS separator so the path
    // operations below behave on Windows too.
    let native = target.replace('/', std::path::MAIN_SEPARATOR_STR);
    let native_path = Path::new(&native);
    let joined = if native_path.is_absolute() {
        native_path.to_path_buf()
    } else {
        base_dir.join(native_path)
    };

    // Prefer the canonical form (".." and symlinks resolved), but keep the
    // plain joined path when canonicalization fails.
    let pointed = match joined.canonicalize() {
        Ok(canonical) => canonical,
        Err(_) => joined,
    };
    pointed.is_dir().then_some(pointed)
}
307
/// Resolve a git ref name (e.g. `refs/heads/main`) to a full commit SHA.
///
/// Lookup order:
/// 1. the loose ref file and then `packed-refs` inside `git_dir`;
/// 2. if `git_dir` contains a `commondir` file (linked worktree), the same two
///    places inside the directory that file names, because branch refs are
///    shared through the common directory rather than stored per worktree.
///
/// Returns `None` when the ref cannot be found or its value is not a
/// hexadecimal object id of at least 40 characters.
fn resolve_ref(git_dir: &Path, refname: &str) -> Option<String> {
    if let Some(sha) = lookup_ref_in(git_dir, refname) {
        return Some(sha);
    }
    // `commondir` holds a path (relative to `git_dir` unless absolute) to the
    // main repository's git directory.
    let common_raw = fs::read_to_string(git_dir.join("commondir")).ok()?;
    let common = common_raw.trim();
    if common.is_empty() {
        return None;
    }
    lookup_ref_in(&git_dir.join(common), refname)
}

/// True when `candidate` looks like a full object id (≥ 40 hex characters).
fn is_full_object_id(candidate: &str) -> bool {
    candidate.len() >= 40 && candidate.chars().all(|c| c.is_ascii_hexdigit())
}

/// Look `refname` up in one git directory: loose ref file first, then
/// `packed-refs`.
fn lookup_ref_in(dir: &Path, refname: &str) -> Option<String> {
    // Join each forward-slash component individually so the loose-ref path
    // uses the correct separator on every platform.
    let loose_path = refname
        .split('/')
        .fold(dir.to_path_buf(), |path, part| path.join(part));
    let loose = fs::read_to_string(&loose_path)
        .ok()
        .map(|text| text.trim().to_string())
        .filter(|text| is_full_object_id(text));
    if loose.is_some() {
        return loose;
    }

    // Packed refs: "<sha> <refname>" per line. '#' lines are comments and '^'
    // lines are peeled tag targets. Blank or otherwise malformed lines are
    // skipped instead of aborting the whole search.
    let packed = fs::read_to_string(dir.join("packed-refs")).ok()?;
    packed
        .lines()
        .filter(|line| !line.starts_with('#') && !line.starts_with('^'))
        .filter_map(|line| line.split_once(' '))
        .find(|(sha, name)| name.trim() == refname && is_full_object_id(sha))
        .map(|(sha, _)| sha.to_string())
}
343
344/// Parse the last entry of `.git/logs/HEAD` to get the commit author name and
345/// author-date in ISO 8601 format.
346///
347/// Reflog line format:
348/// `<old-sha> <new-sha> Author Name <email> <unix-ts> <tz-offset>\t<message>`
349fn parse_last_reflog_entry(git_dir: &Path) -> (Option<String>, Option<String>) {
350    let log_path = git_dir.join("logs").join("HEAD");
351    let Ok(content) = fs::read_to_string(&log_path) else {
352        return (None, None);
353    };
354    let Some(last) = content.lines().rfind(|l| !l.trim().is_empty()) else {
355        return (None, None);
356    };
357
358    // Skip the two 40-char SHAs + their separating spaces
359    // (an initial commit shows 0000... as old-sha, still 40 chars)
360    let Some(after_shas) = last.splitn(3, ' ').nth(2) else {
361        return (None, None);
362    };
363
364    // Author name ends just before " <email>"
365    let author = after_shas.find(" <").map(|i| after_shas[..i].to_string());
366
367    // Timestamp is the number after the closing ">"
368    let date = (|| {
369        use chrono::TimeZone as _;
370        let close = after_shas.find("> ")?;
371        let rest = after_shas[close + 2..].trim_start();
372        let mut tokens = rest.splitn(3, ' ');
373        let unix_str = tokens.next()?;
374        let offset_str = tokens.next().map(|s| s.split('\t').next().unwrap_or(s))?;
375        let ts: i64 = unix_str.parse().ok()?;
376        let dt = chrono::Utc.timestamp_opt(ts, 0).single()?;
377        // Format as ISO 8601 with timezone offset, e.g. 2026-05-17T12:51:54-07:00
378        let tz_display = if offset_str.len() == 5 {
379            format!("{}:{}", &offset_str[..3], &offset_str[3..])
380        } else {
381            offset_str.to_string()
382        };
383        Some(format!("{}{}", dt.format("%Y-%m-%dT%H:%M:%S"), tz_display))
384    })();
385
386    (author, date)
387}
388
/// Return the `url` value of the `[remote "origin"]` section of the `config`
/// file inside `git_dir`.
///
/// The file is scanned line by line: a line starting with `[` opens a new
/// section, and within the origin section the first `url = <value>` entry
/// with a non-empty value wins. Returns `None` when the file cannot be read
/// or no such entry exists.
fn read_git_remote_url(git_dir: &Path) -> Option<String> {
    let text = fs::read_to_string(git_dir.join("config")).ok()?;
    let mut inside_origin = false;

    for raw_line in text.lines() {
        let line = raw_line.trim();
        if line.starts_with('[') {
            inside_origin = line == r#"[remote "origin"]"#;
            continue;
        }
        if !inside_origin {
            continue;
        }
        // Accept `url=…`, `url = …`, `url\t=\t…`; anything else after the key
        // (e.g. a longer key name) fails the '=' check and is ignored.
        let value = line
            .strip_prefix("url")
            .map(|after_key| after_key.trim_start_matches([' ', '\t']))
            .and_then(|after_ws| after_ws.strip_prefix('='))
            .map(str::trim)
            .filter(|candidate| !candidate.is_empty());
        if let Some(url) = value {
            return Some(url.to_owned());
        }
    }
    None
}
411
412/// Detect git metadata by reading `.git/` files directly — no `git` executable
413/// needed. Falls back gracefully for detached HEADs, shallow clones, and missing
414/// reflogs.
415fn detect_git_for_run(project_path: &Path) -> GitInfo {
416    let Some(git_dir) = find_git_dir(project_path) else {
417        return GitInfo::default();
418    };
419
420    let head_raw = match fs::read_to_string(git_dir.join("HEAD")) {
421        Ok(s) => s.trim().to_string(),
422        Err(_) => return GitInfo::default(),
423    };
424
425    let (branch, commit_long) = head_raw.strip_prefix("ref: ").map_or_else(
426        || {
427            if head_raw.len() >= 40 && head_raw.chars().all(|c| c.is_ascii_hexdigit()) {
428                // Detached HEAD — the HEAD file itself is the commit SHA
429                (None, Some(head_raw[..40].to_string()))
430            } else {
431                (None, None)
432            }
433        },
434        |refname| {
435            let branch = refname
436                .strip_prefix("refs/heads/")
437                .map(|b| b.trim().to_string());
438            let sha = resolve_ref(&git_dir, refname.trim());
439            (branch, sha)
440        },
441    );
442
443    let commit_short = commit_long
444        .as_deref()
445        .map(|s| s.chars().take(7).collect::<String>());
446
447    let (author, commit_date) = parse_last_reflog_entry(&git_dir);
448    let remote_url = read_git_remote_url(&git_dir);
449
450    // Tags and nearest-tag still require git CLI — try it as a best-effort bonus
451    // but don't block on it. If git isn't available these will simply be None.
452    let tags = run_git_cmd(project_path, &["tag", "--points-at", "HEAD"]).map(|t| {
453        t.lines()
454            .filter(|l| !l.is_empty())
455            .collect::<Vec<_>>()
456            .join(", ")
457    });
458    let nearest_tag = run_git_cmd(project_path, &["describe", "--tags", "--abbrev=0", "HEAD"]);
459
460    GitInfo {
461        commit_short,
462        commit_long,
463        branch,
464        author,
465        tags,
466        nearest_tag,
467        commit_date,
468        remote_url,
469    }
470}
471
/// Run a git command as a best-effort supplemental source.  Not used for the
/// core commit/branch/author fields — those come from direct file reads.
///
/// * `dir` — working directory for the command.
/// * `args` — git arguments; `-c safe.directory=*` is always prepended.
///
/// Returns the trimmed stdout when the command exits successfully with
/// non-empty UTF-8 output, otherwise `None`.
///
/// The candidate list exists to *locate* a git executable: the bare name is
/// tried first, then well-known absolute paths for service accounts that run
/// with a stripped PATH. Only a failure to spawn moves on to the next
/// candidate; once a git binary has actually run, its answer is final, so a
/// legitimately failing command (e.g. `describe` in a repo without tags) is
/// not re-executed once per candidate.
fn run_git_cmd(dir: &Path, args: &[&str]) -> Option<String> {
    const CANDIDATES: &[&str] = &[
        // Works on all platforms when git is on PATH
        "git",
        // Common Linux / macOS install locations
        "/usr/bin/git",
        "/usr/local/bin/git",
        "/opt/homebrew/bin/git",
        // Git for Windows default installation paths
        r"C:\Program Files\Git\cmd\git.exe",
        r"C:\Program Files\Git\bin\git.exe",
        r"C:\Program Files (x86)\Git\cmd\git.exe",
    ];

    // `output()` returns `Err` when the process could not be started (missing
    // executable, foreign-platform path, …); those candidates are skipped.
    let output = CANDIDATES.iter().find_map(|exe| {
        std::process::Command::new(exe)
            .args(["-c", "safe.directory=*"])
            .args(args)
            .current_dir(dir)
            .output()
            .ok()
    })?;

    if !output.status.success() {
        return None;
    }
    let stdout = String::from_utf8(output.stdout).ok()?;
    let trimmed = stdout.trim();
    (!trimmed.is_empty()).then(|| trimmed.to_string())
}
507
/// Name of the user running the process, taken from the first readable
/// environment variable among `USERNAME` and `USER`; `"unknown"` if neither
/// is available.
fn get_current_username() -> String {
    for key in ["USERNAME", "USER"] {
        if let Ok(value) = std::env::var(key) {
            return value;
        }
    }
    "unknown".to_string()
}
513
/// Host name of the machine, looked up in this order: the `COMPUTERNAME`
/// environment variable, the `HOSTNAME` environment variable, the trimmed
/// content of `/etc/hostname`. Falls back to `"unknown"`.
fn get_hostname() -> String {
    if let Ok(name) = std::env::var("COMPUTERNAME") {
        return name;
    }
    if let Ok(name) = std::env::var("HOSTNAME") {
        return name;
    }
    match std::fs::read_to_string("/etc/hostname") {
        Ok(contents) => contents.trim().to_string(),
        Err(_) => "unknown".to_string(),
    }
}
520
521/// Walk a single directory root and collect file records into the output vectors.
522#[allow(clippy::too_many_arguments)]
523fn walk_root(
524    root: &Path,
525    config: &AppConfig,
526    include_globs: Option<&GlobSet>,
527    exclude_globs: Option<&GlobSet>,
528    enabled_languages: Option<&BTreeSet<Language>>,
529    seen_paths: &mut HashSet<PathBuf>,
530    analyzed: &mut Vec<FileRecord>,
531    skipped: &mut Vec<FileRecord>,
532    warnings: &mut Vec<String>,
533    cancel: Option<&AtomicBool>,
534) -> Result<()> {
535    let mut builder = WalkBuilder::new(root);
536    builder
537        .follow_links(config.discovery.follow_symlinks)
538        .hidden(config.discovery.ignore_hidden_files)
539        .ignore(config.discovery.honor_ignore_files)
540        .parents(config.discovery.honor_ignore_files)
541        .git_ignore(config.discovery.honor_ignore_files)
542        .git_global(config.discovery.honor_ignore_files)
543        .git_exclude(config.discovery.honor_ignore_files);
544
545    let paths = collect_walk_paths(&builder, seen_paths, warnings);
546    if paths.is_empty() {
547        return Ok(());
548    }
549
550    let chunk_results = run_parallel_analysis(
551        &paths,
552        root,
553        config,
554        include_globs,
555        exclude_globs,
556        enabled_languages,
557        cancel,
558    )?;
559    merge_chunk_results(chunk_results, analyzed, skipped, warnings)
560}
561
562fn collect_walk_paths(
563    builder: &WalkBuilder,
564    seen_paths: &mut HashSet<PathBuf>,
565    warnings: &mut Vec<String>,
566) -> Vec<PathBuf> {
567    let mut paths = Vec::new();
568    for entry in builder.build() {
569        let entry = match entry {
570            Ok(e) => e,
571            Err(err) => {
572                warnings.push(format!("discovery warning: {err}"));
573                continue;
574            }
575        };
576        let path = entry.into_path();
577        if path.is_dir() || !seen_paths.insert(path.clone()) {
578            continue;
579        }
580        paths.push(path);
581    }
582    paths
583}
584
585#[allow(clippy::too_many_arguments)]
586fn run_parallel_analysis(
587    paths: &[PathBuf],
588    root: &Path,
589    config: &AppConfig,
590    include_globs: Option<&GlobSet>,
591    exclude_globs: Option<&GlobSet>,
592    enabled_languages: Option<&BTreeSet<Language>>,
593    cancel: Option<&AtomicBool>,
594) -> Result<Vec<Vec<Result<Option<FileRecord>>>>> {
595    let thread_count = std::thread::available_parallelism().map_or(DEFAULT_ANALYSIS_THREADS, |n| {
596        n.get().min(MAX_ANALYSIS_THREADS)
597    });
598    let chunk_size = paths.len().div_ceil(thread_count);
599    std::thread::scope(|s| -> Result<Vec<Vec<Result<Option<FileRecord>>>>> {
600        paths
601            .chunks(chunk_size)
602            .map(|chunk| {
603                s.spawn(move || -> Vec<Result<Option<FileRecord>>> {
604                    let mut results = Vec::with_capacity(chunk.len());
605                    for path in chunk {
606                        if cancel.is_some_and(|c| c.load(Ordering::Relaxed)) {
607                            results.push(Err(anyhow::anyhow!("analysis cancelled")));
608                            break;
609                        }
610                        results.push(analyze_candidate_file(
611                            path,
612                            root,
613                            config,
614                            include_globs,
615                            exclude_globs,
616                            enabled_languages,
617                        ));
618                    }
619                    results
620                })
621            })
622            .map(|h| {
623                h.join()
624                    .map_err(|_| anyhow::anyhow!("analysis thread panicked"))
625            })
626            .collect()
627    })
628}
629
630fn merge_chunk_results(
631    chunk_results: Vec<Vec<Result<Option<FileRecord>>>>,
632    analyzed: &mut Vec<FileRecord>,
633    skipped: &mut Vec<FileRecord>,
634    warnings: &mut Vec<String>,
635) -> Result<()> {
636    for chunk in chunk_results {
637        for result in chunk {
638            if let Some(record) = result? {
639                push_record(record, analyzed, skipped, warnings);
640            }
641        }
642    }
643    Ok(())
644}
645
646/// Label each analyzed file with its submodule and build per-submodule summaries.
647fn process_submodules(config: &AppConfig, analyzed: &mut [FileRecord]) -> Vec<SubmoduleSummary> {
648    let root = config.discovery.root_paths[0]
649        .canonicalize()
650        .unwrap_or_else(|_| config.discovery.root_paths[0].clone());
651    let submodules = detect_submodules(&root);
652    if submodules.is_empty() {
653        return Vec::new();
654    }
655
656    for file in analyzed.iter_mut() {
657        for (name, sub_path) in &submodules {
658            let prefix = sub_path.to_string_lossy().replace('\\', "/");
659            let rel = &file.relative_path;
660            if rel == &prefix || rel.starts_with(&format!("{prefix}/")) {
661                file.submodule = Some(name.clone());
662                break;
663            }
664        }
665    }
666
667    build_submodule_summaries(analyzed, &submodules)
668}
669
670/// Assemble the final `AnalysisRun` from collected records and metadata.
671fn assemble_run(
672    config: &AppConfig,
673    runtime_mode: &str,
674    analyzed: Vec<FileRecord>,
675    skipped: Vec<FileRecord>,
676    warnings: Vec<String>,
677    submodule_summaries: Vec<SubmoduleSummary>,
678) -> AnalysisRun {
679    let summary = build_summary(&analyzed, &skipped);
680    let language_summaries = build_language_summaries(&analyzed);
681
682    let first_root = config
683        .discovery
684        .root_paths
685        .first()
686        .map(|p| p.canonicalize().unwrap_or_else(|_| p.clone()));
687    let git = first_root
688        .as_deref()
689        .map(detect_git_for_run)
690        .unwrap_or_default();
691
692    let now = Utc::now();
693    let run_id = {
694        let uuid_suffix = Uuid::new_v4().simple().to_string();
695        format!("{}-{}", now.format("%Y%m%d-%H%M"), uuid_suffix)
696    };
697
698    AnalysisRun {
699        tool: ToolMetadata {
700            name: "sloc".into(),
701            version: env!("CARGO_PKG_VERSION").into(),
702            run_id,
703            timestamp_utc: now,
704        },
705        environment: EnvironmentMetadata {
706            operating_system: std::env::consts::OS.into(),
707            architecture: std::env::consts::ARCH.into(),
708            runtime_mode: runtime_mode.into(),
709            initiator_username: get_current_username(),
710            initiator_hostname: get_hostname(),
711        },
712        effective_configuration: config.clone(),
713        input_roots: config
714            .discovery
715            .root_paths
716            .iter()
717            .map(|p| path_to_string(p))
718            .collect(),
719        summary_totals: summary,
720        totals_by_language: language_summaries,
721        per_file_records: analyzed,
722        skipped_file_records: skipped,
723        warnings,
724        submodule_summaries,
725        git_commit_short: git.commit_short,
726        git_commit_long: git.commit_long,
727        git_branch: git.branch,
728        git_commit_author: git.author,
729        git_tags: git.tags,
730        git_nearest_tag: git.nearest_tag,
731        git_commit_date: git.commit_date,
732        git_remote_url: git.remote_url,
733    }
734}
735
736/// # Errors
737///
738/// Returns an error if the config is invalid, root paths cannot be walked, or any file
739/// analysis step fails in a way that cannot be recovered from.
740#[allow(clippy::too_many_lines)]
741pub fn analyze(
742    config: &AppConfig,
743    runtime_mode: &str,
744    cancel: Option<&AtomicBool>,
745) -> Result<AnalysisRun> {
746    config.validate()?;
747
748    if config.discovery.root_paths.is_empty() {
749        anyhow::bail!("no input paths were provided");
750    }
751
752    let include_globs = compile_globset(&config.discovery.include_globs)?;
753    let exclude_globs = compile_globset(&config.discovery.exclude_globs)?;
754    let enabled_languages = parse_enabled_languages(&config.analysis.enabled_languages)?;
755
756    let mut analyzed = Vec::new();
757    let mut skipped = Vec::new();
758    let mut warnings = Vec::new();
759    let mut seen_paths = HashSet::new();
760
761    for root in &config.discovery.root_paths {
762        if cancel.is_some_and(|c| c.load(Ordering::Relaxed)) {
763            anyhow::bail!("analysis cancelled");
764        }
765
766        let root = root.canonicalize().unwrap_or_else(|_| root.clone());
767
768        if root.is_file() {
769            if let Some(record) = analyze_candidate_file(
770                &root,
771                root.parent().unwrap_or_else(|| Path::new(".")),
772                config,
773                include_globs.as_ref(),
774                exclude_globs.as_ref(),
775                enabled_languages.as_ref(),
776            )? {
777                push_record(record, &mut analyzed, &mut skipped, &mut warnings);
778            }
779            continue;
780        }
781
782        walk_root(
783            &root,
784            config,
785            include_globs.as_ref(),
786            exclude_globs.as_ref(),
787            enabled_languages.as_ref(),
788            &mut seen_paths,
789            &mut analyzed,
790            &mut skipped,
791            &mut warnings,
792            cancel,
793        )?;
794    }
795
796    analyzed.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
797    skipped.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
798
799    // Submodule detection: label each file with its submodule and build per-submodule summaries.
800    let submodule_summaries = if config.discovery.submodule_breakdown {
801        process_submodules(config, &mut analyzed)
802    } else {
803        Vec::new()
804    };
805
806    attach_coverage(config, &mut analyzed, &mut warnings);
807
808    Ok(assemble_run(
809        config,
810        runtime_mode,
811        analyzed,
812        skipped,
813        warnings,
814        submodule_summaries,
815    ))
816}
817
818fn attach_coverage(config: &AppConfig, analyzed: &mut [FileRecord], warnings: &mut Vec<String>) {
819    let Some(cov_path) = coverage::resolve_coverage_file(config.analysis.coverage_file.as_deref())
820    else {
821        return;
822    };
823    tracing::debug!(path = %cov_path.display(), "loading coverage file");
824    match fs::read_to_string(&cov_path) {
825        Ok(content) => {
826            let cov_map = coverage::parse_coverage_auto(&cov_path, &content);
827            let mut matched: u32 = 0;
828            let mut unmatched: u32 = 0;
829            for record in analyzed.iter_mut() {
830                record.coverage =
831                    coverage::lookup_coverage(&cov_map, &record.relative_path).cloned();
832                if record.coverage.is_some() {
833                    matched += 1;
834                } else {
835                    unmatched += 1;
836                }
837            }
838            tracing::debug!(
839                path = %cov_path.display(),
840                coverage_entries = cov_map.len(),
841                files_matched = matched,
842                files_unmatched = unmatched,
843                "coverage attached"
844            );
845            if unmatched > 0 && matched == 0 {
846                tracing::warn!(
847                    path = %cov_path.display(),
848                    "coverage file loaded but no source files could be matched — check that paths in the coverage report match the scanned directory"
849                );
850            }
851        }
852        Err(e) => {
853            tracing::warn!(path = %cov_path.display(), error = %e, "coverage file could not be read");
854            warnings.push(format!(
855                "coverage file '{}' could not be read: {e}",
856                cov_path.display()
857            ));
858        }
859    }
860}
861
862fn push_record(
863    record: FileRecord,
864    analyzed: &mut Vec<FileRecord>,
865    skipped: &mut Vec<FileRecord>,
866    warnings: &mut Vec<String>,
867) {
868    warnings.extend(
869        record
870            .warnings
871            .iter()
872            .map(|warning| format!("{}: {warning}", record.relative_path)),
873    );
874
875    match record.status {
876        FileStatus::AnalyzedExact | FileStatus::AnalyzedBestEffort => analyzed.push(record),
877        _ => skipped.push(record),
878    }
879}
880
881/// Convenience wrapper: build a boxed `Skip` outcome with a single-item warning message.
882#[inline]
883fn skip_with_reason(
884    path: &Path,
885    root: &Path,
886    size: u64,
887    reason: impl Into<String>,
888) -> MetadataPolicyOutcome {
889    MetadataPolicyOutcome::Skip(Box::new(skipped_record(
890        path,
891        root,
892        size,
893        FileStatus::SkippedByPolicy,
894        vec![reason.into()],
895    )))
896}
897
898/// Apply metadata-level policy checks (symlink, name, dir exclusion, size, globs, lockfile).
899/// Returns `Skip(record)` to skip, `Exclude` to omit from output entirely (include-glob miss),
900/// or `Continue` to proceed to content checks.
901#[allow(clippy::too_many_arguments)]
902fn check_metadata_policy(
903    path: &Path,
904    root: &Path,
905    relative_path: &str,
906    metadata: &fs::Metadata,
907    config: &AppConfig,
908    include_globs: Option<&GlobSet>,
909    exclude_globs: Option<&GlobSet>,
910) -> MetadataPolicyOutcome {
911    let size = metadata.len();
912
913    if metadata.file_type().is_symlink() && !config.discovery.follow_symlinks {
914        return skip_with_reason(path, root, size, "symlink skipped by policy");
915    }
916    if file_name_eq(path, ".gitignore") {
917        return skip_with_reason(path, root, size, ".gitignore is always excluded");
918    }
919    if is_excluded_dir_path(path, &config.discovery.excluded_directories) {
920        return skip_with_reason(path, root, size, "path matched excluded directory setting");
921    }
922    if size > config.discovery.max_file_size_bytes {
923        return skip_with_reason(
924            path,
925            root,
926            size,
927            format!(
928                "file exceeded max_file_size_bytes ({})",
929                config.discovery.max_file_size_bytes
930            ),
931        );
932    }
933    if let Some(globs) = include_globs {
934        if !globs.is_match(Path::new(relative_path)) && !globs.is_match(path) {
935            return MetadataPolicyOutcome::Exclude;
936        }
937    }
938    if let Some(globs) = exclude_globs {
939        if globs.is_match(Path::new(relative_path)) || globs.is_match(path) {
940            return skip_with_reason(path, root, size, "path matched exclude glob");
941        }
942    }
943    if is_known_lockfile(path) && !config.analysis.include_lockfiles {
944        return skip_with_reason(path, root, size, "lockfile skipped by default policy");
945    }
946
947    MetadataPolicyOutcome::Continue
948}
949
950struct ContentPolicyResult {
951    vendor: bool,
952    generated: bool,
953    minified: bool,
954    skip_record: Option<FileRecord>,
955}
956
957/// Apply content-level policy checks (vendor, generated, minified).
958/// `skip_record` is `Some` when the file should be skipped.
959fn check_content_policy(
960    path: &Path,
961    root: &Path,
962    size_bytes: u64,
963    bytes: &[u8],
964    config: &AppConfig,
965) -> ContentPolicyResult {
966    let vendor = is_vendor_path(path);
967    if vendor && config.analysis.vendor_directory_detection {
968        return ContentPolicyResult {
969            vendor,
970            generated: false,
971            minified: false,
972            skip_record: Some(skipped_record(
973                path,
974                root,
975                size_bytes,
976                FileStatus::SkippedByPolicy,
977                vec!["vendor file skipped by policy".into()],
978            )),
979        };
980    }
981
982    let generated = config.analysis.generated_file_detection && looks_generated(path, bytes);
983    if generated {
984        return ContentPolicyResult {
985            vendor,
986            generated,
987            minified: false,
988            skip_record: Some(skipped_record(
989                path,
990                root,
991                size_bytes,
992                FileStatus::SkippedByPolicy,
993                vec!["generated file skipped by policy".into()],
994            )),
995        };
996    }
997
998    let minified = config.analysis.minified_file_detection && looks_minified(path, bytes);
999    if minified {
1000        return ContentPolicyResult {
1001            vendor,
1002            generated,
1003            minified,
1004            skip_record: Some(skipped_record(
1005                path,
1006                root,
1007                size_bytes,
1008                FileStatus::SkippedByPolicy,
1009                vec!["minified file skipped by policy".into()],
1010            )),
1011        };
1012    }
1013
1014    ContentPolicyResult {
1015        vendor,
1016        generated,
1017        minified,
1018        skip_record: None,
1019    }
1020}
1021
1022/// Decode file bytes to a UTF-8 string, handling binary detection and decode failures.
1023fn decode_file_contents(
1024    path: &Path,
1025    root: &Path,
1026    size_bytes: u64,
1027    bytes: &[u8],
1028    config: &AppConfig,
1029) -> Result<Option<(String, String, Vec<String>)>> {
1030    if is_binary(bytes) {
1031        return match config.analysis.binary_file_behavior {
1032            BinaryFileBehavior::Skip => Ok(None),
1033            BinaryFileBehavior::Fail => {
1034                anyhow::bail!("binary file encountered: {}", path.display())
1035            }
1036        };
1037    }
1038
1039    match decode_bytes(bytes) {
1040        Ok(result) => Ok(Some(result)),
1041        Err(err) => match config.analysis.decode_failure_behavior {
1042            FailureBehavior::WarnSkip => {
1043                // Caller will handle the None as a SkippedDecodeError record.
1044                // We use a sentinel: return Ok(None) but encode the error into a field.
1045                // Instead, propagate as a skipped record via the caller.
1046                let _ = (path, root, size_bytes); // suppress unused warnings
1047                Err(anyhow::anyhow!("__decode_warn__: {err}"))
1048            }
1049            FailureBehavior::Fail => {
1050                anyhow::bail!("decode failure for {}: {err}", path.display())
1051            }
1052        },
1053    }
1054}
1055
1056#[allow(clippy::too_many_lines)]
1057fn analyze_candidate_file(
1058    path: &Path,
1059    root: &Path,
1060    config: &AppConfig,
1061    include_globs: Option<&GlobSet>,
1062    exclude_globs: Option<&GlobSet>,
1063    enabled_languages: Option<&BTreeSet<Language>>,
1064) -> Result<Option<FileRecord>> {
1065    let metadata = match fs::symlink_metadata(path) {
1066        Ok(metadata) => metadata,
1067        Err(err) => {
1068            return Ok(Some(skipped_record(
1069                path,
1070                root,
1071                0,
1072                FileStatus::ErrorInternal,
1073                vec![format!("failed to read metadata: {err}")],
1074            )));
1075        }
1076    };
1077
1078    let relative_path = relative_path_string(path, root);
1079
1080    // Metadata-level policy checks.
1081    match check_metadata_policy(
1082        path,
1083        root,
1084        &relative_path,
1085        &metadata,
1086        config,
1087        include_globs,
1088        exclude_globs,
1089    ) {
1090        MetadataPolicyOutcome::Skip(record) => return Ok(Some(*record)),
1091        MetadataPolicyOutcome::Exclude => return Ok(None),
1092        MetadataPolicyOutcome::Continue => {}
1093    }
1094
1095    let bytes = match fs::read(path) {
1096        Ok(bytes) => bytes,
1097        Err(err) => {
1098            return Ok(Some(skipped_record(
1099                path,
1100                root,
1101                metadata.len(),
1102                FileStatus::ErrorInternal,
1103                vec![format!("failed to read file: {err}")],
1104            )));
1105        }
1106    };
1107
1108    // Content-level policy checks (vendor, generated, minified).
1109    let content_policy = check_content_policy(path, root, metadata.len(), &bytes, config);
1110    if let Some(record) = content_policy.skip_record {
1111        return Ok(Some(record));
1112    }
1113    let (vendor, generated, minified) = (
1114        content_policy.vendor,
1115        content_policy.generated,
1116        content_policy.minified,
1117    );
1118
1119    // Decode content, handling binary and decode failures.
1120    let (text, encoding, decode_warnings) =
1121        match decode_file_contents(path, root, metadata.len(), &bytes, config) {
1122            Ok(Some(result)) => result,
1123            Ok(None) => {
1124                return Ok(Some(skipped_record(
1125                    path,
1126                    root,
1127                    metadata.len(),
1128                    FileStatus::SkippedBinary,
1129                    vec!["binary file skipped by default".into()],
1130                )));
1131            }
1132            Err(err) => {
1133                let msg = err.to_string();
1134                if let Some(warn_msg) = msg.strip_prefix("__decode_warn__: ") {
1135                    return Ok(Some(skipped_record(
1136                        path,
1137                        root,
1138                        metadata.len(),
1139                        FileStatus::SkippedDecodeError,
1140                        vec![warn_msg.to_string()],
1141                    )));
1142                }
1143                return Err(err);
1144            }
1145        };
1146
1147    let first_line = text.lines().next();
1148    let language = detect_language(
1149        path,
1150        first_line,
1151        &config.analysis.extension_overrides,
1152        config.analysis.shebang_detection,
1153    );
1154
1155    let Some(language) = language else {
1156        return Ok(Some(skipped_record(
1157            path,
1158            root,
1159            metadata.len(),
1160            FileStatus::SkippedUnsupported,
1161            vec!["unsupported or undetected language".into()],
1162        )));
1163    };
1164
1165    if let Some(enabled) = enabled_languages {
1166        if !enabled.contains(&language) {
1167            return Ok(Some(skipped_record(
1168                path,
1169                root,
1170                metadata.len(),
1171                FileStatus::SkippedByPolicy,
1172                vec![format!(
1173                    "language {} disabled by configuration",
1174                    language.display_name()
1175                )],
1176            )));
1177        }
1178    }
1179
1180    let ieee_opts = AnalysisOptions {
1181        blank_in_block_comment_as_comment: config.analysis.blank_in_block_comment_policy
1182            == BlankInBlockCommentPolicy::CountAsComment,
1183        collapse_continuation_lines: config.analysis.continuation_line_policy
1184            == ContinuationLinePolicy::CollapseToLogical,
1185    };
1186    let analysis = analyze_text(language, &text, ieee_opts);
1187    let effective_counts = compute_effective_counts(
1188        &analysis.raw,
1189        config.analysis.mixed_line_policy,
1190        config.analysis.python_docstrings_as_comments,
1191        config.analysis.count_compiler_directives,
1192    );
1193
1194    let mut warnings = decode_warnings;
1195    warnings.extend(analysis.warnings.clone());
1196
1197    Ok(Some(FileRecord {
1198        path: path_to_string(path),
1199        relative_path,
1200        language: Some(language),
1201        size_bytes: metadata.len(),
1202        detected_encoding: Some(encoding),
1203        raw_line_categories: analysis.raw,
1204        effective_counts,
1205        status: match analysis.parse_mode {
1206            ParseMode::Lexical | ParseMode::TreeSitter => FileStatus::AnalyzedExact,
1207            ParseMode::LexicalBestEffort => FileStatus::AnalyzedBestEffort,
1208        },
1209        warnings,
1210        generated,
1211        minified,
1212        vendor,
1213        parse_mode: Some(analysis.parse_mode),
1214        submodule: None,
1215        coverage: None,
1216    }))
1217}
1218
1219const fn compute_effective_counts(
1220    raw: &RawLineCounts,
1221    mixed_line_policy: MixedLinePolicy,
1222    python_docstrings_as_comments: bool,
1223    count_compiler_directives: bool,
1224) -> EffectiveCounts {
1225    let mut effective = EffectiveCounts {
1226        code_lines: raw.code_only_lines,
1227        comment_lines: raw.single_comment_only_lines + raw.multi_comment_only_lines,
1228        blank_lines: raw.blank_only_lines,
1229        mixed_lines_separate: 0,
1230    };
1231
1232    if python_docstrings_as_comments {
1233        effective.comment_lines += raw.docstring_comment_lines;
1234    } else {
1235        effective.code_lines += raw.docstring_comment_lines;
1236    }
1237
1238    let mixed_total = raw.mixed_code_single_comment_lines + raw.mixed_code_multi_comment_lines;
1239    match mixed_line_policy {
1240        MixedLinePolicy::CodeOnly => effective.code_lines += mixed_total,
1241        MixedLinePolicy::CodeAndComment => {
1242            effective.code_lines += mixed_total;
1243            effective.comment_lines += mixed_total;
1244        }
1245        MixedLinePolicy::CommentOnly => effective.comment_lines += mixed_total,
1246        MixedLinePolicy::SeparateMixedCategory => effective.mixed_lines_separate += mixed_total,
1247    }
1248
1249    // IEEE 1045-1992 §4.2: optionally exclude preprocessor/compiler directives from code SLOC.
1250    // compiler_directive_lines is a subset of code_only_lines, so subtract it directly.
1251    if !count_compiler_directives {
1252        effective.code_lines = effective
1253            .code_lines
1254            .saturating_sub(raw.compiler_directive_lines);
1255    }
1256
1257    effective
1258}
1259
1260fn build_summary(analyzed: &[FileRecord], skipped: &[FileRecord]) -> SummaryTotals {
1261    let mut summary = SummaryTotals {
1262        files_considered: (analyzed.len() + skipped.len()) as u64,
1263        files_analyzed: analyzed.len() as u64,
1264        files_skipped: skipped.len() as u64,
1265        ..Default::default()
1266    };
1267
1268    for record in analyzed {
1269        summary.total_physical_lines += record.raw_line_categories.total_physical_lines;
1270        summary.code_lines += record.effective_counts.code_lines;
1271        summary.comment_lines += record.effective_counts.comment_lines;
1272        summary.blank_lines += record.effective_counts.blank_lines;
1273        summary.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
1274        summary.functions += record.raw_line_categories.functions;
1275        summary.classes += record.raw_line_categories.classes;
1276        summary.variables += record.raw_line_categories.variables;
1277        summary.imports += record.raw_line_categories.imports;
1278        summary.test_count += record.raw_line_categories.test_count;
1279        summary.test_assertion_count += record.raw_line_categories.test_assertion_count;
1280        summary.test_suite_count += record.raw_line_categories.test_suite_count;
1281        if let Some(cov) = &record.coverage {
1282            summary.coverage_lines_found += u64::from(cov.lines_found);
1283            summary.coverage_lines_hit += u64::from(cov.lines_hit);
1284            summary.coverage_functions_found += u64::from(cov.functions_found);
1285            summary.coverage_functions_hit += u64::from(cov.functions_hit);
1286            summary.coverage_branches_found += u64::from(cov.branches_found);
1287            summary.coverage_branches_hit += u64::from(cov.branches_hit);
1288        }
1289    }
1290
1291    summary
1292}
1293
1294/// Construct a zero-filled `LanguageSummary` for the given language.
1295const fn zeroed_summary(language: Language) -> LanguageSummary {
1296    LanguageSummary {
1297        language,
1298        files: 0,
1299        total_physical_lines: 0,
1300        code_lines: 0,
1301        comment_lines: 0,
1302        blank_lines: 0,
1303        mixed_lines_separate: 0,
1304        functions: 0,
1305        classes: 0,
1306        variables: 0,
1307        imports: 0,
1308        test_count: 0,
1309        test_assertion_count: 0,
1310        test_suite_count: 0,
1311        coverage_lines_found: 0,
1312        coverage_lines_hit: 0,
1313        coverage_functions_found: 0,
1314        coverage_functions_hit: 0,
1315        coverage_branches_found: 0,
1316        coverage_branches_hit: 0,
1317    }
1318}
1319
1320/// Accumulate all per-file counters from `record` into an existing `LanguageSummary`.
1321fn accumulate_record_into_summary(entry: &mut LanguageSummary, record: &FileRecord) {
1322    entry.files += 1;
1323    let r = &record.raw_line_categories;
1324    entry.total_physical_lines += r.total_physical_lines;
1325    entry.code_lines += record.effective_counts.code_lines;
1326    entry.comment_lines += record.effective_counts.comment_lines;
1327    entry.blank_lines += record.effective_counts.blank_lines;
1328    entry.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
1329    entry.functions += r.functions;
1330    entry.classes += r.classes;
1331    entry.variables += r.variables;
1332    entry.imports += r.imports;
1333    entry.test_count += r.test_count;
1334    entry.test_assertion_count += r.test_assertion_count;
1335    entry.test_suite_count += r.test_suite_count;
1336    if let Some(cov) = &record.coverage {
1337        entry.coverage_lines_found += u64::from(cov.lines_found);
1338        entry.coverage_lines_hit += u64::from(cov.lines_hit);
1339        entry.coverage_functions_found += u64::from(cov.functions_found);
1340        entry.coverage_functions_hit += u64::from(cov.functions_hit);
1341        entry.coverage_branches_found += u64::from(cov.branches_found);
1342        entry.coverage_branches_hit += u64::from(cov.branches_hit);
1343    }
1344}
1345
1346fn build_language_summaries(analyzed: &[FileRecord]) -> Vec<LanguageSummary> {
1347    let mut by_language: BTreeMap<Language, LanguageSummary> = BTreeMap::new();
1348    for record in analyzed {
1349        let Some(language) = record.language else {
1350            continue;
1351        };
1352        let entry = by_language
1353            .entry(language)
1354            .or_insert_with(|| zeroed_summary(language));
1355        accumulate_record_into_summary(entry, record);
1356    }
1357    by_language.into_values().collect()
1358}
1359
1360fn skipped_record(
1361    path: &Path,
1362    root: &Path,
1363    size_bytes: u64,
1364    status: FileStatus,
1365    warnings: Vec<String>,
1366) -> FileRecord {
1367    FileRecord {
1368        path: path_to_string(path),
1369        relative_path: relative_path_string(path, root),
1370        language: None,
1371        size_bytes,
1372        detected_encoding: None,
1373        raw_line_categories: RawLineCounts::default(),
1374        effective_counts: EffectiveCounts::default(),
1375        status,
1376        warnings,
1377        generated: false,
1378        minified: false,
1379        vendor: false,
1380        parse_mode: None,
1381        submodule: None,
1382        coverage: None,
1383    }
1384}
1385
/// Render `path` relative to `root` with forward slashes.
///
/// When `path` does not start with `root`, the whole path is rendered instead. Non-UTF-8
/// parts are converted lossily.
fn relative_path_string(path: &Path, root: &Path) -> String {
    let relative = path.strip_prefix(root).unwrap_or(path);
    let lossy = relative.to_string_lossy();
    lossy.replace('\\', "/")
}
1392
/// Render `path` as a string with every backslash replaced by a forward slash.
///
/// Non-UTF-8 parts are converted lossily.
fn path_to_string(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
1396
/// Parse `.gitmodules` in `root` and return `(name, relative_path)` for each submodule found.
///
/// Returns an empty list when the file is missing or cannot be read. Parsing is deliberately
/// lenient and never panics on malformed input:
///
/// - a section counts only if its header has the exact form `[submodule "<name>"]`;
/// - within a section, the value of the `path` key is used (the last one wins);
/// - sections without a `path`, and `path` keys outside a submodule section, are ignored.
#[must_use]
pub fn detect_submodules(root: &Path) -> Vec<(String, PathBuf)> {
    let gitmodules = root.join(".gitmodules");
    if !gitmodules.is_file() {
        return Vec::new();
    }
    let Ok(content) = fs::read_to_string(&gitmodules) else {
        return Vec::new();
    };

    let mut result = Vec::new();
    let mut current_name: Option<String> = None;
    let mut current_path: Option<PathBuf> = None;

    for line in content.lines() {
        let trimmed = line.trim();

        if trimmed.starts_with('[') {
            // Any section header closes the previous section. `take()` on both halves also
            // discards a dangling name or path so it cannot leak into the next section.
            if let (Some(name), Some(path)) = (current_name.take(), current_path.take()) {
                result.push((name, path));
            }
            // strip_prefix + strip_suffix cannot overlap, unlike manual index slicing, which
            // panicked on a header such as `[submodule "]`.
            current_name = trimmed
                .strip_prefix("[submodule \"")
                .and_then(|rest| rest.strip_suffix("\"]"))
                .map(str::to_owned);
        } else if let Some((key, value)) = trimmed.split_once('=') {
            // Compare the whole key, so that e.g. `pathspec = ...` is not mistaken for `path`.
            if key.trim_end() == "path" {
                current_path = Some(PathBuf::from(value.trim()));
            }
        }
    }

    // The last section has no following header to flush it.
    if let (Some(name), Some(path)) = (current_name, current_path) {
        result.push((name, path));
    }

    result
}
1433
1434fn build_submodule_summaries(
1435    analyzed: &[FileRecord],
1436    submodules: &[(String, PathBuf)],
1437) -> Vec<SubmoduleSummary> {
1438    submodules
1439        .iter()
1440        .map(|(name, path)| {
1441            let files: Vec<&FileRecord> = analyzed
1442                .iter()
1443                .filter(|f| f.submodule.as_deref() == Some(name.as_str()))
1444                .collect();
1445
1446            let files_analyzed = files.len() as u64;
1447            let total_physical_lines = files
1448                .iter()
1449                .map(|f| f.raw_line_categories.total_physical_lines)
1450                .sum();
1451            let code_lines = files.iter().map(|f| f.effective_counts.code_lines).sum();
1452            let comment_lines = files.iter().map(|f| f.effective_counts.comment_lines).sum();
1453            let blank_lines = files.iter().map(|f| f.effective_counts.blank_lines).sum();
1454            let language_summaries = build_language_summaries_from_slice(&files);
1455
1456            SubmoduleSummary {
1457                name: name.clone(),
1458                relative_path: path.to_string_lossy().replace('\\', "/"),
1459                files_analyzed,
1460                total_physical_lines,
1461                code_lines,
1462                comment_lines,
1463                blank_lines,
1464                language_summaries,
1465            }
1466        })
1467        .filter(|s| s.files_analyzed > 0)
1468        .collect()
1469}
1470
1471fn build_language_summaries_from_slice(files: &[&FileRecord]) -> Vec<LanguageSummary> {
1472    let mut map: BTreeMap<String, LanguageSummary> = BTreeMap::new();
1473    for file in files {
1474        let Some(lang) = file.language else { continue };
1475        let entry = map
1476            .entry(lang.display_name().to_string())
1477            .or_insert_with(|| zeroed_summary(lang));
1478        accumulate_record_into_summary(entry, file);
1479    }
1480    map.into_values().collect()
1481}
1482
/// Return `true` when the final component of `path` is exactly `expected`.
///
/// Paths without a file name, or whose file name is not valid UTF-8, never match.
fn file_name_eq(path: &Path, expected: &str) -> bool {
    let file_name = path.file_name().and_then(|os_name| os_name.to_str());
    matches!(file_name, Some(actual) if actual == expected)
}
1488
/// Return `true` when any component of `path` equals one of `excluded_dirs`.
///
/// Every component is compared, including the final one; components that are not valid
/// UTF-8 never match.
fn is_excluded_dir_path(path: &Path, excluded_dirs: &[String]) -> bool {
    path.components()
        .filter_map(|component| component.as_os_str().to_str())
        .any(|part| excluded_dirs.iter().any(|excluded| excluded == part))
}
1497
/// Return `true` when any component of `path` is `vendor`, `node_modules` or `packages`.
///
/// Components that are not valid UTF-8 never match.
fn is_vendor_path(path: &Path) -> bool {
    const VENDOR_DIR_NAMES: [&str; 3] = ["vendor", "node_modules", "packages"];

    path.components()
        .filter_map(|component| component.as_os_str().to_str())
        .any(|part| VENDOR_DIR_NAMES.contains(&part))
}
1506
/// Return `true` when the file name of `path` is one of the recognised dependency lockfiles.
///
/// The comparison is exact and case-sensitive; a file name that is not valid UTF-8 never
/// matches.
fn is_known_lockfile(path: &Path) -> bool {
    const LOCKFILE_NAMES: [&str; 7] = [
        "Cargo.lock",
        "package-lock.json",
        "yarn.lock",
        "pnpm-lock.yaml",
        "Pipfile.lock",
        "poetry.lock",
        "composer.lock",
    ];

    path.file_name()
        .and_then(|os_name| os_name.to_str())
        .is_some_and(|name| LOCKFILE_NAMES.contains(&name))
}
1523
1524fn looks_generated(path: &Path, bytes: &[u8]) -> bool {
1525    let file_name = path
1526        .file_name()
1527        .and_then(|name| name.to_str())
1528        .unwrap_or_default();
1529    if file_name.contains(".generated.") || file_name.contains(".g.") {
1530        return true;
1531    }
1532
1533    let sample = String::from_utf8_lossy(&bytes[..bytes.len().min(GENERATED_SAMPLE_BYTES)])
1534        .to_ascii_lowercase();
1535    sample.contains("@generated") || sample.contains("generated by")
1536}
1537
1538fn looks_minified(path: &Path, bytes: &[u8]) -> bool {
1539    let file_name = path
1540        .file_name()
1541        .and_then(|name| name.to_str())
1542        .unwrap_or_default();
1543    if file_name.contains(".min.") {
1544        return true;
1545    }
1546
1547    let sample = String::from_utf8_lossy(&bytes[..bytes.len().min(MINIFIED_SAMPLE_BYTES)]);
1548    let longest_line = sample.lines().map(str::len).max().unwrap_or(0);
1549    let whitespace = sample.chars().filter(|c| c.is_whitespace()).count();
1550    longest_line > MINIFIED_LINE_THRESHOLD && whitespace * 100 < sample.len().max(1)
1551}
1552
1553fn is_binary(bytes: &[u8]) -> bool {
1554    if bytes.starts_with(&[0xEF, 0xBB, 0xBF])
1555        || bytes.starts_with(&[0xFF, 0xFE])
1556        || bytes.starts_with(&[0xFE, 0xFF])
1557    {
1558        return false;
1559    }
1560
1561    let sample = &bytes[..bytes.len().min(BINARY_SAMPLE_BYTES)];
1562    sample.contains(&0)
1563}
1564
1565/// Decode a BOM-stripped UTF-16 byte slice using the given encoding.
1566/// Returns `(text, encoding_label, warnings)`.
1567fn decode_utf16_bom(
1568    bom_stripped: &[u8],
1569    encoding: &'static encoding_rs::Encoding,
1570    label: &str,
1571) -> (String, String, Vec<String>) {
1572    let (cow, _, had_errors) = encoding.decode(bom_stripped);
1573    let mut warnings = Vec::new();
1574    if had_errors {
1575        warnings.push(format!("{label} decode contained replacement characters"));
1576    }
1577    (cow.into_owned(), label.into(), warnings)
1578}
1579
1580fn decode_bytes(bytes: &[u8]) -> std::result::Result<(String, String, Vec<String>), String> {
1581    if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
1582        let text = String::from_utf8(bytes[3..].to_vec()).map_err(|err| err.to_string())?;
1583        return Ok((text, "utf-8-bom".into(), vec![]));
1584    }
1585    if bytes.starts_with(&[0xFF, 0xFE]) {
1586        return Ok(decode_utf16_bom(&bytes[2..], UTF_16LE, "utf-16le"));
1587    }
1588    if bytes.starts_with(&[0xFE, 0xFF]) {
1589        return Ok(decode_utf16_bom(&bytes[2..], UTF_16BE, "utf-16be"));
1590    }
1591
1592    // Multiple statements in the else branch make map_or_else awkward here.
1593    #[allow(clippy::option_if_let_else)]
1594    if let Ok(text) = String::from_utf8(bytes.to_vec()) {
1595        Ok((text, "utf-8".into(), vec![]))
1596    } else {
1597        let (cow, _, had_errors) = WINDOWS_1252.decode(bytes);
1598        let mut warnings = vec!["decoded using windows-1252 fallback".into()];
1599        if had_errors {
1600            warnings.push("fallback decode contained replacement characters".into());
1601        }
1602        Ok((cow.into_owned(), "windows-1252".into(), warnings))
1603    }
1604}
1605
1606fn compile_globset(patterns: &[String]) -> Result<Option<GlobSet>> {
1607    if patterns.is_empty() {
1608        return Ok(None);
1609    }
1610
1611    let mut builder = GlobSetBuilder::new();
1612    for pattern in patterns {
1613        builder
1614            .add(Glob::new(pattern).with_context(|| format!("invalid glob pattern: {pattern}"))?);
1615    }
1616    Ok(Some(
1617        builder.build().context("failed to compile glob filters")?,
1618    ))
1619}
1620
1621fn parse_enabled_languages(enabled: &[String]) -> Result<Option<BTreeSet<Language>>> {
1622    if enabled.is_empty() {
1623        return Ok(None);
1624    }
1625
1626    let supported = supported_languages();
1627    let mut set = BTreeSet::new();
1628    for name in enabled {
1629        let language = Language::from_name(name)
1630            .with_context(|| format!("unsupported language in config: {name}"))?;
1631        if !supported.contains(&language) {
1632            anyhow::bail!("language {name} is not supported in this build");
1633        }
1634        set.insert(language);
1635    }
1636    Ok(Some(set))
1637}
1638
1639/// # Errors
1640///
1641/// Returns an error if serialization fails or the output file cannot be written.
1642pub fn write_json(run: &AnalysisRun, output_path: &Path) -> Result<()> {
1643    let json = serde_json::to_string_pretty(run).context("failed to serialize analysis run")?;
1644    fs::write(output_path, json)
1645        .with_context(|| format!("failed to write JSON output to {}", output_path.display()))
1646}
1647
1648/// # Errors
1649///
1650/// Returns an error if the file cannot be read or the JSON cannot be parsed.
1651pub fn read_json(path: &Path) -> Result<AnalysisRun> {
1652    let contents = fs::read_to_string(path)
1653        .with_context(|| format!("failed to read result file {}", path.display()))?;
1654    serde_json::from_str(&contents)
1655        .with_context(|| format!("failed to parse JSON result {}", path.display()))
1656}
1657
#[cfg(test)]
mod tests {
    use super::*;

    /// `CodeOnly` adds mixed lines to code; docstrings go to comments when requested.
    #[test]
    fn effective_counts_respect_code_only_policy() {
        let raw = RawLineCounts {
            docstring_comment_lines: 2,
            mixed_code_single_comment_lines: 3,
            single_comment_only_lines: 1,
            code_only_lines: 2,
            ..RawLineCounts::default()
        };

        let counts = compute_effective_counts(&raw, MixedLinePolicy::CodeOnly, true, true);

        // code = 2 code-only + 3 mixed; comments = 1 single-line + 2 docstring.
        assert_eq!(counts.code_lines, 5);
        assert_eq!(counts.comment_lines, 3);
    }

    /// `SeparateMixedCategory` keeps mixed lines out of both the code and comment totals.
    #[test]
    fn effective_counts_can_separate_mixed() {
        let raw = RawLineCounts {
            mixed_code_multi_comment_lines: 1,
            mixed_code_single_comment_lines: 2,
            ..RawLineCounts::default()
        };

        let counts =
            compute_effective_counts(&raw, MixedLinePolicy::SeparateMixedCategory, true, true);

        assert_eq!(counts.code_lines, 0);
        assert_eq!(counts.comment_lines, 0);
        assert_eq!(counts.mixed_lines_separate, 3);
    }

    /// Bytes that are not valid UTF-8 fall back to windows-1252 and produce a warning.
    #[test]
    fn windows_1252_fallback_decodes() {
        // "Hello", then 0x96 (an en dash in windows-1252, invalid as UTF-8), then "W".
        let bytes = [0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x96, 0x57];

        let (text, encoding, warnings) = decode_bytes(&bytes).unwrap();

        assert_eq!(encoding, "windows-1252");
        assert!(text.contains('–'));
        assert!(!warnings.is_empty());
    }
}
sloc_core/lib.rs

sloc_core/
lib.rs