Skip to main content

sloc_core/
lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright (C) 2026 Nima Shafie <nimzshafie@gmail.com>
3#![allow(clippy::multiple_crate_versions)]
4
5pub mod delta;
6pub mod history;
7pub use delta::{compute_delta, FileChangeStatus, FileDelta, ScanComparison, SummaryDelta};
8pub use history::{RegistryEntry, ScanRegistry, ScanSummarySnapshot};
9
10use std::collections::{BTreeMap, BTreeSet, HashSet};
11use std::fs;
12use std::path::{Path, PathBuf};
13
14use anyhow::{Context, Result};
15use chrono::{DateTime, Utc};
16use encoding_rs::{UTF_16BE, UTF_16LE, WINDOWS_1252};
17use globset::{Glob, GlobSet, GlobSetBuilder};
18use ignore::WalkBuilder;
19use serde::{Deserialize, Serialize};
20use uuid::Uuid;
21
22use sloc_config::{
23    AppConfig, BinaryFileBehavior, BlankInBlockCommentPolicy, ContinuationLinePolicy,
24    FailureBehavior, MixedLinePolicy,
25};
26use sloc_languages::{
27    analyze_text, detect_language, supported_languages, AnalysisOptions, Language, ParseMode,
28    RawLineCounts,
29};
30
/// Three-way outcome for metadata-level policy checks.
///
/// Returned by `check_metadata_policy` and matched on in `analyze_candidate_file`.
enum MetadataPolicyOutcome {
    /// Skip this file — include the record in output.
    Skip(Box<FileRecord>),
    /// Exclude this file entirely — no record in output (include-glob miss).
    Exclude,
    /// Continue to content checks.
    Continue,
}
40
/// Final disposition of a candidate file, serialized in `snake_case`.
///
/// `push_record` routes the two `Analyzed*` variants to the analyzed list and every
/// other variant to the skipped list.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FileStatus {
    /// Analyzed with `ParseMode::Lexical` or `ParseMode::TreeSitter`.
    AnalyzedExact,
    /// Analyzed with `ParseMode::LexicalBestEffort`.
    AnalyzedBestEffort,
    /// Content was detected as binary and the configured behavior is to skip it.
    SkippedBinary,
    /// Bytes could not be decoded and the configured behavior is warn-and-skip.
    SkippedDecodeError,
    /// No language could be detected for the file.
    SkippedUnsupported,
    /// Rejected by a policy check (symlink, excluded directory, size limit, exclude
    /// glob, lockfile, vendor/generated/minified detection, disabled language, ...).
    SkippedByPolicy,
    /// Reading the file's metadata or contents failed.
    ErrorInternal,
}
52
/// Line counts for one file after the configured counting policies have been applied
/// to the raw line categories (see `compute_effective_counts`).
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct EffectiveCounts {
    /// Lines counted as code under the active policies.
    pub code_lines: u64,
    /// Lines counted as comments under the active policies.
    pub comment_lines: u64,
    /// Blank-only lines.
    pub blank_lines: u64,
    /// Mixed code/comment lines; non-zero only under
    /// `MixedLinePolicy::SeparateMixedCategory`.
    pub mixed_lines_separate: u64,
}
60
/// Identifies the tool and the individual run that produced an `AnalysisRun`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolMetadata {
    /// Tool name; `assemble_run` sets this to `"sloc"`.
    pub name: String,
    /// Crate version, taken from `CARGO_PKG_VERSION` at compile time.
    pub version: String,
    /// Run identifier: the run timestamp formatted as `%Y%m%d-%H%M`, a dash, then a
    /// freshly generated v4 UUID in simple (undashed) form.
    pub run_id: String,
    /// UTC time captured when the run was assembled.
    pub timestamp_utc: DateTime<Utc>,
}
68
/// Describes the machine and invocation context of a run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnvironmentMetadata {
    /// Value of `std::env::consts::OS`.
    pub operating_system: String,
    /// Value of `std::env::consts::ARCH`.
    pub architecture: String,
    /// The `runtime_mode` string passed to `analyze` by the caller.
    pub runtime_mode: String,
    /// Result of `get_current_username()`; `"unknown"` when it cannot be determined.
    pub initiator_username: String,
    /// Result of `get_hostname()`; `"unknown"` when it cannot be determined.
    pub initiator_hostname: String,
}
77
/// Run-wide totals, built by `build_summary`.
///
/// All line and symbol totals are summed over analyzed files only; skipped files
/// contribute to the file counters alone.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SummaryTotals {
    /// Number of analyzed plus skipped file records.
    pub files_considered: u64,
    /// Number of analyzed file records.
    pub files_analyzed: u64,
    /// Number of skipped file records.
    pub files_skipped: u64,
    /// Sum of raw physical line counts.
    pub total_physical_lines: u64,
    /// Sum of effective code lines.
    pub code_lines: u64,
    /// Sum of effective comment lines.
    pub comment_lines: u64,
    /// Sum of effective blank lines.
    pub blank_lines: u64,
    /// Sum of lines reported in the separate mixed category.
    pub mixed_lines_separate: u64,
    /// Sum of per-file function counts; defaults to 0 when absent on deserialization.
    #[serde(default)]
    pub functions: u64,
    /// Sum of per-file class counts; defaults to 0 when absent on deserialization.
    #[serde(default)]
    pub classes: u64,
    /// Sum of per-file variable counts; defaults to 0 when absent on deserialization.
    #[serde(default)]
    pub variables: u64,
    /// Sum of per-file import counts; defaults to 0 when absent on deserialization.
    #[serde(default)]
    pub imports: u64,
}
97
/// Totals for all analyzed files of a single language (see `build_language_summaries`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageSummary {
    /// The language these totals belong to.
    pub language: Language,
    /// Number of analyzed files detected as this language.
    pub files: u64,
    /// Sum of raw physical line counts.
    pub total_physical_lines: u64,
    /// Sum of effective code lines.
    pub code_lines: u64,
    /// Sum of effective comment lines.
    pub comment_lines: u64,
    /// Sum of effective blank lines.
    pub blank_lines: u64,
    /// Sum of lines reported in the separate mixed category.
    pub mixed_lines_separate: u64,
    /// Sum of per-file function counts; defaults to 0 when absent on deserialization.
    #[serde(default)]
    pub functions: u64,
    /// Sum of per-file class counts; defaults to 0 when absent on deserialization.
    #[serde(default)]
    pub classes: u64,
    /// Sum of per-file variable counts; defaults to 0 when absent on deserialization.
    #[serde(default)]
    pub variables: u64,
    /// Sum of per-file import counts; defaults to 0 when absent on deserialization.
    #[serde(default)]
    pub imports: u64,
}
116
/// Everything recorded about one candidate file, whether it was analyzed or skipped.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileRecord {
    /// Path as visited, with backslashes replaced by forward slashes.
    pub path: String,
    /// Path relative to the root it was discovered under (the full path when it is not
    /// below that root), with forward slashes.
    pub relative_path: String,
    /// Detected language; `None` on skipped records.
    pub language: Option<Language>,
    /// File size in bytes as reported by the file metadata (0 when metadata was unreadable).
    pub size_bytes: u64,
    /// Encoding reported by the decoder; `None` on skipped records.
    pub detected_encoding: Option<String>,
    /// Raw per-category line counts from the language analyzer (defaults on skipped records).
    pub raw_line_categories: RawLineCounts,
    /// Counts after the configured policies were applied (defaults on skipped records).
    pub effective_counts: EffectiveCounts,
    /// How the file was handled.
    pub status: FileStatus,
    /// Decode and analysis warnings, or the reason the file was skipped.
    pub warnings: Vec<String>,
    /// Whether the file was classified as generated.
    pub generated: bool,
    /// Whether the file was classified as minified.
    pub minified: bool,
    /// Whether the file lies on a vendor path.
    pub vendor: bool,
    /// Parse mode used by the analyzer; `None` on skipped records.
    pub parse_mode: Option<ParseMode>,
    /// Name of the git submodule containing the file, set by `process_submodules`;
    /// omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub submodule: Option<String>,
}
135
/// Per-submodule aggregated stats produced when `submodule_breakdown` is enabled.
///
/// NOTE(review): the builder (`build_submodule_summaries`) is outside this view; the
/// field notes below are inferred from the field names and parallel summary types —
/// confirm against the builder.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SubmoduleSummary {
    /// Submodule name (presumably the name from `.gitmodules`).
    pub name: String,
    /// Submodule path (presumably relative to the scanned root).
    pub relative_path: String,
    /// Number of analyzed files attributed to the submodule.
    pub files_analyzed: u64,
    /// Physical line total for the submodule.
    pub total_physical_lines: u64,
    /// Code line total for the submodule.
    pub code_lines: u64,
    /// Comment line total for the submodule.
    pub comment_lines: u64,
    /// Blank line total for the submodule.
    pub blank_lines: u64,
    /// Per-language breakdown within the submodule.
    pub language_summaries: Vec<LanguageSummary>,
}
148
/// Complete result of one `analyze` call: metadata, configuration, totals and
/// per-file records.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnalysisRun {
    /// Tool name, version, run id and timestamp.
    pub tool: ToolMetadata,
    /// OS, architecture, runtime mode and initiating user/host.
    pub environment: EnvironmentMetadata,
    /// Clone of the configuration the run was executed with.
    pub effective_configuration: AppConfig,
    /// Configured root paths, rendered with forward slashes.
    pub input_roots: Vec<String>,
    /// Run-wide totals.
    pub summary_totals: SummaryTotals,
    /// Totals grouped by language.
    pub totals_by_language: Vec<LanguageSummary>,
    /// Records of analyzed files, sorted by relative path.
    pub per_file_records: Vec<FileRecord>,
    /// Records of skipped files, sorted by relative path.
    pub skipped_file_records: Vec<FileRecord>,
    /// Discovery warnings plus every per-file warning prefixed with its relative path.
    pub warnings: Vec<String>,
    /// Non-empty only when `discovery.submodule_breakdown` is enabled.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub submodule_summaries: Vec<SubmoduleSummary>,
    /// Short git commit SHA (output of `git rev-parse --short HEAD`) at scan time, if
    /// the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_short: Option<String>,
    /// Full git commit SHA at scan time, if the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_long: Option<String>,
    /// Git branch active at scan time, if the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_branch: Option<String>,
    /// Author of the last git commit at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_author: Option<String>,
    /// Comma-separated git tags pointing at HEAD at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_tags: Option<String>,
}
179
/// Runs `git <args>` with `dir` as the working directory and returns its trimmed stdout.
///
/// Yields `None` when the process cannot be spawned, exits unsuccessfully, prints
/// non-UTF-8 output, or prints nothing but whitespace.
fn run_git_in(dir: &Path, args: &[&str]) -> Option<String> {
    let output = std::process::Command::new("git")
        .current_dir(dir)
        .args(args)
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }

    let stdout = String::from_utf8(output.stdout).ok()?;
    let trimmed = stdout.trim();
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}
191
/// Git details gathered for a run by `detect_git_for_run`; every field is `None` when
/// the corresponding git command fails or prints nothing.
#[derive(Default)]
struct GitInfo {
    /// Output of `git rev-parse --short HEAD`.
    commit_short: Option<String>,
    /// Output of `git rev-parse HEAD`.
    commit_long: Option<String>,
    /// Output of `git branch --show-current`.
    branch: Option<String>,
    /// Output of `git log --format=%an -1` (author name of the latest commit).
    author: Option<String>,
    /// Tags from `git tag --points-at HEAD`, joined with `", "`.
    tags: Option<String>,
}
200
201fn detect_git_for_run(project_path: &Path) -> GitInfo {
202    GitInfo {
203        commit_short: run_git_in(project_path, &["rev-parse", "--short", "HEAD"]),
204        commit_long: run_git_in(project_path, &["rev-parse", "HEAD"]),
205        branch: run_git_in(project_path, &["branch", "--show-current"]),
206        author: run_git_in(project_path, &["log", "--format=%an", "-1"]),
207        tags: run_git_in(project_path, &["tag", "--points-at", "HEAD"]).map(|t| {
208            t.lines()
209                .filter(|l| !l.is_empty())
210                .collect::<Vec<_>>()
211                .join(", ")
212        }),
213    }
214}
215
/// Returns the name of the user running the process.
///
/// Checks the `USERNAME` environment variable first, then `USER`. A variable that is
/// unset, not valid Unicode, or blank is ignored so that an empty value never ends up
/// in the report; `"unknown"` is returned when neither variable is usable.
fn get_current_username() -> String {
    ["USERNAME", "USER"]
        .iter()
        .filter_map(|key| std::env::var(key).ok())
        .find(|value| !value.trim().is_empty())
        .unwrap_or_else(|| "unknown".to_string())
}
221
/// Returns the host name of the machine running the process.
///
/// Sources are tried in order: the `COMPUTERNAME` environment variable, the `HOSTNAME`
/// environment variable, then the contents of `/etc/hostname`. Each value is trimmed,
/// and a source that is missing, unreadable or blank is skipped so that an empty host
/// name never ends up in the report; `"unknown"` is returned when no source is usable.
fn get_hostname() -> String {
    /// Trims `value` and rejects it when nothing is left.
    fn non_blank(value: String) -> Option<String> {
        let trimmed = value.trim();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed.to_string())
        }
    }

    std::env::var("COMPUTERNAME")
        .ok()
        .and_then(non_blank)
        .or_else(|| std::env::var("HOSTNAME").ok().and_then(non_blank))
        .or_else(|| {
            std::fs::read_to_string("/etc/hostname")
                .ok()
                .and_then(non_blank)
        })
        .unwrap_or_else(|| "unknown".to_string())
}
228
229/// Walk a single directory root and collect file records into the output vectors.
230#[allow(clippy::too_many_arguments)]
231fn walk_root(
232    root: &Path,
233    config: &AppConfig,
234    include_globs: Option<&GlobSet>,
235    exclude_globs: Option<&GlobSet>,
236    enabled_languages: Option<&BTreeSet<Language>>,
237    seen_paths: &mut HashSet<PathBuf>,
238    analyzed: &mut Vec<FileRecord>,
239    skipped: &mut Vec<FileRecord>,
240    warnings: &mut Vec<String>,
241) -> Result<()> {
242    let mut builder = WalkBuilder::new(root);
243    builder
244        .follow_links(config.discovery.follow_symlinks)
245        .hidden(config.discovery.ignore_hidden_files)
246        .ignore(config.discovery.honor_ignore_files)
247        .parents(config.discovery.honor_ignore_files)
248        .git_ignore(config.discovery.honor_ignore_files)
249        .git_global(config.discovery.honor_ignore_files)
250        .git_exclude(config.discovery.honor_ignore_files);
251
252    for entry in builder.build() {
253        let entry = match entry {
254            Ok(entry) => entry,
255            Err(err) => {
256                warnings.push(format!("discovery warning: {err}"));
257                continue;
258            }
259        };
260
261        let path = entry.into_path();
262        if path.is_dir() || !seen_paths.insert(path.clone()) {
263            continue;
264        }
265
266        if let Some(record) = analyze_candidate_file(
267            &path,
268            root,
269            config,
270            include_globs,
271            exclude_globs,
272            enabled_languages,
273        )? {
274            push_record(record, analyzed, skipped, warnings);
275        }
276    }
277
278    Ok(())
279}
280
281/// Label each analyzed file with its submodule and build per-submodule summaries.
282fn process_submodules(config: &AppConfig, analyzed: &mut [FileRecord]) -> Vec<SubmoduleSummary> {
283    let root = config.discovery.root_paths[0]
284        .canonicalize()
285        .unwrap_or_else(|_| config.discovery.root_paths[0].clone());
286    let submodules = detect_submodules(&root);
287    if submodules.is_empty() {
288        return Vec::new();
289    }
290
291    for file in analyzed.iter_mut() {
292        for (name, sub_path) in &submodules {
293            let prefix = sub_path.to_string_lossy().replace('\\', "/");
294            let rel = &file.relative_path;
295            if rel == &prefix || rel.starts_with(&format!("{prefix}/")) {
296                file.submodule = Some(name.clone());
297                break;
298            }
299        }
300    }
301
302    build_submodule_summaries(analyzed, &submodules)
303}
304
305/// Assemble the final `AnalysisRun` from collected records and metadata.
306fn assemble_run(
307    config: &AppConfig,
308    runtime_mode: &str,
309    analyzed: Vec<FileRecord>,
310    skipped: Vec<FileRecord>,
311    warnings: Vec<String>,
312    submodule_summaries: Vec<SubmoduleSummary>,
313) -> AnalysisRun {
314    let summary = build_summary(&analyzed, &skipped);
315    let language_summaries = build_language_summaries(&analyzed);
316
317    let first_root = config
318        .discovery
319        .root_paths
320        .first()
321        .map(|p| p.canonicalize().unwrap_or_else(|_| p.clone()));
322    let git = first_root
323        .as_deref()
324        .map(detect_git_for_run)
325        .unwrap_or_default();
326
327    let now = Utc::now();
328    let run_id = {
329        let uuid_suffix = Uuid::new_v4().simple().to_string();
330        format!("{}-{}", now.format("%Y%m%d-%H%M"), uuid_suffix)
331    };
332
333    AnalysisRun {
334        tool: ToolMetadata {
335            name: "sloc".into(),
336            version: env!("CARGO_PKG_VERSION").into(),
337            run_id,
338            timestamp_utc: now,
339        },
340        environment: EnvironmentMetadata {
341            operating_system: std::env::consts::OS.into(),
342            architecture: std::env::consts::ARCH.into(),
343            runtime_mode: runtime_mode.into(),
344            initiator_username: get_current_username(),
345            initiator_hostname: get_hostname(),
346        },
347        effective_configuration: config.clone(),
348        input_roots: config
349            .discovery
350            .root_paths
351            .iter()
352            .map(|p| path_to_string(p))
353            .collect(),
354        summary_totals: summary,
355        totals_by_language: language_summaries,
356        per_file_records: analyzed,
357        skipped_file_records: skipped,
358        warnings,
359        submodule_summaries,
360        git_commit_short: git.commit_short,
361        git_commit_long: git.commit_long,
362        git_branch: git.branch,
363        git_commit_author: git.author,
364        git_tags: git.tags,
365    }
366}
367
368/// # Errors
369///
370/// Returns an error if the config is invalid, root paths cannot be walked, or any file
371/// analysis step fails in a way that cannot be recovered from.
372#[allow(clippy::too_many_lines)]
373pub fn analyze(config: &AppConfig, runtime_mode: &str) -> Result<AnalysisRun> {
374    config.validate()?;
375
376    if config.discovery.root_paths.is_empty() {
377        anyhow::bail!("no input paths were provided");
378    }
379
380    let include_globs = compile_globset(&config.discovery.include_globs)?;
381    let exclude_globs = compile_globset(&config.discovery.exclude_globs)?;
382    let enabled_languages = parse_enabled_languages(&config.analysis.enabled_languages)?;
383
384    let mut analyzed = Vec::new();
385    let mut skipped = Vec::new();
386    let mut warnings = Vec::new();
387    let mut seen_paths = HashSet::new();
388
389    for root in &config.discovery.root_paths {
390        let root = root.canonicalize().unwrap_or_else(|_| root.clone());
391
392        if root.is_file() {
393            if let Some(record) = analyze_candidate_file(
394                &root,
395                root.parent().unwrap_or_else(|| Path::new(".")),
396                config,
397                include_globs.as_ref(),
398                exclude_globs.as_ref(),
399                enabled_languages.as_ref(),
400            )? {
401                push_record(record, &mut analyzed, &mut skipped, &mut warnings);
402            }
403            continue;
404        }
405
406        walk_root(
407            &root,
408            config,
409            include_globs.as_ref(),
410            exclude_globs.as_ref(),
411            enabled_languages.as_ref(),
412            &mut seen_paths,
413            &mut analyzed,
414            &mut skipped,
415            &mut warnings,
416        )?;
417    }
418
419    analyzed.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
420    skipped.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
421
422    // Submodule detection: label each file with its submodule and build per-submodule summaries.
423    let submodule_summaries = if config.discovery.submodule_breakdown {
424        process_submodules(config, &mut analyzed)
425    } else {
426        Vec::new()
427    };
428
429    Ok(assemble_run(
430        config,
431        runtime_mode,
432        analyzed,
433        skipped,
434        warnings,
435        submodule_summaries,
436    ))
437}
438
439fn push_record(
440    record: FileRecord,
441    analyzed: &mut Vec<FileRecord>,
442    skipped: &mut Vec<FileRecord>,
443    warnings: &mut Vec<String>,
444) {
445    warnings.extend(
446        record
447            .warnings
448            .iter()
449            .map(|warning| format!("{}: {warning}", record.relative_path)),
450    );
451
452    match record.status {
453        FileStatus::AnalyzedExact | FileStatus::AnalyzedBestEffort => analyzed.push(record),
454        _ => skipped.push(record),
455    }
456}
457
458/// Apply metadata-level policy checks (symlink, name, dir exclusion, size, globs, lockfile).
459/// Returns `Skip(record)` to skip, `Exclude` to omit from output entirely (include-glob miss),
460/// or `Continue` to proceed to content checks.
461#[allow(clippy::too_many_arguments)]
462fn check_metadata_policy(
463    path: &Path,
464    root: &Path,
465    relative_path: &str,
466    metadata: &fs::Metadata,
467    config: &AppConfig,
468    include_globs: Option<&GlobSet>,
469    exclude_globs: Option<&GlobSet>,
470) -> MetadataPolicyOutcome {
471    if metadata.file_type().is_symlink() && !config.discovery.follow_symlinks {
472        return MetadataPolicyOutcome::Skip(Box::new(skipped_record(
473            path,
474            root,
475            metadata.len(),
476            FileStatus::SkippedByPolicy,
477            vec!["symlink skipped by policy".into()],
478        )));
479    }
480
481    if file_name_eq(path, ".gitignore") {
482        return MetadataPolicyOutcome::Skip(Box::new(skipped_record(
483            path,
484            root,
485            metadata.len(),
486            FileStatus::SkippedByPolicy,
487            vec![".gitignore is always excluded".into()],
488        )));
489    }
490
491    if is_excluded_dir_path(path, &config.discovery.excluded_directories) {
492        return MetadataPolicyOutcome::Skip(Box::new(skipped_record(
493            path,
494            root,
495            metadata.len(),
496            FileStatus::SkippedByPolicy,
497            vec!["path matched excluded directory setting".into()],
498        )));
499    }
500
501    if metadata.len() > config.discovery.max_file_size_bytes {
502        return MetadataPolicyOutcome::Skip(Box::new(skipped_record(
503            path,
504            root,
505            metadata.len(),
506            FileStatus::SkippedByPolicy,
507            vec![format!(
508                "file exceeded max_file_size_bytes ({})",
509                config.discovery.max_file_size_bytes
510            )],
511        )));
512    }
513
514    if let Some(globs) = include_globs {
515        if !globs.is_match(Path::new(relative_path)) && !globs.is_match(path) {
516            return MetadataPolicyOutcome::Exclude;
517        }
518    }
519
520    if let Some(globs) = exclude_globs {
521        if globs.is_match(Path::new(relative_path)) || globs.is_match(path) {
522            return MetadataPolicyOutcome::Skip(Box::new(skipped_record(
523                path,
524                root,
525                metadata.len(),
526                FileStatus::SkippedByPolicy,
527                vec!["path matched exclude glob".into()],
528            )));
529        }
530    }
531
532    if is_known_lockfile(path) && !config.analysis.include_lockfiles {
533        return MetadataPolicyOutcome::Skip(Box::new(skipped_record(
534            path,
535            root,
536            metadata.len(),
537            FileStatus::SkippedByPolicy,
538            vec!["lockfile skipped by default policy".into()],
539        )));
540    }
541
542    MetadataPolicyOutcome::Continue
543}
544
545/// Apply content-level policy checks (vendor, generated, minified, binary).
546/// Returns `(vendor, generated, minified, skip_record)` where `skip_record` is `Some` when
547/// the file should be skipped.
548fn check_content_policy(
549    path: &Path,
550    root: &Path,
551    size_bytes: u64,
552    bytes: &[u8],
553    config: &AppConfig,
554) -> (bool, bool, bool, Option<FileRecord>) {
555    let vendor = is_vendor_path(path);
556    if vendor && config.analysis.vendor_directory_detection {
557        return (
558            vendor,
559            false,
560            false,
561            Some(skipped_record(
562                path,
563                root,
564                size_bytes,
565                FileStatus::SkippedByPolicy,
566                vec!["vendor file skipped by policy".into()],
567            )),
568        );
569    }
570
571    let generated = config.analysis.generated_file_detection && looks_generated(path, bytes);
572    if generated {
573        return (
574            vendor,
575            generated,
576            false,
577            Some(skipped_record(
578                path,
579                root,
580                size_bytes,
581                FileStatus::SkippedByPolicy,
582                vec!["generated file skipped by policy".into()],
583            )),
584        );
585    }
586
587    let minified = config.analysis.minified_file_detection && looks_minified(path, bytes);
588    if minified {
589        return (
590            vendor,
591            generated,
592            minified,
593            Some(skipped_record(
594                path,
595                root,
596                size_bytes,
597                FileStatus::SkippedByPolicy,
598                vec!["minified file skipped by policy".into()],
599            )),
600        );
601    }
602
603    (vendor, generated, minified, None)
604}
605
606/// Decode file bytes to a UTF-8 string, handling binary detection and decode failures.
607fn decode_file_contents(
608    path: &Path,
609    root: &Path,
610    size_bytes: u64,
611    bytes: &[u8],
612    config: &AppConfig,
613) -> Result<Option<(String, String, Vec<String>)>> {
614    if is_binary(bytes) {
615        return match config.analysis.binary_file_behavior {
616            BinaryFileBehavior::Skip => Ok(None),
617            BinaryFileBehavior::Fail => {
618                anyhow::bail!("binary file encountered: {}", path.display())
619            }
620        };
621    }
622
623    match decode_bytes(bytes) {
624        Ok(result) => Ok(Some(result)),
625        Err(err) => match config.analysis.decode_failure_behavior {
626            FailureBehavior::WarnSkip => {
627                // Caller will handle the None as a SkippedDecodeError record.
628                // We use a sentinel: return Ok(None) but encode the error into a field.
629                // Instead, propagate as a skipped record via the caller.
630                let _ = (path, root, size_bytes); // suppress unused warnings
631                Err(anyhow::anyhow!("__decode_warn__: {err}"))
632            }
633            FailureBehavior::Fail => {
634                anyhow::bail!("decode failure for {}: {err}", path.display())
635            }
636        },
637    }
638}
639
640#[allow(clippy::too_many_lines)]
641fn analyze_candidate_file(
642    path: &Path,
643    root: &Path,
644    config: &AppConfig,
645    include_globs: Option<&GlobSet>,
646    exclude_globs: Option<&GlobSet>,
647    enabled_languages: Option<&BTreeSet<Language>>,
648) -> Result<Option<FileRecord>> {
649    let metadata = match fs::symlink_metadata(path) {
650        Ok(metadata) => metadata,
651        Err(err) => {
652            return Ok(Some(skipped_record(
653                path,
654                root,
655                0,
656                FileStatus::ErrorInternal,
657                vec![format!("failed to read metadata: {err}")],
658            )));
659        }
660    };
661
662    let relative_path = relative_path_string(path, root);
663
664    // Metadata-level policy checks.
665    match check_metadata_policy(
666        path,
667        root,
668        &relative_path,
669        &metadata,
670        config,
671        include_globs,
672        exclude_globs,
673    ) {
674        MetadataPolicyOutcome::Skip(record) => return Ok(Some(*record)),
675        MetadataPolicyOutcome::Exclude => return Ok(None),
676        MetadataPolicyOutcome::Continue => {}
677    }
678
679    let bytes = match fs::read(path) {
680        Ok(bytes) => bytes,
681        Err(err) => {
682            return Ok(Some(skipped_record(
683                path,
684                root,
685                metadata.len(),
686                FileStatus::ErrorInternal,
687                vec![format!("failed to read file: {err}")],
688            )));
689        }
690    };
691
692    // Content-level policy checks (vendor, generated, minified).
693    let (vendor, generated, minified, skip_record) =
694        check_content_policy(path, root, metadata.len(), &bytes, config);
695    if let Some(record) = skip_record {
696        return Ok(Some(record));
697    }
698
699    // Decode content, handling binary and decode failures.
700    let (text, encoding, decode_warnings) =
701        match decode_file_contents(path, root, metadata.len(), &bytes, config) {
702            Ok(Some(result)) => result,
703            Ok(None) => {
704                return Ok(Some(skipped_record(
705                    path,
706                    root,
707                    metadata.len(),
708                    FileStatus::SkippedBinary,
709                    vec!["binary file skipped by default".into()],
710                )));
711            }
712            Err(err) => {
713                let msg = err.to_string();
714                if let Some(warn_msg) = msg.strip_prefix("__decode_warn__: ") {
715                    return Ok(Some(skipped_record(
716                        path,
717                        root,
718                        metadata.len(),
719                        FileStatus::SkippedDecodeError,
720                        vec![warn_msg.to_string()],
721                    )));
722                }
723                return Err(err);
724            }
725        };
726
727    let first_line = text.lines().next();
728    let language = detect_language(
729        path,
730        first_line,
731        &config.analysis.extension_overrides,
732        config.analysis.shebang_detection,
733    );
734
735    let Some(language) = language else {
736        return Ok(Some(skipped_record(
737            path,
738            root,
739            metadata.len(),
740            FileStatus::SkippedUnsupported,
741            vec!["unsupported or undetected language".into()],
742        )));
743    };
744
745    if let Some(enabled) = enabled_languages {
746        if !enabled.contains(&language) {
747            return Ok(Some(skipped_record(
748                path,
749                root,
750                metadata.len(),
751                FileStatus::SkippedByPolicy,
752                vec![format!(
753                    "language {} disabled by configuration",
754                    language.display_name()
755                )],
756            )));
757        }
758    }
759
760    let ieee_opts = AnalysisOptions {
761        blank_in_block_comment_as_comment: config.analysis.blank_in_block_comment_policy
762            == BlankInBlockCommentPolicy::CountAsComment,
763        collapse_continuation_lines: config.analysis.continuation_line_policy
764            == ContinuationLinePolicy::CollapseToLogical,
765    };
766    let analysis = analyze_text(language, &text, ieee_opts);
767    let effective_counts = compute_effective_counts(
768        &analysis.raw,
769        config.analysis.mixed_line_policy,
770        config.analysis.python_docstrings_as_comments,
771        config.analysis.count_compiler_directives,
772    );
773
774    let mut warnings = decode_warnings;
775    warnings.extend(analysis.warnings.clone());
776
777    Ok(Some(FileRecord {
778        path: path_to_string(path),
779        relative_path,
780        language: Some(language),
781        size_bytes: metadata.len(),
782        detected_encoding: Some(encoding),
783        raw_line_categories: analysis.raw,
784        effective_counts,
785        status: match analysis.parse_mode {
786            ParseMode::Lexical | ParseMode::TreeSitter => FileStatus::AnalyzedExact,
787            ParseMode::LexicalBestEffort => FileStatus::AnalyzedBestEffort,
788        },
789        warnings,
790        generated,
791        minified,
792        vendor,
793        parse_mode: Some(analysis.parse_mode),
794        submodule: None,
795    }))
796}
797
798const fn compute_effective_counts(
799    raw: &RawLineCounts,
800    mixed_line_policy: MixedLinePolicy,
801    python_docstrings_as_comments: bool,
802    count_compiler_directives: bool,
803) -> EffectiveCounts {
804    let mut effective = EffectiveCounts {
805        code_lines: raw.code_only_lines,
806        comment_lines: raw.single_comment_only_lines + raw.multi_comment_only_lines,
807        blank_lines: raw.blank_only_lines,
808        mixed_lines_separate: 0,
809    };
810
811    if python_docstrings_as_comments {
812        effective.comment_lines += raw.docstring_comment_lines;
813    } else {
814        effective.code_lines += raw.docstring_comment_lines;
815    }
816
817    let mixed_total = raw.mixed_code_single_comment_lines + raw.mixed_code_multi_comment_lines;
818    match mixed_line_policy {
819        MixedLinePolicy::CodeOnly => effective.code_lines += mixed_total,
820        MixedLinePolicy::CodeAndComment => {
821            effective.code_lines += mixed_total;
822            effective.comment_lines += mixed_total;
823        }
824        MixedLinePolicy::CommentOnly => effective.comment_lines += mixed_total,
825        MixedLinePolicy::SeparateMixedCategory => effective.mixed_lines_separate += mixed_total,
826    }
827
828    // IEEE 1045-1992 ยง4.2: optionally exclude preprocessor/compiler directives from code SLOC.
829    // compiler_directive_lines is a subset of code_only_lines, so subtract it directly.
830    if !count_compiler_directives {
831        effective.code_lines = effective
832            .code_lines
833            .saturating_sub(raw.compiler_directive_lines);
834    }
835
836    effective
837}
838
839fn build_summary(analyzed: &[FileRecord], skipped: &[FileRecord]) -> SummaryTotals {
840    let mut summary = SummaryTotals {
841        files_considered: (analyzed.len() + skipped.len()) as u64,
842        files_analyzed: analyzed.len() as u64,
843        files_skipped: skipped.len() as u64,
844        ..Default::default()
845    };
846
847    for record in analyzed {
848        summary.total_physical_lines += record.raw_line_categories.total_physical_lines;
849        summary.code_lines += record.effective_counts.code_lines;
850        summary.comment_lines += record.effective_counts.comment_lines;
851        summary.blank_lines += record.effective_counts.blank_lines;
852        summary.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
853        summary.functions += record.raw_line_categories.functions;
854        summary.classes += record.raw_line_categories.classes;
855        summary.variables += record.raw_line_categories.variables;
856        summary.imports += record.raw_line_categories.imports;
857    }
858
859    summary
860}
861
862fn build_language_summaries(analyzed: &[FileRecord]) -> Vec<LanguageSummary> {
863    let mut by_language: BTreeMap<Language, LanguageSummary> = BTreeMap::new();
864    for record in analyzed {
865        let Some(language) = record.language else {
866            continue;
867        };
868        let entry = by_language.entry(language).or_insert(LanguageSummary {
869            language,
870            files: 0,
871            total_physical_lines: 0,
872            code_lines: 0,
873            comment_lines: 0,
874            blank_lines: 0,
875            mixed_lines_separate: 0,
876            functions: 0,
877            classes: 0,
878            variables: 0,
879            imports: 0,
880        });
881        entry.files += 1;
882        entry.total_physical_lines += record.raw_line_categories.total_physical_lines;
883        entry.code_lines += record.effective_counts.code_lines;
884        entry.comment_lines += record.effective_counts.comment_lines;
885        entry.blank_lines += record.effective_counts.blank_lines;
886        entry.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
887        entry.functions += record.raw_line_categories.functions;
888        entry.classes += record.raw_line_categories.classes;
889        entry.variables += record.raw_line_categories.variables;
890        entry.imports += record.raw_line_categories.imports;
891    }
892
893    by_language.into_values().collect()
894}
895
896fn skipped_record(
897    path: &Path,
898    root: &Path,
899    size_bytes: u64,
900    status: FileStatus,
901    warnings: Vec<String>,
902) -> FileRecord {
903    FileRecord {
904        path: path_to_string(path),
905        relative_path: relative_path_string(path, root),
906        language: None,
907        size_bytes,
908        detected_encoding: None,
909        raw_line_categories: RawLineCounts::default(),
910        effective_counts: EffectiveCounts::default(),
911        status,
912        warnings,
913        generated: false,
914        minified: false,
915        vendor: false,
916        parse_mode: None,
917        submodule: None,
918    }
919}
920
/// Renders `path` relative to `root`, using `/` as the separator.
///
/// If `path` does not start with `root`, the whole `path` is rendered
/// instead. Non-UTF-8 sequences are replaced lossily, and every backslash in
/// the result is turned into a forward slash.
fn relative_path_string(path: &Path, root: &Path) -> String {
    let shown = match path.strip_prefix(root) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };
    let lossy = shown.to_string_lossy();
    lossy.replace('\\', "/")
}
927
/// Renders `path` as a string with every backslash replaced by `/`.
///
/// Non-UTF-8 sequences are replaced lossily.
fn path_to_string(path: &Path) -> String {
    let lossy = path.to_string_lossy();
    lossy.replace('\\', "/")
}
931
/// Parse `.gitmodules` in `root` and return `(name, relative_path)` for each submodule found.
///
/// Returns an empty vector when `.gitmodules` is missing or cannot be read as
/// UTF-8 text. A `[submodule "<name>"]` section is only reported if a `path`
/// entry was seen for it; other keys are ignored. Malformed header lines are
/// skipped rather than causing a panic.
#[must_use]
pub fn detect_submodules(root: &Path) -> Vec<(String, PathBuf)> {
    let gitmodules = root.join(".gitmodules");
    if !gitmodules.is_file() {
        return Vec::new();
    }
    let Ok(content) = fs::read_to_string(&gitmodules) else {
        return Vec::new();
    };

    let mut result = Vec::new();
    let mut current_name: Option<String> = None;
    let mut current_path: Option<PathBuf> = None;

    for line in content.lines() {
        let trimmed = line.trim();

        // Strip prefix and suffix separately: slicing by fixed offsets would
        // panic on a line such as `[submodule "]`, where the two overlap.
        let header = trimmed
            .strip_prefix("[submodule \"")
            .and_then(|rest| rest.strip_suffix("\"]"));

        if let Some(name) = header {
            // A new section starts: flush the previous one if it was complete.
            if let (Some(name), Some(path)) = (current_name.take(), current_path.take()) {
                result.push((name, path));
            }
            current_name = Some(name.to_string());
        } else if let Some(value) = trimmed
            .strip_prefix("path")
            // Require `=` right after the key (whitespace allowed) so that
            // other keys merely starting with "path" are not mistaken for it.
            .and_then(|rest| rest.trim_start().strip_prefix('='))
        {
            current_path = Some(PathBuf::from(value.trim()));
        }
    }
    // Flush the final section, which has no following header to trigger it.
    if let (Some(name), Some(path)) = (current_name, current_path) {
        result.push((name, path));
    }

    result
}
968
969fn build_submodule_summaries(
970    analyzed: &[FileRecord],
971    submodules: &[(String, PathBuf)],
972) -> Vec<SubmoduleSummary> {
973    submodules
974        .iter()
975        .map(|(name, path)| {
976            let files: Vec<&FileRecord> = analyzed
977                .iter()
978                .filter(|f| f.submodule.as_deref() == Some(name.as_str()))
979                .collect();
980
981            let files_analyzed = files.len() as u64;
982            let total_physical_lines = files
983                .iter()
984                .map(|f| f.raw_line_categories.total_physical_lines)
985                .sum();
986            let code_lines = files.iter().map(|f| f.effective_counts.code_lines).sum();
987            let comment_lines = files.iter().map(|f| f.effective_counts.comment_lines).sum();
988            let blank_lines = files.iter().map(|f| f.effective_counts.blank_lines).sum();
989            let language_summaries = build_language_summaries_from_slice(&files);
990
991            SubmoduleSummary {
992                name: name.clone(),
993                relative_path: path.to_string_lossy().replace('\\', "/"),
994                files_analyzed,
995                total_physical_lines,
996                code_lines,
997                comment_lines,
998                blank_lines,
999                language_summaries,
1000            }
1001        })
1002        .filter(|s| s.files_analyzed > 0)
1003        .collect()
1004}
1005
1006fn build_language_summaries_from_slice(files: &[&FileRecord]) -> Vec<LanguageSummary> {
1007    let mut map: BTreeMap<String, LanguageSummary> = BTreeMap::new();
1008    for file in files {
1009        if let Some(lang) = file.language {
1010            let entry = map
1011                .entry(lang.display_name().to_string())
1012                .or_insert_with(|| LanguageSummary {
1013                    language: lang,
1014                    files: 0,
1015                    total_physical_lines: 0,
1016                    code_lines: 0,
1017                    comment_lines: 0,
1018                    blank_lines: 0,
1019                    mixed_lines_separate: 0,
1020                    functions: 0,
1021                    classes: 0,
1022                    variables: 0,
1023                    imports: 0,
1024                });
1025            entry.files += 1;
1026            let r = &file.raw_line_categories;
1027            entry.total_physical_lines += r.total_physical_lines;
1028            entry.code_lines += file.effective_counts.code_lines;
1029            entry.comment_lines += file.effective_counts.comment_lines;
1030            entry.blank_lines += file.effective_counts.blank_lines;
1031            entry.mixed_lines_separate += file.effective_counts.mixed_lines_separate;
1032        }
1033    }
1034    map.into_values().collect()
1035}
1036
/// Returns `true` when the final component of `path` is valid UTF-8 and
/// equals `expected` exactly (case-sensitive).
fn file_name_eq(path: &Path, expected: &str) -> bool {
    matches!(
        path.file_name().and_then(|name| name.to_str()),
        Some(actual) if actual == expected
    )
}
1042
/// Returns `true` when any component of `path` exactly equals one of
/// `excluded_dirs`.
///
/// Every component is checked, including the final one. Components that are
/// not valid UTF-8 never match.
fn is_excluded_dir_path(path: &Path, excluded_dirs: &[String]) -> bool {
    for component in path.components() {
        let Some(part) = component.as_os_str().to_str() else {
            continue;
        };
        if excluded_dirs.iter().any(|excluded| excluded == part) {
            return true;
        }
    }
    false
}
1051
/// Returns `true` when any component of `path` is named `vendor`,
/// `node_modules` or `packages`.
///
/// Components that are not valid UTF-8 never match.
fn is_vendor_path(path: &Path) -> bool {
    path.components()
        .filter_map(|component| component.as_os_str().to_str())
        .any(|part| part == "vendor" || part == "node_modules" || part == "packages")
}
1060
/// Returns `true` when the file name of `path` is one of the recognised
/// package-manager lockfiles.
///
/// The comparison is exact and case-sensitive. Paths without a UTF-8 file
/// name are never lockfiles.
fn is_known_lockfile(path: &Path) -> bool {
    const LOCKFILE_NAMES: [&str; 7] = [
        "Cargo.lock",
        "package-lock.json",
        "yarn.lock",
        "pnpm-lock.yaml",
        "Pipfile.lock",
        "poetry.lock",
        "composer.lock",
    ];

    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => LOCKFILE_NAMES.contains(&name),
        None => false,
    }
}
1077
/// Heuristically decides whether a file is machine-generated.
///
/// A file counts as generated when its name contains `.generated.` or `.g.`,
/// or when the first 1024 bytes (decoded lossily, ASCII-lowercased) contain
/// `@generated` or `generated by`.
fn looks_generated(path: &Path, bytes: &[u8]) -> bool {
    const NAME_MARKERS: [&str; 2] = [".generated.", ".g."];
    const CONTENT_MARKERS: [&str; 2] = ["@generated", "generated by"];
    const SAMPLE_BYTES: usize = 1024;

    let file_name = path
        .file_name()
        .and_then(|name| name.to_str())
        .unwrap_or("");
    if NAME_MARKERS.iter().any(|&marker| file_name.contains(marker)) {
        return true;
    }

    // Only the head of the file is inspected; markers further in are ignored.
    let head = &bytes[..bytes.len().min(SAMPLE_BYTES)];
    let sample = String::from_utf8_lossy(head).to_ascii_lowercase();
    CONTENT_MARKERS
        .iter()
        .any(|&marker| sample.contains(marker))
}
1090
/// Heuristically decides whether a file is minified.
///
/// A file counts as minified when its name contains `.min.`, or when the
/// first 4096 bytes (decoded lossily) contain a line longer than 2000 bytes
/// while whitespace characters make up less than 1% of the sample.
fn looks_minified(path: &Path, bytes: &[u8]) -> bool {
    let named_minified = path
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| name.contains(".min."));
    if named_minified {
        return true;
    }

    let sample = String::from_utf8_lossy(&bytes[..bytes.len().min(4096)]);

    let mut longest_line = 0;
    for line in sample.lines() {
        longest_line = longest_line.max(line.len());
    }

    let whitespace = sample.chars().filter(|c| c.is_whitespace()).count();
    // `max(1)` keeps an empty sample from passing the density comparison.
    let is_dense = whitespace * 100 < sample.len().max(1);

    longest_line > 2000 && is_dense
}
1105
/// Heuristically decides whether `bytes` is binary content.
///
/// Data starting with a UTF-8 or UTF-16 (LE/BE) byte-order mark is always
/// treated as text. Otherwise the data is binary when a NUL byte occurs
/// within the first 8192 bytes.
fn is_binary(bytes: &[u8]) -> bool {
    const TEXT_BOMS: [&[u8]; 3] = [&[0xEF, 0xBB, 0xBF], &[0xFF, 0xFE], &[0xFE, 0xFF]];
    const SAMPLE_BYTES: usize = 8192;

    // UTF-16 text legitimately contains NUL bytes, so a BOM overrides the scan.
    if TEXT_BOMS.iter().any(|bom| bytes.starts_with(bom)) {
        return false;
    }

    bytes.iter().take(SAMPLE_BYTES).any(|&byte| byte == 0)
}
1117
1118fn decode_bytes(bytes: &[u8]) -> std::result::Result<(String, String, Vec<String>), String> {
1119    if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
1120        let text = String::from_utf8(bytes[3..].to_vec()).map_err(|err| err.to_string())?;
1121        return Ok((text, "utf-8-bom".into(), vec![]));
1122    }
1123
1124    if bytes.starts_with(&[0xFF, 0xFE]) {
1125        let (cow, _, had_errors) = UTF_16LE.decode(&bytes[2..]);
1126        let mut warnings = Vec::new();
1127        if had_errors {
1128            warnings.push("utf-16le decode contained replacement characters".into());
1129        }
1130        return Ok((cow.into_owned(), "utf-16le".into(), warnings));
1131    }
1132
1133    if bytes.starts_with(&[0xFE, 0xFF]) {
1134        let (cow, _, had_errors) = UTF_16BE.decode(&bytes[2..]);
1135        let mut warnings = Vec::new();
1136        if had_errors {
1137            warnings.push("utf-16be decode contained replacement characters".into());
1138        }
1139        return Ok((cow.into_owned(), "utf-16be".into(), warnings));
1140    }
1141
1142    // Multiple statements in the else branch make map_or_else awkward here.
1143    #[allow(clippy::option_if_let_else)]
1144    if let Ok(text) = String::from_utf8(bytes.to_vec()) {
1145        Ok((text, "utf-8".into(), vec![]))
1146    } else {
1147        let (cow, _, had_errors) = WINDOWS_1252.decode(bytes);
1148        let mut warnings = vec!["decoded using windows-1252 fallback".into()];
1149        if had_errors {
1150            warnings.push("fallback decode contained replacement characters".into());
1151        }
1152        Ok((cow.into_owned(), "windows-1252".into(), warnings))
1153    }
1154}
1155
1156fn compile_globset(patterns: &[String]) -> Result<Option<GlobSet>> {
1157    if patterns.is_empty() {
1158        return Ok(None);
1159    }
1160
1161    let mut builder = GlobSetBuilder::new();
1162    for pattern in patterns {
1163        builder
1164            .add(Glob::new(pattern).with_context(|| format!("invalid glob pattern: {pattern}"))?);
1165    }
1166    Ok(Some(
1167        builder.build().context("failed to compile glob filters")?,
1168    ))
1169}
1170
1171fn parse_enabled_languages(enabled: &[String]) -> Result<Option<BTreeSet<Language>>> {
1172    if enabled.is_empty() {
1173        return Ok(None);
1174    }
1175
1176    let supported = supported_languages();
1177    let mut set = BTreeSet::new();
1178    for name in enabled {
1179        let language = Language::from_name(name)
1180            .with_context(|| format!("unsupported language in config: {name}"))?;
1181        if !supported.contains(&language) {
1182            anyhow::bail!("language {name} is not supported in this build");
1183        }
1184        set.insert(language);
1185    }
1186    Ok(Some(set))
1187}
1188
1189/// # Errors
1190///
1191/// Returns an error if serialization fails or the output file cannot be written.
1192pub fn write_json(run: &AnalysisRun, output_path: &Path) -> Result<()> {
1193    let json = serde_json::to_string_pretty(run).context("failed to serialize analysis run")?;
1194    fs::write(output_path, json)
1195        .with_context(|| format!("failed to write JSON output to {}", output_path.display()))
1196}
1197
1198/// # Errors
1199///
1200/// Returns an error if the file cannot be read or the JSON cannot be parsed.
1201pub fn read_json(path: &Path) -> Result<AnalysisRun> {
1202    let contents = fs::read_to_string(path)
1203        .with_context(|| format!("failed to read result file {}", path.display()))?;
1204    serde_json::from_str(&contents)
1205        .with_context(|| format!("failed to parse JSON result {}", path.display()))
1206}
1207
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the effective code and comment totals under the `CodeOnly`
    /// mixed-line policy.
    #[test]
    fn effective_counts_respect_code_only_policy() {
        let raw = RawLineCounts {
            code_only_lines: 2,
            single_comment_only_lines: 1,
            mixed_code_single_comment_lines: 3,
            docstring_comment_lines: 2,
            ..RawLineCounts::default()
        };
        let counts = compute_effective_counts(&raw, MixedLinePolicy::CodeOnly, true, true);
        assert_eq!(counts.code_lines, 5);
        assert_eq!(counts.comment_lines, 3);
    }

    /// Under `SeparateMixedCategory`, mixed lines land in their own bucket
    /// and contribute to neither code nor comments.
    #[test]
    fn effective_counts_can_separate_mixed() {
        let raw = RawLineCounts {
            mixed_code_single_comment_lines: 2,
            mixed_code_multi_comment_lines: 1,
            ..RawLineCounts::default()
        };
        let counts =
            compute_effective_counts(&raw, MixedLinePolicy::SeparateMixedCategory, true, true);
        assert_eq!(counts.mixed_lines_separate, 3);
        assert_eq!(counts.code_lines, 0);
        assert_eq!(counts.comment_lines, 0);
    }

    /// Bytes that are not valid UTF-8 fall back to windows-1252 decoding and
    /// produce a warning.
    #[test]
    fn windows_1252_fallback_decodes() {
        // 0x96 is invalid as UTF-8 here; in windows-1252 it is the en dash.
        let bytes = vec![0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x96, 0x57];
        let (text, encoding, warnings) = decode_bytes(&bytes).unwrap();
        assert_eq!(encoding, "windows-1252");
        // Written as an escape so the expectation survives re-encoding of
        // this source file.
        assert!(text.contains('\u{2013}'));
        assert!(!warnings.is_empty());
    }
}