Skip to main content

sloc_core/
lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright (C) 2026 Nima Shafie <nimzshafie@gmail.com>
3#![allow(clippy::multiple_crate_versions)]
4
5pub mod delta;
6pub mod history;
7pub use delta::{compute_delta, FileChangeStatus, FileDelta, ScanComparison, SummaryDelta};
8pub use history::{RegistryEntry, ScanRegistry, ScanSummarySnapshot};
9
10use std::collections::{BTreeMap, BTreeSet, HashSet};
11use std::fs;
12use std::path::{Path, PathBuf};
13
14use anyhow::{Context, Result};
15use chrono::{DateTime, Utc};
16use encoding_rs::{UTF_16BE, UTF_16LE, WINDOWS_1252};
17use globset::{Glob, GlobSet, GlobSetBuilder};
18use ignore::WalkBuilder;
19use serde::{Deserialize, Serialize};
20use uuid::Uuid;
21
22use sloc_config::{
23    AppConfig, BinaryFileBehavior, BlankInBlockCommentPolicy, ContinuationLinePolicy,
24    FailureBehavior, MixedLinePolicy,
25};
26use sloc_languages::{
27    analyze_text, detect_language, supported_languages, AnalysisOptions, Language, ParseMode,
28    RawLineCounts,
29};
30
31/// Three-way outcome for metadata-level policy checks.
///
/// Returned by `check_metadata_policy` and matched on in `analyze_candidate_file`.
32enum MetadataPolicyOutcome {
33    /// Skip this file — include the record in output.
34    Skip(Box<FileRecord>),
35    /// Exclude this file entirely — no record in output (include-glob miss).
36    Exclude,
37    /// Continue to content checks.
38    Continue,
39}
40
/// Final disposition of a candidate file.
///
/// `push_record` routes the two `Analyzed*` variants to the analyzed list and every
/// other variant to the skipped list. Serialized in `snake_case`.
41#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
42#[serde(rename_all = "snake_case")]
43pub enum FileStatus {
    /// Analyzed with `ParseMode::Lexical` or `ParseMode::TreeSitter`.
44    AnalyzedExact,
    /// Analyzed with `ParseMode::LexicalBestEffort`.
45    AnalyzedBestEffort,
    /// Content was detected as binary and the configured behavior is to skip.
46    SkippedBinary,
    /// Bytes could not be decoded and the configured behavior is warn-and-skip.
47    SkippedDecodeError,
    /// No language could be detected for the file.
48    SkippedUnsupported,
    /// Rejected by a policy check (symlink, excluded directory, size limit, exclude glob,
    /// lockfile, vendor/generated/minified detection, or a disabled language).
49    SkippedByPolicy,
    /// File metadata or contents could not be read.
50    ErrorInternal,
51}
52
/// Line counts after the configured counting policies have been applied to the raw
/// categories (see `compute_effective_counts`).
53#[derive(Debug, Clone, Serialize, Deserialize, Default)]
54pub struct EffectiveCounts {
    /// Code-only lines, plus docstring lines when docstrings are not treated as comments,
    /// plus mixed lines under the `CodeOnly`/`CodeAndComment` policies; compiler-directive
    /// lines are subtracted when they are configured not to count.
55    pub code_lines: u64,
    /// Single- and multi-line comment-only lines, plus docstring lines when treated as
    /// comments, plus mixed lines under the `CodeAndComment`/`CommentOnly` policies.
56    pub comment_lines: u64,
    /// Lines that are blank only.
57    pub blank_lines: u64,
    /// Mixed code+comment lines; non-zero only under `SeparateMixedCategory`.
58    pub mixed_lines_separate: u64,
59}
60
/// Identity of the tool invocation that produced an `AnalysisRun`.
61#[derive(Debug, Clone, Serialize, Deserialize)]
62pub struct ToolMetadata {
    /// Tool name; `assemble_run` sets this to `"sloc"`.
63    pub name: String,
    /// Crate version taken from `CARGO_PKG_VERSION` at compile time.
64    pub version: String,
    /// `<UTC timestamp as %Y%m%d-%H%M>-<random v4 UUID in simple form>`.
65    pub run_id: String,
    /// UTC time captured when the run was assembled.
66    pub timestamp_utc: DateTime<Utc>,
67}
68
/// Description of the machine and user that initiated the run.
69#[derive(Debug, Clone, Serialize, Deserialize)]
70pub struct EnvironmentMetadata {
    /// Value of `std::env::consts::OS`.
71    pub operating_system: String,
    /// Value of `std::env::consts::ARCH`.
72    pub architecture: String,
    /// Caller-supplied `runtime_mode` argument of `analyze`, stored verbatim.
73    pub runtime_mode: String,
    /// From the `USERNAME` or `USER` environment variable, else `"unknown"`.
74    pub initiator_username: String,
    /// From `COMPUTERNAME`, `HOSTNAME`, or `/etc/hostname`, else `"unknown"`.
75    pub initiator_hostname: String,
76}
77
/// Run-wide totals computed by `build_summary`.
///
/// All line and symbol counts are summed over analyzed files only; skipped files
/// contribute only to the file counters.
78#[derive(Debug, Clone, Serialize, Deserialize, Default)]
79pub struct SummaryTotals {
    /// Analyzed plus skipped files.
80    pub files_considered: u64,
    /// Number of analyzed file records.
81    pub files_analyzed: u64,
    /// Number of skipped file records.
82    pub files_skipped: u64,
    /// Sum of each analyzed file's raw `total_physical_lines`.
83    pub total_physical_lines: u64,
    /// Sum of effective code lines.
84    pub code_lines: u64,
    /// Sum of effective comment lines.
85    pub comment_lines: u64,
    /// Sum of effective blank lines.
86    pub blank_lines: u64,
    /// Sum of effective separately-counted mixed lines.
87    pub mixed_lines_separate: u64,
    // The four counters below use `serde(default)`, so input that lacks them
    // deserializes with zero.
    /// Sum of raw `functions` counts.
88    #[serde(default)]
89    pub functions: u64,
    /// Sum of raw `classes` counts.
90    #[serde(default)]
91    pub classes: u64,
    /// Sum of raw `variables` counts.
92    #[serde(default)]
93    pub variables: u64,
    /// Sum of raw `imports` counts.
94    #[serde(default)]
95    pub imports: u64,
96}
97
/// Totals for one language, accumulated over analyzed files by `build_language_summaries`.
98#[derive(Debug, Clone, Serialize, Deserialize)]
99pub struct LanguageSummary {
    /// Language these totals belong to.
100    pub language: Language,
    /// Number of analyzed files detected as this language.
101    pub files: u64,
    /// Sum of raw `total_physical_lines`.
102    pub total_physical_lines: u64,
    /// Sum of effective code lines.
103    pub code_lines: u64,
    /// Sum of effective comment lines.
104    pub comment_lines: u64,
    /// Sum of effective blank lines.
105    pub blank_lines: u64,
    /// Sum of effective separately-counted mixed lines.
106    pub mixed_lines_separate: u64,
    // The four counters below use `serde(default)`, so input that lacks them
    // deserializes with zero.
    /// Sum of raw `functions` counts.
107    #[serde(default)]
108    pub functions: u64,
    /// Sum of raw `classes` counts.
109    #[serde(default)]
110    pub classes: u64,
    /// Sum of raw `variables` counts.
111    #[serde(default)]
112    pub variables: u64,
    /// Sum of raw `imports` counts.
113    #[serde(default)]
114    pub imports: u64,
115}
116
/// Per-file result, used for both analyzed and skipped files.
///
/// Skipped records (see `skipped_record`) carry no language, encoding or parse mode,
/// default (zero) counts, and `false` for all three detection flags.
117#[derive(Debug, Clone, Serialize, Deserialize)]
118pub struct FileRecord {
    /// Path as visited, rendered lossily with `\` replaced by `/`.
119    pub path: String,
    /// `path` with the scan root prefix stripped when possible, `/`-separated.
120    pub relative_path: String,
    /// Detected language; `None` on skipped records.
121    pub language: Option<Language>,
    /// File length taken from filesystem metadata (0 when metadata could not be read).
122    pub size_bytes: u64,
    /// Encoding label reported by decoding; `None` on skipped records.
123    pub detected_encoding: Option<String>,
    /// Raw per-category counts returned by the language analyzer.
124    pub raw_line_categories: RawLineCounts,
    /// Counts after counting policies are applied.
125    pub effective_counts: EffectiveCounts,
    /// Final disposition of the file.
126    pub status: FileStatus,
    /// Decode warnings followed by analyzer warnings, or the skip reason.
127    pub warnings: Vec<String>,
    /// Result of generated-file detection as recorded at analysis time.
128    pub generated: bool,
    /// Result of minified-file detection as recorded at analysis time.
129    pub minified: bool,
    /// Whether the path looked like a vendor path.
130    pub vendor: bool,
    /// Parse mode reported by the analyzer; `None` on skipped records.
131    pub parse_mode: Option<ParseMode>,
    /// Name of the containing git submodule, assigned by `process_submodules`;
    /// omitted from serialized output when `None`.
132    #[serde(skip_serializing_if = "Option::is_none")]
133    pub submodule: Option<String>,
134}
135
136/// Per-submodule aggregated stats produced when `submodule_breakdown` is enabled.
///
/// Instances are built by `build_submodule_summaries`, which is outside this view; the
/// field notes below are inferred from names and types — NOTE(review): confirm against it.
137#[derive(Debug, Clone, Serialize, Deserialize)]
138pub struct SubmoduleSummary {
    /// Submodule name (presumably the name from `.gitmodules`).
139    pub name: String,
    /// Submodule path (presumably relative to the scan root).
140    pub relative_path: String,
    /// Number of analyzed files attributed to the submodule.
141    pub files_analyzed: u64,
    /// Physical line total for those files.
142    pub total_physical_lines: u64,
    /// Code line total for those files.
143    pub code_lines: u64,
    /// Comment line total for those files.
144    pub comment_lines: u64,
    /// Blank line total for those files.
145    pub blank_lines: u64,
    /// Per-language breakdown within the submodule.
146    pub language_summaries: Vec<LanguageSummary>,
147}
148
/// Complete result of one `analyze` invocation: metadata, configuration, totals,
/// per-file records and optional git context.
149#[derive(Debug, Clone, Serialize, Deserialize)]
150pub struct AnalysisRun {
    /// Tool name, version, run id and timestamp.
151    pub tool: ToolMetadata,
    /// Host and user information captured when the run was assembled.
152    pub environment: EnvironmentMetadata,
    /// Clone of the configuration the run was executed with.
153    pub effective_configuration: AppConfig,
    /// Configured root paths, rendered with `/` separators.
154    pub input_roots: Vec<String>,
    /// Run-wide totals.
155    pub summary_totals: SummaryTotals,
    /// Per-language totals over analyzed files.
156    pub totals_by_language: Vec<LanguageSummary>,
    /// Records for analyzed files, sorted by relative path.
157    pub per_file_records: Vec<FileRecord>,
    /// Records for skipped files, sorted by relative path.
158    pub skipped_file_records: Vec<FileRecord>,
    /// Discovery warnings plus every per-file warning prefixed with its relative path.
159    pub warnings: Vec<String>,
160    /// Non-empty only when `discovery.submodule_breakdown` is enabled.
161    #[serde(default, skip_serializing_if = "Vec::is_empty")]
162    pub submodule_summaries: Vec<SubmoduleSummary>,
163    /// Short git commit SHA (7 chars) at scan time, if the project is a git repo.
164    #[serde(default, skip_serializing_if = "Option::is_none")]
165    pub git_commit_short: Option<String>,
166    /// Full git commit SHA at scan time, if the project is a git repo.
167    #[serde(default, skip_serializing_if = "Option::is_none")]
168    pub git_commit_long: Option<String>,
169    /// Git branch active at scan time, if the project is a git repo.
170    #[serde(default, skip_serializing_if = "Option::is_none")]
171    pub git_branch: Option<String>,
172    /// Author of the last git commit at scan time.
173    #[serde(default, skip_serializing_if = "Option::is_none")]
174    pub git_commit_author: Option<String>,
175    /// Comma-separated git tags pointing at HEAD at scan time.
176    #[serde(default, skip_serializing_if = "Option::is_none")]
177    pub git_tags: Option<String>,
178    /// ISO 8601 author-date of the last git commit at scan time.
179    #[serde(default, skip_serializing_if = "Option::is_none")]
180    pub git_commit_date: Option<String>,
181}
182
/// Run `git <args>` with `dir` as the working directory and return its trimmed stdout.
///
/// Yields `None` when the process cannot be started, exits unsuccessfully, prints
/// non-UTF-8 output, or prints nothing but whitespace.
fn run_git_in(dir: &Path, args: &[&str]) -> Option<String> {
    let output = std::process::Command::new("git")
        .args(args)
        .current_dir(dir)
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }

    let stdout = String::from_utf8(output.stdout).ok()?;
    let trimmed = stdout.trim();
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}
194
/// Git context gathered by `detect_git_for_run`; each field is `None` when the
/// corresponding git command fails or prints nothing.
195#[derive(Default)]
196struct GitInfo {
    /// Output of `git rev-parse --short HEAD`.
197    commit_short: Option<String>,
    /// Output of `git rev-parse HEAD`.
198    commit_long: Option<String>,
    /// Output of `git branch --show-current`.
199    branch: Option<String>,
    /// Output of `git log --format=%an -1`.
200    author: Option<String>,
    /// Lines of `git tag --points-at HEAD`, joined with `", "`.
201    tags: Option<String>,
    /// Output of `git log --format=%aI -1`.
202    commit_date: Option<String>,
203}
204
205fn detect_git_for_run(project_path: &Path) -> GitInfo {
206    GitInfo {
207        commit_short: run_git_in(project_path, &["rev-parse", "--short", "HEAD"]),
208        commit_long: run_git_in(project_path, &["rev-parse", "HEAD"]),
209        branch: run_git_in(project_path, &["branch", "--show-current"]),
210        author: run_git_in(project_path, &["log", "--format=%an", "-1"]),
211        tags: run_git_in(project_path, &["tag", "--points-at", "HEAD"]).map(|t| {
212            t.lines()
213                .filter(|l| !l.is_empty())
214                .collect::<Vec<_>>()
215                .join(", ")
216        }),
217        commit_date: run_git_in(project_path, &["log", "--format=%aI", "-1"]),
218    }
219}
220
/// Name of the invoking user: the first of the `USERNAME` and `USER` environment
/// variables that can be read, or `"unknown"` when neither can.
fn get_current_username() -> String {
    ["USERNAME", "USER"]
        .iter()
        .find_map(|key| std::env::var(key).ok())
        .unwrap_or_else(|| "unknown".to_string())
}
226
/// Host name of the machine: `COMPUTERNAME`, then `HOSTNAME`, then the trimmed contents
/// of `/etc/hostname`, falling back to `"unknown"` when none of them can be read.
fn get_hostname() -> String {
    if let Ok(name) = std::env::var("COMPUTERNAME") {
        return name;
    }
    if let Ok(name) = std::env::var("HOSTNAME") {
        return name;
    }
    match std::fs::read_to_string("/etc/hostname") {
        Ok(contents) => contents.trim().to_string(),
        Err(_) => "unknown".to_string(),
    }
}
233
234/// Walk a single directory root and collect file records into the output vectors.
235#[allow(clippy::too_many_arguments)]
236fn walk_root(
237    root: &Path,
238    config: &AppConfig,
239    include_globs: Option<&GlobSet>,
240    exclude_globs: Option<&GlobSet>,
241    enabled_languages: Option<&BTreeSet<Language>>,
242    seen_paths: &mut HashSet<PathBuf>,
243    analyzed: &mut Vec<FileRecord>,
244    skipped: &mut Vec<FileRecord>,
245    warnings: &mut Vec<String>,
246) -> Result<()> {
247    let mut builder = WalkBuilder::new(root);
248    builder
249        .follow_links(config.discovery.follow_symlinks)
250        .hidden(config.discovery.ignore_hidden_files)
251        .ignore(config.discovery.honor_ignore_files)
252        .parents(config.discovery.honor_ignore_files)
253        .git_ignore(config.discovery.honor_ignore_files)
254        .git_global(config.discovery.honor_ignore_files)
255        .git_exclude(config.discovery.honor_ignore_files);
256
257    for entry in builder.build() {
258        let entry = match entry {
259            Ok(entry) => entry,
260            Err(err) => {
261                warnings.push(format!("discovery warning: {err}"));
262                continue;
263            }
264        };
265
266        let path = entry.into_path();
267        if path.is_dir() || !seen_paths.insert(path.clone()) {
268            continue;
269        }
270
271        if let Some(record) = analyze_candidate_file(
272            &path,
273            root,
274            config,
275            include_globs,
276            exclude_globs,
277            enabled_languages,
278        )? {
279            push_record(record, analyzed, skipped, warnings);
280        }
281    }
282
283    Ok(())
284}
285
286/// Label each analyzed file with its submodule and build per-submodule summaries.
287fn process_submodules(config: &AppConfig, analyzed: &mut [FileRecord]) -> Vec<SubmoduleSummary> {
288    let root = config.discovery.root_paths[0]
289        .canonicalize()
290        .unwrap_or_else(|_| config.discovery.root_paths[0].clone());
291    let submodules = detect_submodules(&root);
292    if submodules.is_empty() {
293        return Vec::new();
294    }
295
296    for file in analyzed.iter_mut() {
297        for (name, sub_path) in &submodules {
298            let prefix = sub_path.to_string_lossy().replace('\\', "/");
299            let rel = &file.relative_path;
300            if rel == &prefix || rel.starts_with(&format!("{prefix}/")) {
301                file.submodule = Some(name.clone());
302                break;
303            }
304        }
305    }
306
307    build_submodule_summaries(analyzed, &submodules)
308}
309
310/// Assemble the final `AnalysisRun` from collected records and metadata.
311fn assemble_run(
312    config: &AppConfig,
313    runtime_mode: &str,
314    analyzed: Vec<FileRecord>,
315    skipped: Vec<FileRecord>,
316    warnings: Vec<String>,
317    submodule_summaries: Vec<SubmoduleSummary>,
318) -> AnalysisRun {
319    let summary = build_summary(&analyzed, &skipped);
320    let language_summaries = build_language_summaries(&analyzed);
321
322    let first_root = config
323        .discovery
324        .root_paths
325        .first()
326        .map(|p| p.canonicalize().unwrap_or_else(|_| p.clone()));
327    let git = first_root
328        .as_deref()
329        .map(detect_git_for_run)
330        .unwrap_or_default();
331
332    let now = Utc::now();
333    let run_id = {
334        let uuid_suffix = Uuid::new_v4().simple().to_string();
335        format!("{}-{}", now.format("%Y%m%d-%H%M"), uuid_suffix)
336    };
337
338    AnalysisRun {
339        tool: ToolMetadata {
340            name: "sloc".into(),
341            version: env!("CARGO_PKG_VERSION").into(),
342            run_id,
343            timestamp_utc: now,
344        },
345        environment: EnvironmentMetadata {
346            operating_system: std::env::consts::OS.into(),
347            architecture: std::env::consts::ARCH.into(),
348            runtime_mode: runtime_mode.into(),
349            initiator_username: get_current_username(),
350            initiator_hostname: get_hostname(),
351        },
352        effective_configuration: config.clone(),
353        input_roots: config
354            .discovery
355            .root_paths
356            .iter()
357            .map(|p| path_to_string(p))
358            .collect(),
359        summary_totals: summary,
360        totals_by_language: language_summaries,
361        per_file_records: analyzed,
362        skipped_file_records: skipped,
363        warnings,
364        submodule_summaries,
365        git_commit_short: git.commit_short,
366        git_commit_long: git.commit_long,
367        git_branch: git.branch,
368        git_commit_author: git.author,
369        git_tags: git.tags,
370        git_commit_date: git.commit_date,
371    }
372}
373
374/// # Errors
375///
376/// Returns an error if the config is invalid, root paths cannot be walked, or any file
377/// analysis step fails in a way that cannot be recovered from.
378#[allow(clippy::too_many_lines)]
379pub fn analyze(config: &AppConfig, runtime_mode: &str) -> Result<AnalysisRun> {
380    config.validate()?;
381
382    if config.discovery.root_paths.is_empty() {
383        anyhow::bail!("no input paths were provided");
384    }
385
386    let include_globs = compile_globset(&config.discovery.include_globs)?;
387    let exclude_globs = compile_globset(&config.discovery.exclude_globs)?;
388    let enabled_languages = parse_enabled_languages(&config.analysis.enabled_languages)?;
389
390    let mut analyzed = Vec::new();
391    let mut skipped = Vec::new();
392    let mut warnings = Vec::new();
393    let mut seen_paths = HashSet::new();
394
395    for root in &config.discovery.root_paths {
396        let root = root.canonicalize().unwrap_or_else(|_| root.clone());
397
398        if root.is_file() {
399            if let Some(record) = analyze_candidate_file(
400                &root,
401                root.parent().unwrap_or_else(|| Path::new(".")),
402                config,
403                include_globs.as_ref(),
404                exclude_globs.as_ref(),
405                enabled_languages.as_ref(),
406            )? {
407                push_record(record, &mut analyzed, &mut skipped, &mut warnings);
408            }
409            continue;
410        }
411
412        walk_root(
413            &root,
414            config,
415            include_globs.as_ref(),
416            exclude_globs.as_ref(),
417            enabled_languages.as_ref(),
418            &mut seen_paths,
419            &mut analyzed,
420            &mut skipped,
421            &mut warnings,
422        )?;
423    }
424
425    analyzed.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
426    skipped.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
427
428    // Submodule detection: label each file with its submodule and build per-submodule summaries.
429    let submodule_summaries = if config.discovery.submodule_breakdown {
430        process_submodules(config, &mut analyzed)
431    } else {
432        Vec::new()
433    };
434
435    Ok(assemble_run(
436        config,
437        runtime_mode,
438        analyzed,
439        skipped,
440        warnings,
441        submodule_summaries,
442    ))
443}
444
445fn push_record(
446    record: FileRecord,
447    analyzed: &mut Vec<FileRecord>,
448    skipped: &mut Vec<FileRecord>,
449    warnings: &mut Vec<String>,
450) {
451    warnings.extend(
452        record
453            .warnings
454            .iter()
455            .map(|warning| format!("{}: {warning}", record.relative_path)),
456    );
457
458    match record.status {
459        FileStatus::AnalyzedExact | FileStatus::AnalyzedBestEffort => analyzed.push(record),
460        _ => skipped.push(record),
461    }
462}
463
464/// Apply metadata-level policy checks (symlink, name, dir exclusion, size, globs, lockfile).
465/// Returns `Skip(record)` to skip, `Exclude` to omit from output entirely (include-glob miss),
466/// or `Continue` to proceed to content checks.
467#[allow(clippy::too_many_arguments)]
468fn check_metadata_policy(
469    path: &Path,
470    root: &Path,
471    relative_path: &str,
472    metadata: &fs::Metadata,
473    config: &AppConfig,
474    include_globs: Option<&GlobSet>,
475    exclude_globs: Option<&GlobSet>,
476) -> MetadataPolicyOutcome {
477    if metadata.file_type().is_symlink() && !config.discovery.follow_symlinks {
478        return MetadataPolicyOutcome::Skip(Box::new(skipped_record(
479            path,
480            root,
481            metadata.len(),
482            FileStatus::SkippedByPolicy,
483            vec!["symlink skipped by policy".into()],
484        )));
485    }
486
487    if file_name_eq(path, ".gitignore") {
488        return MetadataPolicyOutcome::Skip(Box::new(skipped_record(
489            path,
490            root,
491            metadata.len(),
492            FileStatus::SkippedByPolicy,
493            vec![".gitignore is always excluded".into()],
494        )));
495    }
496
497    if is_excluded_dir_path(path, &config.discovery.excluded_directories) {
498        return MetadataPolicyOutcome::Skip(Box::new(skipped_record(
499            path,
500            root,
501            metadata.len(),
502            FileStatus::SkippedByPolicy,
503            vec!["path matched excluded directory setting".into()],
504        )));
505    }
506
507    if metadata.len() > config.discovery.max_file_size_bytes {
508        return MetadataPolicyOutcome::Skip(Box::new(skipped_record(
509            path,
510            root,
511            metadata.len(),
512            FileStatus::SkippedByPolicy,
513            vec![format!(
514                "file exceeded max_file_size_bytes ({})",
515                config.discovery.max_file_size_bytes
516            )],
517        )));
518    }
519
520    if let Some(globs) = include_globs {
521        if !globs.is_match(Path::new(relative_path)) && !globs.is_match(path) {
522            return MetadataPolicyOutcome::Exclude;
523        }
524    }
525
526    if let Some(globs) = exclude_globs {
527        if globs.is_match(Path::new(relative_path)) || globs.is_match(path) {
528            return MetadataPolicyOutcome::Skip(Box::new(skipped_record(
529                path,
530                root,
531                metadata.len(),
532                FileStatus::SkippedByPolicy,
533                vec!["path matched exclude glob".into()],
534            )));
535        }
536    }
537
538    if is_known_lockfile(path) && !config.analysis.include_lockfiles {
539        return MetadataPolicyOutcome::Skip(Box::new(skipped_record(
540            path,
541            root,
542            metadata.len(),
543            FileStatus::SkippedByPolicy,
544            vec!["lockfile skipped by default policy".into()],
545        )));
546    }
547
548    MetadataPolicyOutcome::Continue
549}
550
551/// Apply content-level policy checks (vendor, generated, minified, binary).
552/// Returns `(vendor, generated, minified, skip_record)` where `skip_record` is `Some` when
553/// the file should be skipped.
554fn check_content_policy(
555    path: &Path,
556    root: &Path,
557    size_bytes: u64,
558    bytes: &[u8],
559    config: &AppConfig,
560) -> (bool, bool, bool, Option<FileRecord>) {
561    let vendor = is_vendor_path(path);
562    if vendor && config.analysis.vendor_directory_detection {
563        return (
564            vendor,
565            false,
566            false,
567            Some(skipped_record(
568                path,
569                root,
570                size_bytes,
571                FileStatus::SkippedByPolicy,
572                vec!["vendor file skipped by policy".into()],
573            )),
574        );
575    }
576
577    let generated = config.analysis.generated_file_detection && looks_generated(path, bytes);
578    if generated {
579        return (
580            vendor,
581            generated,
582            false,
583            Some(skipped_record(
584                path,
585                root,
586                size_bytes,
587                FileStatus::SkippedByPolicy,
588                vec!["generated file skipped by policy".into()],
589            )),
590        );
591    }
592
593    let minified = config.analysis.minified_file_detection && looks_minified(path, bytes);
594    if minified {
595        return (
596            vendor,
597            generated,
598            minified,
599            Some(skipped_record(
600                path,
601                root,
602                size_bytes,
603                FileStatus::SkippedByPolicy,
604                vec!["minified file skipped by policy".into()],
605            )),
606        );
607    }
608
609    (vendor, generated, minified, None)
610}
611
612/// Decode file bytes to a UTF-8 string, handling binary detection and decode failures.
613fn decode_file_contents(
614    path: &Path,
615    root: &Path,
616    size_bytes: u64,
617    bytes: &[u8],
618    config: &AppConfig,
619) -> Result<Option<(String, String, Vec<String>)>> {
620    if is_binary(bytes) {
621        return match config.analysis.binary_file_behavior {
622            BinaryFileBehavior::Skip => Ok(None),
623            BinaryFileBehavior::Fail => {
624                anyhow::bail!("binary file encountered: {}", path.display())
625            }
626        };
627    }
628
629    match decode_bytes(bytes) {
630        Ok(result) => Ok(Some(result)),
631        Err(err) => match config.analysis.decode_failure_behavior {
632            FailureBehavior::WarnSkip => {
633                // Caller will handle the None as a SkippedDecodeError record.
634                // We use a sentinel: return Ok(None) but encode the error into a field.
635                // Instead, propagate as a skipped record via the caller.
636                let _ = (path, root, size_bytes); // suppress unused warnings
637                Err(anyhow::anyhow!("__decode_warn__: {err}"))
638            }
639            FailureBehavior::Fail => {
640                anyhow::bail!("decode failure for {}: {err}", path.display())
641            }
642        },
643    }
644}
645
646#[allow(clippy::too_many_lines)]
647fn analyze_candidate_file(
648    path: &Path,
649    root: &Path,
650    config: &AppConfig,
651    include_globs: Option<&GlobSet>,
652    exclude_globs: Option<&GlobSet>,
653    enabled_languages: Option<&BTreeSet<Language>>,
654) -> Result<Option<FileRecord>> {
655    let metadata = match fs::symlink_metadata(path) {
656        Ok(metadata) => metadata,
657        Err(err) => {
658            return Ok(Some(skipped_record(
659                path,
660                root,
661                0,
662                FileStatus::ErrorInternal,
663                vec![format!("failed to read metadata: {err}")],
664            )));
665        }
666    };
667
668    let relative_path = relative_path_string(path, root);
669
670    // Metadata-level policy checks.
671    match check_metadata_policy(
672        path,
673        root,
674        &relative_path,
675        &metadata,
676        config,
677        include_globs,
678        exclude_globs,
679    ) {
680        MetadataPolicyOutcome::Skip(record) => return Ok(Some(*record)),
681        MetadataPolicyOutcome::Exclude => return Ok(None),
682        MetadataPolicyOutcome::Continue => {}
683    }
684
685    let bytes = match fs::read(path) {
686        Ok(bytes) => bytes,
687        Err(err) => {
688            return Ok(Some(skipped_record(
689                path,
690                root,
691                metadata.len(),
692                FileStatus::ErrorInternal,
693                vec![format!("failed to read file: {err}")],
694            )));
695        }
696    };
697
698    // Content-level policy checks (vendor, generated, minified).
699    let (vendor, generated, minified, skip_record) =
700        check_content_policy(path, root, metadata.len(), &bytes, config);
701    if let Some(record) = skip_record {
702        return Ok(Some(record));
703    }
704
705    // Decode content, handling binary and decode failures.
706    let (text, encoding, decode_warnings) =
707        match decode_file_contents(path, root, metadata.len(), &bytes, config) {
708            Ok(Some(result)) => result,
709            Ok(None) => {
710                return Ok(Some(skipped_record(
711                    path,
712                    root,
713                    metadata.len(),
714                    FileStatus::SkippedBinary,
715                    vec!["binary file skipped by default".into()],
716                )));
717            }
718            Err(err) => {
719                let msg = err.to_string();
720                if let Some(warn_msg) = msg.strip_prefix("__decode_warn__: ") {
721                    return Ok(Some(skipped_record(
722                        path,
723                        root,
724                        metadata.len(),
725                        FileStatus::SkippedDecodeError,
726                        vec![warn_msg.to_string()],
727                    )));
728                }
729                return Err(err);
730            }
731        };
732
733    let first_line = text.lines().next();
734    let language = detect_language(
735        path,
736        first_line,
737        &config.analysis.extension_overrides,
738        config.analysis.shebang_detection,
739    );
740
741    let Some(language) = language else {
742        return Ok(Some(skipped_record(
743            path,
744            root,
745            metadata.len(),
746            FileStatus::SkippedUnsupported,
747            vec!["unsupported or undetected language".into()],
748        )));
749    };
750
751    if let Some(enabled) = enabled_languages {
752        if !enabled.contains(&language) {
753            return Ok(Some(skipped_record(
754                path,
755                root,
756                metadata.len(),
757                FileStatus::SkippedByPolicy,
758                vec![format!(
759                    "language {} disabled by configuration",
760                    language.display_name()
761                )],
762            )));
763        }
764    }
765
766    let ieee_opts = AnalysisOptions {
767        blank_in_block_comment_as_comment: config.analysis.blank_in_block_comment_policy
768            == BlankInBlockCommentPolicy::CountAsComment,
769        collapse_continuation_lines: config.analysis.continuation_line_policy
770            == ContinuationLinePolicy::CollapseToLogical,
771    };
772    let analysis = analyze_text(language, &text, ieee_opts);
773    let effective_counts = compute_effective_counts(
774        &analysis.raw,
775        config.analysis.mixed_line_policy,
776        config.analysis.python_docstrings_as_comments,
777        config.analysis.count_compiler_directives,
778    );
779
780    let mut warnings = decode_warnings;
781    warnings.extend(analysis.warnings.clone());
782
783    Ok(Some(FileRecord {
784        path: path_to_string(path),
785        relative_path,
786        language: Some(language),
787        size_bytes: metadata.len(),
788        detected_encoding: Some(encoding),
789        raw_line_categories: analysis.raw,
790        effective_counts,
791        status: match analysis.parse_mode {
792            ParseMode::Lexical | ParseMode::TreeSitter => FileStatus::AnalyzedExact,
793            ParseMode::LexicalBestEffort => FileStatus::AnalyzedBestEffort,
794        },
795        warnings,
796        generated,
797        minified,
798        vendor,
799        parse_mode: Some(analysis.parse_mode),
800        submodule: None,
801    }))
802}
803
804const fn compute_effective_counts(
805    raw: &RawLineCounts,
806    mixed_line_policy: MixedLinePolicy,
807    python_docstrings_as_comments: bool,
808    count_compiler_directives: bool,
809) -> EffectiveCounts {
810    let mut effective = EffectiveCounts {
811        code_lines: raw.code_only_lines,
812        comment_lines: raw.single_comment_only_lines + raw.multi_comment_only_lines,
813        blank_lines: raw.blank_only_lines,
814        mixed_lines_separate: 0,
815    };
816
817    if python_docstrings_as_comments {
818        effective.comment_lines += raw.docstring_comment_lines;
819    } else {
820        effective.code_lines += raw.docstring_comment_lines;
821    }
822
823    let mixed_total = raw.mixed_code_single_comment_lines + raw.mixed_code_multi_comment_lines;
824    match mixed_line_policy {
825        MixedLinePolicy::CodeOnly => effective.code_lines += mixed_total,
826        MixedLinePolicy::CodeAndComment => {
827            effective.code_lines += mixed_total;
828            effective.comment_lines += mixed_total;
829        }
830        MixedLinePolicy::CommentOnly => effective.comment_lines += mixed_total,
831        MixedLinePolicy::SeparateMixedCategory => effective.mixed_lines_separate += mixed_total,
832    }
833
834    // IEEE 1045-1992 ยง4.2: optionally exclude preprocessor/compiler directives from code SLOC.
835    // compiler_directive_lines is a subset of code_only_lines, so subtract it directly.
836    if !count_compiler_directives {
837        effective.code_lines = effective
838            .code_lines
839            .saturating_sub(raw.compiler_directive_lines);
840    }
841
842    effective
843}
844
845fn build_summary(analyzed: &[FileRecord], skipped: &[FileRecord]) -> SummaryTotals {
846    let mut summary = SummaryTotals {
847        files_considered: (analyzed.len() + skipped.len()) as u64,
848        files_analyzed: analyzed.len() as u64,
849        files_skipped: skipped.len() as u64,
850        ..Default::default()
851    };
852
853    for record in analyzed {
854        summary.total_physical_lines += record.raw_line_categories.total_physical_lines;
855        summary.code_lines += record.effective_counts.code_lines;
856        summary.comment_lines += record.effective_counts.comment_lines;
857        summary.blank_lines += record.effective_counts.blank_lines;
858        summary.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
859        summary.functions += record.raw_line_categories.functions;
860        summary.classes += record.raw_line_categories.classes;
861        summary.variables += record.raw_line_categories.variables;
862        summary.imports += record.raw_line_categories.imports;
863    }
864
865    summary
866}
867
868fn build_language_summaries(analyzed: &[FileRecord]) -> Vec<LanguageSummary> {
869    let mut by_language: BTreeMap<Language, LanguageSummary> = BTreeMap::new();
870    for record in analyzed {
871        let Some(language) = record.language else {
872            continue;
873        };
874        let entry = by_language.entry(language).or_insert(LanguageSummary {
875            language,
876            files: 0,
877            total_physical_lines: 0,
878            code_lines: 0,
879            comment_lines: 0,
880            blank_lines: 0,
881            mixed_lines_separate: 0,
882            functions: 0,
883            classes: 0,
884            variables: 0,
885            imports: 0,
886        });
887        entry.files += 1;
888        entry.total_physical_lines += record.raw_line_categories.total_physical_lines;
889        entry.code_lines += record.effective_counts.code_lines;
890        entry.comment_lines += record.effective_counts.comment_lines;
891        entry.blank_lines += record.effective_counts.blank_lines;
892        entry.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
893        entry.functions += record.raw_line_categories.functions;
894        entry.classes += record.raw_line_categories.classes;
895        entry.variables += record.raw_line_categories.variables;
896        entry.imports += record.raw_line_categories.imports;
897    }
898
899    by_language.into_values().collect()
900}
901
902fn skipped_record(
903    path: &Path,
904    root: &Path,
905    size_bytes: u64,
906    status: FileStatus,
907    warnings: Vec<String>,
908) -> FileRecord {
909    FileRecord {
910        path: path_to_string(path),
911        relative_path: relative_path_string(path, root),
912        language: None,
913        size_bytes,
914        detected_encoding: None,
915        raw_line_categories: RawLineCounts::default(),
916        effective_counts: EffectiveCounts::default(),
917        status,
918        warnings,
919        generated: false,
920        minified: false,
921        vendor: false,
922        parse_mode: None,
923        submodule: None,
924    }
925}
926
/// Render `path` relative to `root` using forward slashes.
///
/// When `path` does not start with `root`, the full `path` is rendered
/// instead. Non-UTF-8 sequences are replaced lossily and every backslash is
/// turned into `/`.
fn relative_path_string(path: &Path, root: &Path) -> String {
    let relative = path.strip_prefix(root).unwrap_or(path);
    let rendered = relative.to_string_lossy();
    rendered.replace('\\', "/")
}
933
/// Render `path` as a string with forward slashes.
///
/// Non-UTF-8 sequences are replaced lossily and every backslash becomes `/`.
fn path_to_string(path: &Path) -> String {
    let rendered = path.to_string_lossy();
    rendered.replace('\\', "/")
}
937
/// Parse `.gitmodules` in `root` and return `(name, relative_path)` for each submodule found.
///
/// Returns an empty list when `.gitmodules` is missing, is not a regular file,
/// or cannot be read as UTF-8 text. Parsing is line based:
///
/// * `[submodule "<name>"]` opens a new entry.
/// * Any other line starting with `[` closes the current entry without opening
///   a new one, so a malformed header can neither panic nor leak its keys into
///   a neighbouring submodule.
/// * `path = <value>` (key matched exactly and case-insensitively) records the
///   path of the entry that is currently open.
///
/// Entries that never receive a `path` are dropped.
#[must_use]
pub fn detect_submodules(root: &Path) -> Vec<(String, PathBuf)> {
    let gitmodules = root.join(".gitmodules");
    if !gitmodules.is_file() {
        return Vec::new();
    }
    let Ok(content) = fs::read_to_string(&gitmodules) else {
        return Vec::new();
    };

    let mut result = Vec::new();
    let mut current_name: Option<String> = None;
    let mut current_path: Option<PathBuf> = None;

    for line in content.lines() {
        let trimmed = line.trim();

        if trimmed.starts_with('[') {
            // A section header always ends the previous section; emit it if complete.
            if let (Some(name), Some(path)) = (current_name.take(), current_path.take()) {
                result.push((name, path));
            }
            // `strip_prefix` + `strip_suffix` cannot overlap, unlike manual slicing,
            // so a truncated header such as `[submodule "]` simply yields no name.
            current_name = trimmed
                .strip_prefix("[submodule \"")
                .and_then(|rest| rest.strip_suffix("\"]"))
                .map(String::from);
        } else if let Some((key, value)) = trimmed.split_once('=') {
            // Only the exact `path` key counts, and only inside a submodule section.
            if current_name.is_some() && key.trim().eq_ignore_ascii_case("path") {
                current_path = Some(PathBuf::from(value.trim()));
            }
        }
    }

    if let (Some(name), Some(path)) = (current_name, current_path) {
        result.push((name, path));
    }

    result
}
974
975fn build_submodule_summaries(
976    analyzed: &[FileRecord],
977    submodules: &[(String, PathBuf)],
978) -> Vec<SubmoduleSummary> {
979    submodules
980        .iter()
981        .map(|(name, path)| {
982            let files: Vec<&FileRecord> = analyzed
983                .iter()
984                .filter(|f| f.submodule.as_deref() == Some(name.as_str()))
985                .collect();
986
987            let files_analyzed = files.len() as u64;
988            let total_physical_lines = files
989                .iter()
990                .map(|f| f.raw_line_categories.total_physical_lines)
991                .sum();
992            let code_lines = files.iter().map(|f| f.effective_counts.code_lines).sum();
993            let comment_lines = files.iter().map(|f| f.effective_counts.comment_lines).sum();
994            let blank_lines = files.iter().map(|f| f.effective_counts.blank_lines).sum();
995            let language_summaries = build_language_summaries_from_slice(&files);
996
997            SubmoduleSummary {
998                name: name.clone(),
999                relative_path: path.to_string_lossy().replace('\\', "/"),
1000                files_analyzed,
1001                total_physical_lines,
1002                code_lines,
1003                comment_lines,
1004                blank_lines,
1005                language_summaries,
1006            }
1007        })
1008        .filter(|s| s.files_analyzed > 0)
1009        .collect()
1010}
1011
1012fn build_language_summaries_from_slice(files: &[&FileRecord]) -> Vec<LanguageSummary> {
1013    let mut map: BTreeMap<String, LanguageSummary> = BTreeMap::new();
1014    for file in files {
1015        if let Some(lang) = file.language {
1016            let entry = map
1017                .entry(lang.display_name().to_string())
1018                .or_insert_with(|| LanguageSummary {
1019                    language: lang,
1020                    files: 0,
1021                    total_physical_lines: 0,
1022                    code_lines: 0,
1023                    comment_lines: 0,
1024                    blank_lines: 0,
1025                    mixed_lines_separate: 0,
1026                    functions: 0,
1027                    classes: 0,
1028                    variables: 0,
1029                    imports: 0,
1030                });
1031            entry.files += 1;
1032            let r = &file.raw_line_categories;
1033            entry.total_physical_lines += r.total_physical_lines;
1034            entry.code_lines += file.effective_counts.code_lines;
1035            entry.comment_lines += file.effective_counts.comment_lines;
1036            entry.blank_lines += file.effective_counts.blank_lines;
1037            entry.mixed_lines_separate += file.effective_counts.mixed_lines_separate;
1038        }
1039    }
1040    map.into_values().collect()
1041}
1042
/// Return `true` when the final component of `path` is exactly `expected`.
///
/// Paths without a final file name, or whose file name is not valid UTF-8,
/// never match.
fn file_name_eq(path: &Path, expected: &str) -> bool {
    let file_name = path.file_name().and_then(|name| name.to_str());
    file_name == Some(expected)
}
1048
/// Return `true` when any component of `path` equals one of `excluded_dirs`.
///
/// Components are compared whole (no substring or glob matching), and
/// components that are not valid UTF-8 are ignored.
fn is_excluded_dir_path(path: &Path, excluded_dirs: &[String]) -> bool {
    path.components()
        .filter_map(|component| component.as_os_str().to_str())
        .any(|part| excluded_dirs.iter().any(|excluded| excluded.as_str() == part))
}
1057
/// Return `true` when any component of `path` is a well-known vendored-code directory.
///
/// The recognised directory names are `vendor`, `node_modules` and `packages`;
/// components that are not valid UTF-8 are ignored.
fn is_vendor_path(path: &Path) -> bool {
    const VENDOR_DIR_NAMES: [&str; 3] = ["vendor", "node_modules", "packages"];

    path.components()
        .filter_map(|component| component.as_os_str().to_str())
        .any(|part| VENDOR_DIR_NAMES.contains(&part))
}
1066
/// Return `true` when the file name of `path` is one of the recognised
/// package-manager lockfiles.
///
/// The comparison is exact and case-sensitive; paths without a UTF-8 file
/// name never match.
fn is_known_lockfile(path: &Path) -> bool {
    const LOCKFILE_NAMES: [&str; 7] = [
        "Cargo.lock",
        "package-lock.json",
        "yarn.lock",
        "pnpm-lock.yaml",
        "Pipfile.lock",
        "poetry.lock",
        "composer.lock",
    ];

    let Some(name) = path.file_name().and_then(|name| name.to_str()) else {
        return false;
    };
    LOCKFILE_NAMES.contains(&name)
}
1083
/// Heuristically decide whether a file is machine generated.
///
/// A file counts as generated when its name contains `.generated.` or `.g.`,
/// or when the first 1024 bytes of `bytes` (decoded lossily and compared
/// case-insensitively for ASCII) contain `@generated` or `generated by`.
fn looks_generated(path: &Path, bytes: &[u8]) -> bool {
    const NAME_MARKERS: [&str; 2] = [".generated.", ".g."];
    const CONTENT_MARKERS: [&str; 2] = ["@generated", "generated by"];
    const SAMPLE_LIMIT: usize = 1024;

    let name_matches = path
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| NAME_MARKERS.iter().any(|marker| name.contains(*marker)));
    if name_matches {
        return true;
    }

    // Only the head of the file is inspected; markers further down are not seen.
    let head = &bytes[..bytes.len().min(SAMPLE_LIMIT)];
    let lowered = String::from_utf8_lossy(head).to_ascii_lowercase();
    CONTENT_MARKERS
        .iter()
        .any(|marker| lowered.contains(*marker))
}
1096
/// Heuristically decide whether a file is minified.
///
/// A file counts as minified when its name contains `.min.`, or when the
/// first 4096 bytes (decoded lossily) contain a line longer than 2000 bytes
/// while whitespace makes up less than 1% of that sample.
fn looks_minified(path: &Path, bytes: &[u8]) -> bool {
    let named_minified = path
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| name.contains(".min."));
    if named_minified {
        return true;
    }

    let head = String::from_utf8_lossy(&bytes[..bytes.len().min(4096)]);

    let has_very_long_line = head
        .lines()
        .map(str::len)
        .max()
        .is_some_and(|longest| longest > 2000);
    if !has_very_long_line {
        return false;
    }

    // Compare `whitespace / len < 1 / 100` without leaving integer arithmetic.
    let whitespace_chars = head.chars().filter(|c| c.is_whitespace()).count();
    whitespace_chars * 100 < head.len().max(1)
}
1111
/// Heuristically decide whether `bytes` is binary content.
///
/// Input that starts with a UTF-8, UTF-16LE or UTF-16BE byte-order mark is
/// always treated as text. Otherwise the input is binary when a NUL byte
/// occurs within its first 8192 bytes.
fn is_binary(bytes: &[u8]) -> bool {
    const TEXT_BOMS: [&[u8]; 3] = [&[0xEF, 0xBB, 0xBF], &[0xFF, 0xFE], &[0xFE, 0xFF]];
    const SAMPLE_LIMIT: usize = 8192;

    if TEXT_BOMS.iter().any(|bom| bytes.starts_with(bom)) {
        return false;
    }

    bytes.iter().take(SAMPLE_LIMIT).any(|&byte| byte == 0)
}
1123
1124fn decode_bytes(bytes: &[u8]) -> std::result::Result<(String, String, Vec<String>), String> {
1125    if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
1126        let text = String::from_utf8(bytes[3..].to_vec()).map_err(|err| err.to_string())?;
1127        return Ok((text, "utf-8-bom".into(), vec![]));
1128    }
1129
1130    if bytes.starts_with(&[0xFF, 0xFE]) {
1131        let (cow, _, had_errors) = UTF_16LE.decode(&bytes[2..]);
1132        let mut warnings = Vec::new();
1133        if had_errors {
1134            warnings.push("utf-16le decode contained replacement characters".into());
1135        }
1136        return Ok((cow.into_owned(), "utf-16le".into(), warnings));
1137    }
1138
1139    if bytes.starts_with(&[0xFE, 0xFF]) {
1140        let (cow, _, had_errors) = UTF_16BE.decode(&bytes[2..]);
1141        let mut warnings = Vec::new();
1142        if had_errors {
1143            warnings.push("utf-16be decode contained replacement characters".into());
1144        }
1145        return Ok((cow.into_owned(), "utf-16be".into(), warnings));
1146    }
1147
1148    // Multiple statements in the else branch make map_or_else awkward here.
1149    #[allow(clippy::option_if_let_else)]
1150    if let Ok(text) = String::from_utf8(bytes.to_vec()) {
1151        Ok((text, "utf-8".into(), vec![]))
1152    } else {
1153        let (cow, _, had_errors) = WINDOWS_1252.decode(bytes);
1154        let mut warnings = vec!["decoded using windows-1252 fallback".into()];
1155        if had_errors {
1156            warnings.push("fallback decode contained replacement characters".into());
1157        }
1158        Ok((cow.into_owned(), "windows-1252".into(), warnings))
1159    }
1160}
1161
1162fn compile_globset(patterns: &[String]) -> Result<Option<GlobSet>> {
1163    if patterns.is_empty() {
1164        return Ok(None);
1165    }
1166
1167    let mut builder = GlobSetBuilder::new();
1168    for pattern in patterns {
1169        builder
1170            .add(Glob::new(pattern).with_context(|| format!("invalid glob pattern: {pattern}"))?);
1171    }
1172    Ok(Some(
1173        builder.build().context("failed to compile glob filters")?,
1174    ))
1175}
1176
1177fn parse_enabled_languages(enabled: &[String]) -> Result<Option<BTreeSet<Language>>> {
1178    if enabled.is_empty() {
1179        return Ok(None);
1180    }
1181
1182    let supported = supported_languages();
1183    let mut set = BTreeSet::new();
1184    for name in enabled {
1185        let language = Language::from_name(name)
1186            .with_context(|| format!("unsupported language in config: {name}"))?;
1187        if !supported.contains(&language) {
1188            anyhow::bail!("language {name} is not supported in this build");
1189        }
1190        set.insert(language);
1191    }
1192    Ok(Some(set))
1193}
1194
1195/// # Errors
1196///
1197/// Returns an error if serialization fails or the output file cannot be written.
1198pub fn write_json(run: &AnalysisRun, output_path: &Path) -> Result<()> {
1199    let json = serde_json::to_string_pretty(run).context("failed to serialize analysis run")?;
1200    fs::write(output_path, json)
1201        .with_context(|| format!("failed to write JSON output to {}", output_path.display()))
1202}
1203
1204/// # Errors
1205///
1206/// Returns an error if the file cannot be read or the JSON cannot be parsed.
1207pub fn read_json(path: &Path) -> Result<AnalysisRun> {
1208    let contents = fs::read_to_string(path)
1209        .with_context(|| format!("failed to read result file {}", path.display()))?;
1210    serde_json::from_str(&contents)
1211        .with_context(|| format!("failed to parse JSON result {}", path.display()))
1212}
1213
#[cfg(test)]
mod tests {
    use super::*;

    /// With `CodeOnly`, mixed code/comment lines are counted as code.
    #[test]
    fn effective_counts_respect_code_only_policy() {
        let raw = RawLineCounts {
            code_only_lines: 2,
            single_comment_only_lines: 1,
            mixed_code_single_comment_lines: 3,
            docstring_comment_lines: 2,
            ..RawLineCounts::default()
        };
        let counts = compute_effective_counts(&raw, MixedLinePolicy::CodeOnly, true, true);
        assert_eq!(counts.code_lines, 5);
        assert_eq!(counts.comment_lines, 3);
    }

    /// With `SeparateMixedCategory`, mixed lines land only in their own bucket.
    #[test]
    fn effective_counts_can_separate_mixed() {
        let raw = RawLineCounts {
            mixed_code_single_comment_lines: 2,
            mixed_code_multi_comment_lines: 1,
            ..RawLineCounts::default()
        };
        let counts =
            compute_effective_counts(&raw, MixedLinePolicy::SeparateMixedCategory, true, true);
        assert_eq!(counts.mixed_lines_separate, 3);
        assert_eq!(counts.code_lines, 0);
        assert_eq!(counts.comment_lines, 0);
    }

    /// Bytes that are not valid UTF-8 fall back to Windows-1252 with a warning.
    #[test]
    fn windows_1252_fallback_decodes() {
        let bytes = vec![0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x96, 0x57];
        let (text, encoding, warnings) = decode_bytes(&bytes).unwrap();
        assert_eq!(encoding, "windows-1252");
        // 0x96 is EN DASH (U+2013) in Windows-1252; written as an escape so the
        // expectation survives any re-encoding of this source file.
        assert!(text.contains('\u{2013}'));
        assert!(!warnings.is_empty());
    }
}