// sloc_core/lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright (C) 2026 Nima Shafie <nimzshafie@gmail.com>
3
4pub mod delta;
5pub mod history;
6pub use delta::{compute_delta, FileChangeStatus, FileDelta, ScanComparison, SummaryDelta};
7pub use history::{RegistryEntry, ScanRegistry, ScanSummarySnapshot};
8
9use std::collections::{BTreeMap, BTreeSet, HashSet};
10use std::fs;
11use std::path::{Path, PathBuf};
12
13use anyhow::{Context, Result};
14use chrono::{DateTime, Utc};
15use encoding_rs::{UTF_16BE, UTF_16LE, WINDOWS_1252};
16use globset::{Glob, GlobSet, GlobSetBuilder};
17use ignore::WalkBuilder;
18use serde::{Deserialize, Serialize};
19use uuid::Uuid;
20
21use sloc_config::{
22    AppConfig, BinaryFileBehavior, BlankInBlockCommentPolicy, ContinuationLinePolicy,
23    FailureBehavior, MixedLinePolicy,
24};
25use sloc_languages::{
26    analyze_text, detect_language, supported_languages, AnalysisOptions, Language, ParseMode,
27    RawLineCounts,
28};
29
30#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
31#[serde(rename_all = "snake_case")]
32pub enum FileStatus {
33    AnalyzedExact,
34    AnalyzedBestEffort,
35    SkippedBinary,
36    SkippedDecodeError,
37    SkippedUnsupported,
38    SkippedByPolicy,
39    ErrorInternal,
40}
41
/// Line counts for one file after the configured counting policies
/// (docstring handling, mixed-line policy, compiler-directive handling)
/// have been applied to the raw line categories.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct EffectiveCounts {
    /// Lines counted as code.
    pub code_lines: u64,
    /// Lines counted as comments.
    pub comment_lines: u64,
    /// Lines counted as blank.
    pub blank_lines: u64,
    /// Mixed code+comment lines; non-zero only under the
    /// `SeparateMixedCategory` mixed-line policy.
    pub mixed_lines_separate: u64,
}
49
/// Identifies the tool and the individual run that produced a report.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolMetadata {
    /// Tool name.
    pub name: String,
    /// Tool version string.
    pub version: String,
    /// Identifier of this run.
    pub run_id: String,
    /// UTC timestamp associated with the run.
    pub timestamp_utc: DateTime<Utc>,
}
57
/// Describes the machine and user context in which a run was executed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnvironmentMetadata {
    /// Operating system name.
    pub operating_system: String,
    /// CPU architecture name.
    pub architecture: String,
    /// Caller-supplied label for how the tool was invoked.
    pub runtime_mode: String,
    /// Name of the user that started the run.
    pub initiator_username: String,
    /// Host name of the machine that executed the run.
    pub initiator_hostname: String,
}
66
/// Totals aggregated over every file of a run.
///
/// The symbol counters (`functions`, `classes`, `variables`, `imports`)
/// default to zero when missing from deserialized input.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SummaryTotals {
    /// Number of analyzed plus skipped files.
    pub files_considered: u64,
    /// Number of files that were analyzed.
    pub files_analyzed: u64,
    /// Number of files that were skipped.
    pub files_skipped: u64,
    /// Sum of physical lines over analyzed files.
    pub total_physical_lines: u64,
    /// Sum of effective code lines over analyzed files.
    pub code_lines: u64,
    /// Sum of effective comment lines over analyzed files.
    pub comment_lines: u64,
    /// Sum of effective blank lines over analyzed files.
    pub blank_lines: u64,
    /// Sum of separately counted mixed lines over analyzed files.
    pub mixed_lines_separate: u64,
    /// Sum of the per-file `functions` counter.
    #[serde(default)]
    pub functions: u64,
    /// Sum of the per-file `classes` counter.
    #[serde(default)]
    pub classes: u64,
    /// Sum of the per-file `variables` counter.
    #[serde(default)]
    pub variables: u64,
    /// Sum of the per-file `imports` counter.
    #[serde(default)]
    pub imports: u64,
}
86
/// Totals aggregated over the analyzed files of a single language.
///
/// The symbol counters default to zero when missing from deserialized input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageSummary {
    /// Language these totals belong to.
    pub language: Language,
    /// Number of analyzed files in this language.
    pub files: u64,
    /// Sum of physical lines.
    pub total_physical_lines: u64,
    /// Sum of effective code lines.
    pub code_lines: u64,
    /// Sum of effective comment lines.
    pub comment_lines: u64,
    /// Sum of effective blank lines.
    pub blank_lines: u64,
    /// Sum of separately counted mixed lines.
    pub mixed_lines_separate: u64,
    /// Sum of the per-file `functions` counter.
    #[serde(default)]
    pub functions: u64,
    /// Sum of the per-file `classes` counter.
    #[serde(default)]
    pub classes: u64,
    /// Sum of the per-file `variables` counter.
    #[serde(default)]
    pub variables: u64,
    /// Sum of the per-file `imports` counter.
    #[serde(default)]
    pub imports: u64,
}
105
/// Result for one candidate file, whether it was analyzed or skipped.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileRecord {
    /// File path with backslashes normalised to `/`.
    pub path: String,
    /// Path relative to the root the file was found under, `/`-separated.
    pub relative_path: String,
    /// Detected language; `None` for skipped files.
    pub language: Option<Language>,
    /// File size in bytes as reported by the file system metadata.
    pub size_bytes: u64,
    /// Name of the encoding used to decode the file; `None` for skipped files.
    pub detected_encoding: Option<String>,
    /// Raw per-category line counts reported by the language analyzer.
    pub raw_line_categories: RawLineCounts,
    /// Counts after the configured counting policies were applied.
    pub effective_counts: EffectiveCounts,
    /// Outcome of processing this file.
    pub status: FileStatus,
    /// Warnings raised while decoding or analyzing this file, or the reason
    /// it was skipped.
    pub warnings: Vec<String>,
    /// Whether the file was flagged as generated.
    pub generated: bool,
    /// Whether the file was flagged as minified.
    pub minified: bool,
    /// Whether the file lives under a vendor-style directory.
    pub vendor: bool,
    /// Parse mode used by the analyzer; `None` for skipped files.
    pub parse_mode: Option<ParseMode>,
    /// Name of the git submodule containing this file, if one was assigned.
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub submodule: Option<String>,
}
124
/// Per-submodule aggregated stats produced when `submodule_breakdown` is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SubmoduleSummary {
    /// Submodule name as declared in `.gitmodules`.
    pub name: String,
    /// Submodule path as declared in `.gitmodules`, `/`-separated.
    pub relative_path: String,
    /// Number of analyzed files attributed to this submodule.
    pub files_analyzed: u64,
    /// Sum of physical lines over those files.
    pub total_physical_lines: u64,
    /// Sum of effective code lines over those files.
    pub code_lines: u64,
    /// Sum of effective comment lines over those files.
    pub comment_lines: u64,
    /// Sum of effective blank lines over those files.
    pub blank_lines: u64,
    /// Per-language breakdown of the files in this submodule.
    pub language_summaries: Vec<LanguageSummary>,
}
137
/// Complete result of one analysis run: metadata, configuration, totals and
/// per-file records.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnalysisRun {
    /// Tool name, version and run identity.
    pub tool: ToolMetadata,
    /// Machine and user context of the run.
    pub environment: EnvironmentMetadata,
    /// Copy of the configuration the run was executed with.
    pub effective_configuration: AppConfig,
    /// Configured root paths, `/`-separated.
    pub input_roots: Vec<String>,
    /// Totals over all files.
    pub summary_totals: SummaryTotals,
    /// Totals grouped by language.
    pub totals_by_language: Vec<LanguageSummary>,
    /// Records of analyzed files.
    pub per_file_records: Vec<FileRecord>,
    /// Records of skipped files.
    pub skipped_file_records: Vec<FileRecord>,
    /// Run-level warnings (discovery warnings and per-file warnings).
    pub warnings: Vec<String>,
    /// Non-empty only when `discovery.submodule_breakdown` is enabled.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub submodule_summaries: Vec<SubmoduleSummary>,
    /// Short git commit SHA (7 chars) at scan time, if the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_short: Option<String>,
    /// Full git commit SHA at scan time, if the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_long: Option<String>,
    /// Git branch active at scan time, if the project is a git repo.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_branch: Option<String>,
    /// Author of the last git commit at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_author: Option<String>,
    /// Comma-separated git tags pointing at HEAD at scan time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_tags: Option<String>,
}
168
/// Runs `git` with `args` inside `dir` and returns its trimmed standard output.
///
/// Returns `None` when the process cannot be started, exits unsuccessfully,
/// prints output that is not valid UTF-8, or prints only whitespace.
fn run_git_in(dir: &Path, args: &[&str]) -> Option<String> {
    let output = std::process::Command::new("git")
        .args(args)
        .current_dir(dir)
        .output()
        .ok()?;

    if !output.status.success() {
        return None;
    }

    let stdout = String::from_utf8(output.stdout).ok()?;
    let trimmed = stdout.trim();
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}
180
/// Git details gathered for a run; every field is `None` when the
/// corresponding `git` query produced no usable output.
#[derive(Default)]
struct GitInfo {
    /// Abbreviated commit hash of `HEAD`.
    commit_short: Option<String>,
    /// Full commit hash of `HEAD`.
    commit_long: Option<String>,
    /// Name of the current branch.
    branch: Option<String>,
    /// Author name of the latest commit.
    author: Option<String>,
    /// Tags pointing at `HEAD`, joined with `", "`.
    tags: Option<String>,
}
189
190fn detect_git_for_run(project_path: &Path) -> GitInfo {
191    GitInfo {
192        commit_short: run_git_in(project_path, &["rev-parse", "--short", "HEAD"]),
193        commit_long: run_git_in(project_path, &["rev-parse", "HEAD"]),
194        branch: run_git_in(project_path, &["branch", "--show-current"]),
195        author: run_git_in(project_path, &["log", "--format=%an", "-1"]),
196        tags: run_git_in(project_path, &["tag", "--points-at", "HEAD"]).map(|t| {
197            t.lines()
198                .filter(|l| !l.is_empty())
199                .collect::<Vec<_>>()
200                .join(", ")
201        }),
202    }
203}
204
/// Returns the name of the user running the process.
///
/// Checks the `USERNAME`, `USER` and `LOGNAME` environment variables in that
/// order and returns the first value that is set and not blank. Blank values
/// are skipped so that an empty `USERNAME` does not hide a usable `USER`.
/// Falls back to `"unknown"` when none of them is usable.
fn get_current_username() -> String {
    ["USERNAME", "USER", "LOGNAME"]
        .iter()
        .filter_map(|key| std::env::var(key).ok())
        .find(|value| !value.trim().is_empty())
        .unwrap_or_else(|| "unknown".to_string())
}
210
/// Returns the host name of the machine running the process.
///
/// Sources are tried in order: the `COMPUTERNAME` environment variable, the
/// `HOSTNAME` environment variable, then the contents of `/etc/hostname`.
/// Each candidate is trimmed, and blank candidates are skipped so that an
/// empty variable or file falls through to the next source. Returns
/// `"unknown"` when no source yields a value.
fn get_hostname() -> String {
    /// Trims `value` and discards it when nothing is left.
    fn non_blank(value: String) -> Option<String> {
        let trimmed = value.trim();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed.to_string())
        }
    }

    std::env::var("COMPUTERNAME")
        .ok()
        .and_then(non_blank)
        .or_else(|| std::env::var("HOSTNAME").ok().and_then(non_blank))
        .or_else(|| {
            std::fs::read_to_string("/etc/hostname")
                .ok()
                .and_then(non_blank)
        })
        .unwrap_or_else(|| "unknown".to_string())
}
217
218pub fn analyze(config: &AppConfig, runtime_mode: &str) -> Result<AnalysisRun> {
219    config.validate()?;
220
221    if config.discovery.root_paths.is_empty() {
222        anyhow::bail!("no input paths were provided");
223    }
224
225    let include_globs = compile_globset(&config.discovery.include_globs)?;
226    let exclude_globs = compile_globset(&config.discovery.exclude_globs)?;
227    let enabled_languages = parse_enabled_languages(&config.analysis.enabled_languages)?;
228
229    let mut analyzed = Vec::new();
230    let mut skipped = Vec::new();
231    let mut warnings = Vec::new();
232    let mut seen_paths = HashSet::new();
233
234    for root in &config.discovery.root_paths {
235        let root = root.canonicalize().unwrap_or_else(|_| root.to_path_buf());
236
237        if root.is_file() {
238            if let Some(record) = analyze_candidate_file(
239                &root,
240                root.parent().unwrap_or(Path::new(".")),
241                config,
242                &include_globs,
243                &exclude_globs,
244                &enabled_languages,
245            )? {
246                push_record(record, &mut analyzed, &mut skipped, &mut warnings);
247            }
248            continue;
249        }
250
251        let mut builder = WalkBuilder::new(&root);
252        builder
253            .follow_links(config.discovery.follow_symlinks)
254            .hidden(config.discovery.ignore_hidden_files)
255            .ignore(config.discovery.honor_ignore_files)
256            .parents(config.discovery.honor_ignore_files)
257            .git_ignore(config.discovery.honor_ignore_files)
258            .git_global(config.discovery.honor_ignore_files)
259            .git_exclude(config.discovery.honor_ignore_files);
260
261        for entry in builder.build() {
262            let entry = match entry {
263                Ok(entry) => entry,
264                Err(err) => {
265                    warnings.push(format!("discovery warning: {err}"));
266                    continue;
267                }
268            };
269
270            let path = entry.into_path();
271            if path.is_dir() {
272                continue;
273            }
274            if !seen_paths.insert(path.clone()) {
275                continue;
276            }
277
278            if let Some(record) = analyze_candidate_file(
279                &path,
280                &root,
281                config,
282                &include_globs,
283                &exclude_globs,
284                &enabled_languages,
285            )? {
286                push_record(record, &mut analyzed, &mut skipped, &mut warnings);
287            }
288        }
289    }
290
291    analyzed.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
292    skipped.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
293
294    // Submodule detection: label each file with its submodule and build per-submodule summaries.
295    let submodule_summaries = if config.discovery.submodule_breakdown {
296        let root = config.discovery.root_paths[0]
297            .canonicalize()
298            .unwrap_or_else(|_| config.discovery.root_paths[0].clone());
299        let submodules = detect_submodules(&root);
300        if !submodules.is_empty() {
301            for file in &mut analyzed {
302                for (name, sub_path) in &submodules {
303                    let prefix = sub_path.to_string_lossy().replace('\\', "/");
304                    let rel = &file.relative_path;
305                    if rel == &prefix || rel.starts_with(&format!("{prefix}/")) {
306                        file.submodule = Some(name.clone());
307                        break;
308                    }
309                }
310            }
311            build_submodule_summaries(&analyzed, &submodules)
312        } else {
313            Vec::new()
314        }
315    } else {
316        Vec::new()
317    };
318
319    let summary = build_summary(&analyzed, &skipped);
320    let language_summaries = build_language_summaries(&analyzed);
321
322    // Detect git info from the first root to drive run_id and enrich the result.
323    let first_root = config
324        .discovery
325        .root_paths
326        .first()
327        .map(|p| p.canonicalize().unwrap_or_else(|_| p.to_path_buf()));
328    let git = first_root
329        .as_deref()
330        .map(detect_git_for_run)
331        .unwrap_or_default();
332
333    let now = Utc::now();
334    let run_id = {
335        let uuid_suffix = Uuid::new_v4().simple().to_string();
336        format!("{}-{}", now.format("%Y%m%d-%H%M"), uuid_suffix)
337    };
338
339    Ok(AnalysisRun {
340        tool: ToolMetadata {
341            name: "sloc".into(),
342            version: env!("CARGO_PKG_VERSION").into(),
343            run_id,
344            timestamp_utc: now,
345        },
346        environment: EnvironmentMetadata {
347            operating_system: std::env::consts::OS.into(),
348            architecture: std::env::consts::ARCH.into(),
349            runtime_mode: runtime_mode.into(),
350            initiator_username: get_current_username(),
351            initiator_hostname: get_hostname(),
352        },
353        effective_configuration: config.clone(),
354        input_roots: config
355            .discovery
356            .root_paths
357            .iter()
358            .map(|p| path_to_string(p))
359            .collect(),
360        summary_totals: summary,
361        totals_by_language: language_summaries,
362        per_file_records: analyzed,
363        skipped_file_records: skipped,
364        warnings,
365        submodule_summaries,
366        git_commit_short: git.commit_short,
367        git_commit_long: git.commit_long,
368        git_branch: git.branch,
369        git_commit_author: git.author,
370        git_tags: git.tags,
371    })
372}
373
374fn push_record(
375    record: FileRecord,
376    analyzed: &mut Vec<FileRecord>,
377    skipped: &mut Vec<FileRecord>,
378    warnings: &mut Vec<String>,
379) {
380    warnings.extend(
381        record
382            .warnings
383            .iter()
384            .map(|warning| format!("{}: {warning}", record.relative_path)),
385    );
386
387    match record.status {
388        FileStatus::AnalyzedExact | FileStatus::AnalyzedBestEffort => analyzed.push(record),
389        _ => skipped.push(record),
390    }
391}
392
393fn analyze_candidate_file(
394    path: &Path,
395    root: &Path,
396    config: &AppConfig,
397    include_globs: &Option<GlobSet>,
398    exclude_globs: &Option<GlobSet>,
399    enabled_languages: &Option<BTreeSet<Language>>,
400) -> Result<Option<FileRecord>> {
401    let metadata = match fs::symlink_metadata(path) {
402        Ok(metadata) => metadata,
403        Err(err) => {
404            return Ok(Some(skipped_record(
405                path,
406                root,
407                0,
408                FileStatus::ErrorInternal,
409                vec![format!("failed to read metadata: {err}")],
410            )));
411        }
412    };
413
414    if metadata.file_type().is_symlink() && !config.discovery.follow_symlinks {
415        return Ok(Some(skipped_record(
416            path,
417            root,
418            metadata.len(),
419            FileStatus::SkippedByPolicy,
420            vec!["symlink skipped by policy".into()],
421        )));
422    }
423
424    let relative_path = relative_path_string(path, root);
425
426    if file_name_eq(path, ".gitignore") {
427        return Ok(Some(skipped_record(
428            path,
429            root,
430            metadata.len(),
431            FileStatus::SkippedByPolicy,
432            vec![".gitignore is always excluded".into()],
433        )));
434    }
435
436    if is_excluded_dir_path(path, &config.discovery.excluded_directories) {
437        return Ok(Some(skipped_record(
438            path,
439            root,
440            metadata.len(),
441            FileStatus::SkippedByPolicy,
442            vec!["path matched excluded directory setting".into()],
443        )));
444    }
445
446    if metadata.len() > config.discovery.max_file_size_bytes {
447        return Ok(Some(skipped_record(
448            path,
449            root,
450            metadata.len(),
451            FileStatus::SkippedByPolicy,
452            vec![format!(
453                "file exceeded max_file_size_bytes ({})",
454                config.discovery.max_file_size_bytes
455            )],
456        )));
457    }
458
459    if let Some(globs) = include_globs {
460        if !globs.is_match(Path::new(&relative_path)) && !globs.is_match(path) {
461            return Ok(None);
462        }
463    }
464
465    if let Some(globs) = exclude_globs {
466        if globs.is_match(Path::new(&relative_path)) || globs.is_match(path) {
467            return Ok(Some(skipped_record(
468                path,
469                root,
470                metadata.len(),
471                FileStatus::SkippedByPolicy,
472                vec!["path matched exclude glob".into()],
473            )));
474        }
475    }
476
477    if is_known_lockfile(path) && !config.analysis.include_lockfiles {
478        return Ok(Some(skipped_record(
479            path,
480            root,
481            metadata.len(),
482            FileStatus::SkippedByPolicy,
483            vec!["lockfile skipped by default policy".into()],
484        )));
485    }
486
487    let bytes = match fs::read(path) {
488        Ok(bytes) => bytes,
489        Err(err) => {
490            return Ok(Some(skipped_record(
491                path,
492                root,
493                metadata.len(),
494                FileStatus::ErrorInternal,
495                vec![format!("failed to read file: {err}")],
496            )));
497        }
498    };
499
500    let vendor = is_vendor_path(path);
501    if vendor && config.analysis.vendor_directory_detection {
502        return Ok(Some(skipped_record(
503            path,
504            root,
505            metadata.len(),
506            FileStatus::SkippedByPolicy,
507            vec!["vendor file skipped by policy".into()],
508        )));
509    }
510
511    let generated = config.analysis.generated_file_detection && looks_generated(path, &bytes);
512    if generated {
513        return Ok(Some(skipped_record(
514            path,
515            root,
516            metadata.len(),
517            FileStatus::SkippedByPolicy,
518            vec!["generated file skipped by policy".into()],
519        )));
520    }
521
522    let minified = config.analysis.minified_file_detection && looks_minified(path, &bytes);
523    if minified {
524        return Ok(Some(skipped_record(
525            path,
526            root,
527            metadata.len(),
528            FileStatus::SkippedByPolicy,
529            vec!["minified file skipped by policy".into()],
530        )));
531    }
532
533    if is_binary(&bytes) {
534        return match config.analysis.binary_file_behavior {
535            BinaryFileBehavior::Skip => Ok(Some(skipped_record(
536                path,
537                root,
538                metadata.len(),
539                FileStatus::SkippedBinary,
540                vec!["binary file skipped by default".into()],
541            ))),
542            BinaryFileBehavior::Fail => {
543                anyhow::bail!("binary file encountered: {}", path.display())
544            }
545        };
546    }
547
548    let (text, encoding, decode_warnings) = match decode_bytes(&bytes) {
549        Ok(result) => result,
550        Err(err) => {
551            return match config.analysis.decode_failure_behavior {
552                FailureBehavior::WarnSkip => Ok(Some(skipped_record(
553                    path,
554                    root,
555                    metadata.len(),
556                    FileStatus::SkippedDecodeError,
557                    vec![err],
558                ))),
559                FailureBehavior::Fail => {
560                    anyhow::bail!("decode failure for {}: {err}", path.display())
561                }
562            };
563        }
564    };
565
566    let first_line = text.lines().next();
567    let language = detect_language(
568        path,
569        first_line,
570        &config.analysis.extension_overrides,
571        config.analysis.shebang_detection,
572    );
573
574    let Some(language) = language else {
575        return Ok(Some(skipped_record(
576            path,
577            root,
578            metadata.len(),
579            FileStatus::SkippedUnsupported,
580            vec!["unsupported or undetected language".into()],
581        )));
582    };
583
584    if let Some(enabled) = enabled_languages {
585        if !enabled.contains(&language) {
586            return Ok(Some(skipped_record(
587                path,
588                root,
589                metadata.len(),
590                FileStatus::SkippedByPolicy,
591                vec![format!(
592                    "language {} disabled by configuration",
593                    language.display_name()
594                )],
595            )));
596        }
597    }
598
599    let ieee_opts = AnalysisOptions {
600        blank_in_block_comment_as_comment: config.analysis.blank_in_block_comment_policy
601            == BlankInBlockCommentPolicy::CountAsComment,
602        collapse_continuation_lines: config.analysis.continuation_line_policy
603            == ContinuationLinePolicy::CollapseToLogical,
604    };
605    let analysis = analyze_text(language, &text, ieee_opts);
606    let effective_counts = compute_effective_counts(
607        &analysis.raw,
608        config.analysis.mixed_line_policy,
609        config.analysis.python_docstrings_as_comments,
610        config.analysis.count_compiler_directives,
611    );
612
613    let mut warnings = decode_warnings;
614    warnings.extend(analysis.warnings.clone());
615
616    Ok(Some(FileRecord {
617        path: path_to_string(path),
618        relative_path,
619        language: Some(language),
620        size_bytes: metadata.len(),
621        detected_encoding: Some(encoding),
622        raw_line_categories: analysis.raw,
623        effective_counts,
624        status: match analysis.parse_mode {
625            ParseMode::Lexical => FileStatus::AnalyzedExact,
626            ParseMode::LexicalBestEffort => FileStatus::AnalyzedBestEffort,
627            ParseMode::TreeSitter => FileStatus::AnalyzedExact,
628        },
629        warnings,
630        generated,
631        minified,
632        vendor,
633        parse_mode: Some(analysis.parse_mode),
634        submodule: None,
635    }))
636}
637
638fn compute_effective_counts(
639    raw: &RawLineCounts,
640    mixed_line_policy: MixedLinePolicy,
641    python_docstrings_as_comments: bool,
642    count_compiler_directives: bool,
643) -> EffectiveCounts {
644    let mut effective = EffectiveCounts {
645        code_lines: raw.code_only_lines,
646        comment_lines: raw.single_comment_only_lines + raw.multi_comment_only_lines,
647        blank_lines: raw.blank_only_lines,
648        mixed_lines_separate: 0,
649    };
650
651    if python_docstrings_as_comments {
652        effective.comment_lines += raw.docstring_comment_lines;
653    } else {
654        effective.code_lines += raw.docstring_comment_lines;
655    }
656
657    let mixed_total = raw.mixed_code_single_comment_lines + raw.mixed_code_multi_comment_lines;
658    match mixed_line_policy {
659        MixedLinePolicy::CodeOnly => effective.code_lines += mixed_total,
660        MixedLinePolicy::CodeAndComment => {
661            effective.code_lines += mixed_total;
662            effective.comment_lines += mixed_total;
663        }
664        MixedLinePolicy::CommentOnly => effective.comment_lines += mixed_total,
665        MixedLinePolicy::SeparateMixedCategory => effective.mixed_lines_separate += mixed_total,
666    }
667
668    // IEEE 1045-1992 §4.2: optionally exclude preprocessor/compiler directives from code SLOC.
669    // compiler_directive_lines is a subset of code_only_lines, so subtract it directly.
670    if !count_compiler_directives {
671        effective.code_lines = effective
672            .code_lines
673            .saturating_sub(raw.compiler_directive_lines);
674    }
675
676    effective
677}
678
679fn build_summary(analyzed: &[FileRecord], skipped: &[FileRecord]) -> SummaryTotals {
680    let mut summary = SummaryTotals {
681        files_considered: (analyzed.len() + skipped.len()) as u64,
682        files_analyzed: analyzed.len() as u64,
683        files_skipped: skipped.len() as u64,
684        ..Default::default()
685    };
686
687    for record in analyzed {
688        summary.total_physical_lines += record.raw_line_categories.total_physical_lines;
689        summary.code_lines += record.effective_counts.code_lines;
690        summary.comment_lines += record.effective_counts.comment_lines;
691        summary.blank_lines += record.effective_counts.blank_lines;
692        summary.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
693        summary.functions += record.raw_line_categories.functions;
694        summary.classes += record.raw_line_categories.classes;
695        summary.variables += record.raw_line_categories.variables;
696        summary.imports += record.raw_line_categories.imports;
697    }
698
699    summary
700}
701
702fn build_language_summaries(analyzed: &[FileRecord]) -> Vec<LanguageSummary> {
703    let mut by_language: BTreeMap<Language, LanguageSummary> = BTreeMap::new();
704    for record in analyzed {
705        let Some(language) = record.language else {
706            continue;
707        };
708        let entry = by_language.entry(language).or_insert(LanguageSummary {
709            language,
710            files: 0,
711            total_physical_lines: 0,
712            code_lines: 0,
713            comment_lines: 0,
714            blank_lines: 0,
715            mixed_lines_separate: 0,
716            functions: 0,
717            classes: 0,
718            variables: 0,
719            imports: 0,
720        });
721        entry.files += 1;
722        entry.total_physical_lines += record.raw_line_categories.total_physical_lines;
723        entry.code_lines += record.effective_counts.code_lines;
724        entry.comment_lines += record.effective_counts.comment_lines;
725        entry.blank_lines += record.effective_counts.blank_lines;
726        entry.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
727        entry.functions += record.raw_line_categories.functions;
728        entry.classes += record.raw_line_categories.classes;
729        entry.variables += record.raw_line_categories.variables;
730        entry.imports += record.raw_line_categories.imports;
731    }
732
733    by_language.into_values().collect()
734}
735
736fn skipped_record(
737    path: &Path,
738    root: &Path,
739    size_bytes: u64,
740    status: FileStatus,
741    warnings: Vec<String>,
742) -> FileRecord {
743    FileRecord {
744        path: path_to_string(path),
745        relative_path: relative_path_string(path, root),
746        language: None,
747        size_bytes,
748        detected_encoding: None,
749        raw_line_categories: RawLineCounts::default(),
750        effective_counts: EffectiveCounts::default(),
751        status,
752        warnings,
753        generated: false,
754        minified: false,
755        vendor: false,
756        parse_mode: None,
757        submodule: None,
758    }
759}
760
/// Renders `path` relative to `root` with `/` as the separator.
///
/// When `path` does not start with `root`, the whole `path` is rendered
/// instead. Backslashes are replaced by `/`.
fn relative_path_string(path: &Path, root: &Path) -> String {
    let relative = match path.strip_prefix(root) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };
    let rendered = relative.to_string_lossy();
    rendered.replace('\\', "/")
}
767
/// Renders `path` as text (lossily for non-UTF-8 names) with every
/// backslash replaced by `/`.
fn path_to_string(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
771
/// Parse `.gitmodules` in `root` and return `(name, relative_path)` for each submodule found.
///
/// Only `[submodule "<name>"]` section headers and their `path` key are
/// interpreted; the key name is matched case-insensitively and must be exactly
/// `path`. A section without a `path` entry is ignored. A missing or unreadable
/// `.gitmodules` file yields an empty list, and malformed lines are skipped
/// instead of causing a panic.
pub fn detect_submodules(root: &Path) -> Vec<(String, PathBuf)> {
    let Ok(content) = fs::read_to_string(root.join(".gitmodules")) else {
        return Vec::new();
    };

    let mut result = Vec::new();
    let mut current_name: Option<String> = None;
    let mut current_path: Option<PathBuf> = None;

    for line in content.lines() {
        let trimmed = line.trim();

        // `strip_prefix` + `strip_suffix` cannot overlap, unlike manual
        // slicing, so a degenerate header such as `[submodule "]` is simply
        // not recognised.
        let header = trimmed
            .strip_prefix("[submodule \"")
            .and_then(|rest| rest.strip_suffix("\"]"));

        if let Some(name) = header {
            // A new section starts: flush the previous one if it was complete.
            if let (Some(name), Some(path)) = (current_name.take(), current_path.take()) {
                result.push((name, path));
            }
            current_name = Some(name.to_string());
            continue;
        }

        if let Some((key, value)) = trimmed.split_once('=') {
            if key.trim().eq_ignore_ascii_case("path") {
                current_path = Some(PathBuf::from(value.trim()));
            }
        }
    }

    // Flush the last section.
    if let (Some(name), Some(path)) = (current_name, current_path) {
        result.push((name, path));
    }

    result
}
808
809fn build_submodule_summaries(
810    analyzed: &[FileRecord],
811    submodules: &[(String, PathBuf)],
812) -> Vec<SubmoduleSummary> {
813    submodules
814        .iter()
815        .map(|(name, path)| {
816            let files: Vec<&FileRecord> = analyzed
817                .iter()
818                .filter(|f| f.submodule.as_deref() == Some(name.as_str()))
819                .collect();
820
821            let files_analyzed = files.len() as u64;
822            let total_physical_lines = files
823                .iter()
824                .map(|f| f.raw_line_categories.total_physical_lines)
825                .sum();
826            let code_lines = files.iter().map(|f| f.effective_counts.code_lines).sum();
827            let comment_lines = files.iter().map(|f| f.effective_counts.comment_lines).sum();
828            let blank_lines = files.iter().map(|f| f.effective_counts.blank_lines).sum();
829            let language_summaries = build_language_summaries_from_slice(&files);
830
831            SubmoduleSummary {
832                name: name.clone(),
833                relative_path: path.to_string_lossy().replace('\\', "/"),
834                files_analyzed,
835                total_physical_lines,
836                code_lines,
837                comment_lines,
838                blank_lines,
839                language_summaries,
840            }
841        })
842        .filter(|s| s.files_analyzed > 0)
843        .collect()
844}
845
846fn build_language_summaries_from_slice(files: &[&FileRecord]) -> Vec<LanguageSummary> {
847    let mut map: BTreeMap<String, LanguageSummary> = BTreeMap::new();
848    for file in files {
849        if let Some(lang) = file.language {
850            let entry = map
851                .entry(lang.display_name().to_string())
852                .or_insert_with(|| LanguageSummary {
853                    language: lang,
854                    files: 0,
855                    total_physical_lines: 0,
856                    code_lines: 0,
857                    comment_lines: 0,
858                    blank_lines: 0,
859                    mixed_lines_separate: 0,
860                    functions: 0,
861                    classes: 0,
862                    variables: 0,
863                    imports: 0,
864                });
865            entry.files += 1;
866            let r = &file.raw_line_categories;
867            entry.total_physical_lines += r.total_physical_lines;
868            entry.code_lines += file.effective_counts.code_lines;
869            entry.comment_lines += file.effective_counts.comment_lines;
870            entry.blank_lines += file.effective_counts.blank_lines;
871            entry.mixed_lines_separate += file.effective_counts.mixed_lines_separate;
872        }
873    }
874    map.into_values().collect()
875}
876
/// Reports whether the final component of `path` is exactly `expected`.
///
/// Paths without a file name, or with a name that is not valid UTF-8,
/// never match.
fn file_name_eq(path: &Path, expected: &str) -> bool {
    matches!(
        path.file_name().and_then(|name| name.to_str()),
        Some(name) if name == expected
    )
}
883
/// Reports whether any component of `path` equals one of `excluded_dirs`.
///
/// The comparison is exact and case-sensitive; components that are not valid
/// UTF-8 never match.
fn is_excluded_dir_path(path: &Path, excluded_dirs: &[String]) -> bool {
    for component in path.components() {
        let Some(part) = component.as_os_str().to_str() else {
            continue;
        };
        if excluded_dirs.iter().any(|excluded| excluded.as_str() == part) {
            return true;
        }
    }
    false
}
893
/// Reports whether `path` passes through a third-party dependency directory.
///
/// A path qualifies when any of its components is exactly `vendor`,
/// `node_modules` or `packages` (case-sensitive). Components that are not
/// valid UTF-8 are ignored.
fn is_vendor_path(path: &Path) -> bool {
    path.components()
        .filter_map(|component| component.as_os_str().to_str())
        .any(|part| part == "vendor" || part == "node_modules" || part == "packages")
}
903
/// Reports whether the file name of `path` is one of the recognised
/// package-manager lockfiles.
///
/// Only the final path component is examined and the match is exact and
/// case-sensitive. Paths without a UTF-8 file name never match.
fn is_known_lockfile(path: &Path) -> bool {
    const LOCKFILE_NAMES: [&str; 7] = [
        "Cargo.lock",
        "package-lock.json",
        "yarn.lock",
        "pnpm-lock.yaml",
        "Pipfile.lock",
        "poetry.lock",
        "composer.lock",
    ];

    match path.file_name().and_then(|raw| raw.to_str()) {
        Some(name) => LOCKFILE_NAMES.iter().any(|known| *known == name),
        None => false,
    }
}
921
/// Heuristically decides whether a file was produced by a code generator.
///
/// A file is flagged when its name contains `.generated.` or `.g.`, or when
/// the first 1024 bytes of `bytes` (decoded lossily as UTF-8 and
/// ASCII-lowercased) contain `@generated` or `generated by`.
fn looks_generated(path: &Path, bytes: &[u8]) -> bool {
    let name = path
        .file_name()
        .and_then(|raw| raw.to_str())
        .unwrap_or("");
    let flagged_by_name = [".generated.", ".g."]
        .iter()
        .any(|marker| name.contains(*marker));
    if flagged_by_name {
        return true;
    }

    // Only the head of the file is inspected; a marker further in is ignored.
    let head = &bytes[..bytes.len().min(1024)];
    let lowered = String::from_utf8_lossy(head).to_ascii_lowercase();
    ["@generated", "generated by"]
        .iter()
        .any(|marker| lowered.contains(*marker))
}
934
/// Heuristically decides whether a file is minified.
///
/// A file is flagged when its name contains `.min.`. Otherwise the first 4096
/// bytes are decoded lossily as UTF-8 and the file is flagged when that sample
/// has a line longer than 2000 bytes and whitespace characters make up less
/// than 1% of the sample's byte length.
fn looks_minified(path: &Path, bytes: &[u8]) -> bool {
    let name = path
        .file_name()
        .and_then(|raw| raw.to_str())
        .unwrap_or("");
    if name.contains(".min.") {
        return true;
    }

    let head = &bytes[..bytes.len().min(4096)];
    let text = String::from_utf8_lossy(head);

    let mut longest = 0usize;
    for line in text.lines() {
        longest = longest.max(line.len());
    }
    if longest <= 2000 {
        return false;
    }

    let whitespace_chars = text.chars().filter(|ch| ch.is_whitespace()).count();
    whitespace_chars * 100 < text.len().max(1)
}
949
/// Heuristically decides whether `bytes` is binary rather than text.
///
/// Input that starts with a UTF-8, UTF-16LE or UTF-16BE byte-order mark is
/// always treated as text. Otherwise the input is binary when a NUL byte
/// appears within its first 8192 bytes.
fn is_binary(bytes: &[u8]) -> bool {
    const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
    const UTF16_LE_BOM: &[u8] = &[0xFF, 0xFE];
    const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF];

    let has_text_bom = [UTF8_BOM, UTF16_LE_BOM, UTF16_BE_BOM]
        .iter()
        .any(|bom| bytes.starts_with(bom));
    if has_text_bom {
        return false;
    }

    bytes.iter().take(8192).any(|&byte| byte == 0)
}
961
962fn decode_bytes(bytes: &[u8]) -> std::result::Result<(String, String, Vec<String>), String> {
963    if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
964        let text = String::from_utf8(bytes[3..].to_vec()).map_err(|err| err.to_string())?;
965        return Ok((text, "utf-8-bom".into(), vec![]));
966    }
967
968    if bytes.starts_with(&[0xFF, 0xFE]) {
969        let (cow, _, had_errors) = UTF_16LE.decode(&bytes[2..]);
970        let mut warnings = Vec::new();
971        if had_errors {
972            warnings.push("utf-16le decode contained replacement characters".into());
973        }
974        return Ok((cow.into_owned(), "utf-16le".into(), warnings));
975    }
976
977    if bytes.starts_with(&[0xFE, 0xFF]) {
978        let (cow, _, had_errors) = UTF_16BE.decode(&bytes[2..]);
979        let mut warnings = Vec::new();
980        if had_errors {
981            warnings.push("utf-16be decode contained replacement characters".into());
982        }
983        return Ok((cow.into_owned(), "utf-16be".into(), warnings));
984    }
985
986    match String::from_utf8(bytes.to_vec()) {
987        Ok(text) => Ok((text, "utf-8".into(), vec![])),
988        Err(_) => {
989            let (cow, _, had_errors) = WINDOWS_1252.decode(bytes);
990            let mut warnings = vec!["decoded using windows-1252 fallback".into()];
991            if had_errors {
992                warnings.push("fallback decode contained replacement characters".into());
993            }
994            Ok((cow.into_owned(), "windows-1252".into(), warnings))
995        }
996    }
997}
998
999fn compile_globset(patterns: &[String]) -> Result<Option<GlobSet>> {
1000    if patterns.is_empty() {
1001        return Ok(None);
1002    }
1003
1004    let mut builder = GlobSetBuilder::new();
1005    for pattern in patterns {
1006        builder
1007            .add(Glob::new(pattern).with_context(|| format!("invalid glob pattern: {pattern}"))?);
1008    }
1009    Ok(Some(
1010        builder.build().context("failed to compile glob filters")?,
1011    ))
1012}
1013
1014fn parse_enabled_languages(enabled: &[String]) -> Result<Option<BTreeSet<Language>>> {
1015    if enabled.is_empty() {
1016        return Ok(None);
1017    }
1018
1019    let supported = supported_languages();
1020    let mut set = BTreeSet::new();
1021    for name in enabled {
1022        let language = Language::from_name(name)
1023            .with_context(|| format!("unsupported language in config: {name}"))?;
1024        if !supported.contains(&language) {
1025            anyhow::bail!("language {name} is not supported in this build");
1026        }
1027        set.insert(language);
1028    }
1029    Ok(Some(set))
1030}
1031
1032pub fn write_json(run: &AnalysisRun, output_path: &Path) -> Result<()> {
1033    let json = serde_json::to_string_pretty(run).context("failed to serialize analysis run")?;
1034    fs::write(output_path, json)
1035        .with_context(|| format!("failed to write JSON output to {}", output_path.display()))
1036}
1037
1038pub fn read_json(path: &Path) -> Result<AnalysisRun> {
1039    let contents = fs::read_to_string(path)
1040        .with_context(|| format!("failed to read result file {}", path.display()))?;
1041    serde_json::from_str(&contents)
1042        .with_context(|| format!("failed to parse JSON result {}", path.display()))
1043}
1044
#[cfg(test)]
mod tests {
    use super::*;

    /// With `MixedLinePolicy::CodeOnly`, the expected totals are
    /// 2 code-only + 3 mixed = 5 code lines and
    /// 1 single-comment + 2 docstring = 3 comment lines.
    #[test]
    fn effective_counts_respect_code_only_policy() {
        let raw_counts = RawLineCounts {
            code_only_lines: 2,
            single_comment_only_lines: 1,
            mixed_code_single_comment_lines: 3,
            docstring_comment_lines: 2,
            ..RawLineCounts::default()
        };

        let effective =
            compute_effective_counts(&raw_counts, MixedLinePolicy::CodeOnly, true, true);

        assert_eq!(effective.code_lines, 5);
        assert_eq!(effective.comment_lines, 3);
    }

    /// With `MixedLinePolicy::SeparateMixedCategory`, both kinds of mixed line
    /// land in the separate bucket and add nothing to code or comments.
    #[test]
    fn effective_counts_can_separate_mixed() {
        let raw_counts = RawLineCounts {
            mixed_code_single_comment_lines: 2,
            mixed_code_multi_comment_lines: 1,
            ..RawLineCounts::default()
        };

        let effective = compute_effective_counts(
            &raw_counts,
            MixedLinePolicy::SeparateMixedCategory,
            true,
            true,
        );

        assert_eq!(effective.mixed_lines_separate, 3);
        assert_eq!(effective.code_lines, 0);
        assert_eq!(effective.comment_lines, 0);
    }

    /// A lone 0x96 byte is invalid UTF-8, so decoding must fall back to
    /// Windows-1252, where 0x96 is an en dash, and report a warning.
    #[test]
    fn windows_1252_fallback_decodes() {
        let input: Vec<u8> = vec![0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x96, 0x57];

        let (decoded, label, notes) = decode_bytes(&input).unwrap();

        assert_eq!(label, "windows-1252");
        assert!(decoded.contains('–'));
        assert!(!notes.is_empty());
    }
}