1#![allow(clippy::multiple_crate_versions)]
4
5pub mod delta;
6pub mod history;
7pub use delta::{compute_delta, FileChangeStatus, FileDelta, ScanComparison, SummaryDelta};
8pub use history::{RegistryEntry, ScanRegistry, ScanSummarySnapshot};
9
10use std::collections::{BTreeMap, BTreeSet, HashSet};
11use std::fs;
12use std::path::{Path, PathBuf};
13
14use anyhow::{Context, Result};
15use chrono::{DateTime, Utc};
16use encoding_rs::{UTF_16BE, UTF_16LE, WINDOWS_1252};
17use globset::{Glob, GlobSet, GlobSetBuilder};
18use ignore::WalkBuilder;
19use serde::{Deserialize, Serialize};
20use uuid::Uuid;
21
22use sloc_config::{
23 AppConfig, BinaryFileBehavior, BlankInBlockCommentPolicy, ContinuationLinePolicy,
24 FailureBehavior, MixedLinePolicy,
25};
26use sloc_languages::{
27 analyze_text, detect_language, supported_languages, AnalysisOptions, Language, ParseMode,
28 RawLineCounts,
29};
30
/// Result of the metadata-only policy checks run by `check_metadata_policy`.
enum MetadataPolicyOutcome {
    /// The file is rejected by policy; the boxed record is reported as skipped.
    Skip(Box<FileRecord>),
    /// The file did not match the include globs and is dropped without a record.
    Exclude,
    /// No metadata policy applied; the caller goes on to read the file.
    Continue,
}
40
/// Outcome recorded for every file the scan produced a record for.
///
/// Variant names are serialized in `snake_case`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FileStatus {
    /// Analysis completed with the `Lexical` or `TreeSitter` parse mode.
    AnalyzedExact,
    /// Analysis completed with the `LexicalBestEffort` parse mode.
    AnalyzedBestEffort,
    /// The content was detected as binary and skipped.
    SkippedBinary,
    /// The content could not be decoded and the failure policy is warn-and-skip.
    SkippedDecodeError,
    /// No supported language could be detected for the file.
    SkippedUnsupported,
    /// A discovery or analysis policy (symlink, size limit, glob, vendor, ...) rejected the file.
    SkippedByPolicy,
    /// Reading the file's metadata or contents failed.
    ErrorInternal,
}
52
/// Line counts for one file after the configured counting policies were applied
/// (see `compute_effective_counts`).
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct EffectiveCounts {
    /// Lines counted as code.
    pub code_lines: u64,
    /// Lines counted as comments.
    pub comment_lines: u64,
    /// Lines counted as blank.
    pub blank_lines: u64,
    /// Mixed code/comment lines; only populated under
    /// `MixedLinePolicy::SeparateMixedCategory`.
    pub mixed_lines_separate: u64,
}
60
/// Identifies the tool build and the individual run that produced a report.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolMetadata {
    /// Tool name (`assemble_run` sets this to `"sloc"`).
    pub name: String,
    /// Crate version, taken from `CARGO_PKG_VERSION` at compile time.
    pub version: String,
    /// Run identifier: a `%Y%m%d-%H%M` timestamp followed by a UUID v4.
    pub run_id: String,
    /// UTC time at which the run was assembled.
    pub timestamp_utc: DateTime<Utc>,
}
68
/// Describes the machine and the invocation context of a run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnvironmentMetadata {
    /// Value of `std::env::consts::OS`.
    pub operating_system: String,
    /// Value of `std::env::consts::ARCH`.
    pub architecture: String,
    /// Caller-supplied label passed to `analyze` as `runtime_mode`.
    pub runtime_mode: String,
    /// User name taken from the environment, `"unknown"` when unavailable.
    pub initiator_username: String,
    /// Host name taken from the environment, `"unknown"` when unavailable.
    pub initiator_hostname: String,
}
77
/// Aggregate counts over an entire run (see `build_summary`).
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SummaryTotals {
    /// Number of analyzed plus skipped files.
    pub files_considered: u64,
    /// Number of files that were analyzed.
    pub files_analyzed: u64,
    /// Number of files that ended up in the skipped list.
    pub files_skipped: u64,
    /// Sum of physical lines over all analyzed files.
    pub total_physical_lines: u64,
    /// Sum of effective code lines over all analyzed files.
    pub code_lines: u64,
    /// Sum of effective comment lines over all analyzed files.
    pub comment_lines: u64,
    /// Sum of effective blank lines over all analyzed files.
    pub blank_lines: u64,
    /// Sum of separately-counted mixed lines over all analyzed files.
    pub mixed_lines_separate: u64,
    /// Sum of the per-file `functions` counter; defaults to 0 when absent from input.
    #[serde(default)]
    pub functions: u64,
    /// Sum of the per-file `classes` counter; defaults to 0 when absent from input.
    #[serde(default)]
    pub classes: u64,
    /// Sum of the per-file `variables` counter; defaults to 0 when absent from input.
    #[serde(default)]
    pub variables: u64,
    /// Sum of the per-file `imports` counter; defaults to 0 when absent from input.
    #[serde(default)]
    pub imports: u64,
}
97
/// Aggregate counts for all analyzed files of a single language.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageSummary {
    /// Language the totals below belong to.
    pub language: Language,
    /// Number of analyzed files in this language.
    pub files: u64,
    /// Sum of physical lines.
    pub total_physical_lines: u64,
    /// Sum of effective code lines.
    pub code_lines: u64,
    /// Sum of effective comment lines.
    pub comment_lines: u64,
    /// Sum of effective blank lines.
    pub blank_lines: u64,
    /// Sum of separately-counted mixed lines.
    pub mixed_lines_separate: u64,
    /// Sum of the per-file `functions` counter; defaults to 0 when absent from input.
    #[serde(default)]
    pub functions: u64,
    /// Sum of the per-file `classes` counter; defaults to 0 when absent from input.
    #[serde(default)]
    pub classes: u64,
    /// Sum of the per-file `variables` counter; defaults to 0 when absent from input.
    #[serde(default)]
    pub variables: u64,
    /// Sum of the per-file `imports` counter; defaults to 0 when absent from input.
    #[serde(default)]
    pub imports: u64,
}
116
/// Everything recorded about one file, whether it was analyzed or skipped.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileRecord {
    /// Path of the file with backslashes replaced by forward slashes.
    pub path: String,
    /// Path relative to the scan root (forward slashes); the full path when
    /// the file does not live under the root.
    pub relative_path: String,
    /// Detected language; `None` on skipped records.
    pub language: Option<Language>,
    /// File size in bytes as reported by the file metadata.
    pub size_bytes: u64,
    /// Label of the encoding used to decode the file; `None` on skipped records.
    pub detected_encoding: Option<String>,
    /// Raw per-category line counts produced by the language analyzer.
    pub raw_line_categories: RawLineCounts,
    /// Counts after the configured policies were applied.
    pub effective_counts: EffectiveCounts,
    /// Final outcome for this file.
    pub status: FileStatus,
    /// Decode and analysis warnings, or the reason the file was skipped.
    pub warnings: Vec<String>,
    /// Generated-file detection result (always `false` on skipped records).
    pub generated: bool,
    /// Minified-file detection result (always `false` on skipped records).
    pub minified: bool,
    /// Whether the path contains a vendor-style directory component
    /// (always `false` on skipped records).
    pub vendor: bool,
    /// Parse mode used by the analyzer; `None` on skipped records.
    pub parse_mode: Option<ParseMode>,
    /// Name of the git submodule containing the file, if any; omitted from
    /// serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub submodule: Option<String>,
}
135
/// Aggregate counts for the analyzed files that belong to one git submodule.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SubmoduleSummary {
    /// Submodule name from the `[submodule "..."]` header in `.gitmodules`.
    pub name: String,
    /// The submodule's `path` value, with backslashes replaced by forward slashes.
    pub relative_path: String,
    /// Number of analyzed files attributed to the submodule.
    pub files_analyzed: u64,
    /// Sum of physical lines over those files.
    pub total_physical_lines: u64,
    /// Sum of effective code lines over those files.
    pub code_lines: u64,
    /// Sum of effective comment lines over those files.
    pub comment_lines: u64,
    /// Sum of effective blank lines over those files.
    pub blank_lines: u64,
    /// Per-language breakdown of the same files.
    pub language_summaries: Vec<LanguageSummary>,
}
148
/// Complete result of one `analyze` invocation; this is the document written
/// by `write_json` and read back by `read_json`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnalysisRun {
    /// Tool name, version and run identity.
    pub tool: ToolMetadata,
    /// Machine and invocation context.
    pub environment: EnvironmentMetadata,
    /// Copy of the configuration the run was executed with.
    pub effective_configuration: AppConfig,
    /// The configured root paths, as strings with forward slashes.
    pub input_roots: Vec<String>,
    /// Totals over all files.
    pub summary_totals: SummaryTotals,
    /// Totals grouped by language.
    pub totals_by_language: Vec<LanguageSummary>,
    /// Records of analyzed files, sorted by relative path.
    pub per_file_records: Vec<FileRecord>,
    /// Records of skipped files, sorted by relative path.
    pub skipped_file_records: Vec<FileRecord>,
    /// Discovery warnings plus per-file warnings prefixed with the file's relative path.
    pub warnings: Vec<String>,
    /// Per-submodule totals; empty (and omitted from output) unless the
    /// submodule breakdown is enabled and submodules were found.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub submodule_summaries: Vec<SubmoduleSummary>,
    /// Abbreviated hash of `HEAD` in the first root, when git reports one.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_short: Option<String>,
    /// Full hash of `HEAD` in the first root, when git reports one.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_long: Option<String>,
    /// Current branch name, when git reports one.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_branch: Option<String>,
    /// Author name of the most recent commit, when git reports one.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_author: Option<String>,
    /// Tags pointing at `HEAD`, joined with `", "`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_tags: Option<String>,
}
179
/// Runs `git` with `args` inside `dir` and returns its trimmed standard output.
///
/// Returns `None` when the process cannot be started, exits unsuccessfully,
/// prints invalid UTF-8, or prints nothing but whitespace.
fn run_git_in(dir: &Path, args: &[&str]) -> Option<String> {
    let output = std::process::Command::new("git")
        .current_dir(dir)
        .args(args)
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }

    let stdout = String::from_utf8(output.stdout).ok()?;
    let trimmed = stdout.trim();
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}
191
/// Git metadata gathered for a run; every field is `None` when git could not
/// provide it.
#[derive(Default)]
struct GitInfo {
    /// Output of `git rev-parse --short HEAD`.
    commit_short: Option<String>,
    /// Output of `git rev-parse HEAD`.
    commit_long: Option<String>,
    /// Output of `git branch --show-current`.
    branch: Option<String>,
    /// Output of `git log --format=%an -1`.
    author: Option<String>,
    /// Tags pointing at `HEAD`, joined with `", "`.
    tags: Option<String>,
}
200
201fn detect_git_for_run(project_path: &Path) -> GitInfo {
202 GitInfo {
203 commit_short: run_git_in(project_path, &["rev-parse", "--short", "HEAD"]),
204 commit_long: run_git_in(project_path, &["rev-parse", "HEAD"]),
205 branch: run_git_in(project_path, &["branch", "--show-current"]),
206 author: run_git_in(project_path, &["log", "--format=%an", "-1"]),
207 tags: run_git_in(project_path, &["tag", "--points-at", "HEAD"]).map(|t| {
208 t.lines()
209 .filter(|l| !l.is_empty())
210 .collect::<Vec<_>>()
211 .join(", ")
212 }),
213 }
214}
215
/// Returns the name of the user running the tool.
///
/// Checks `USERNAME` and then `USER`. A variable that is unset, not valid
/// Unicode, or blank is ignored, so an empty `USERNAME` no longer hides a
/// usable `USER`. Falls back to `"unknown"`.
fn get_current_username() -> String {
    ["USERNAME", "USER"]
        .iter()
        .filter_map(|key| std::env::var(key).ok())
        .find(|value| !value.trim().is_empty())
        .unwrap_or_else(|| "unknown".to_string())
}
221
/// Returns the host name of the machine running the tool.
///
/// Sources are tried in order: the `COMPUTERNAME` variable, the `HOSTNAME`
/// variable, then the contents of `/etc/hostname`. A source that is missing
/// or blank is skipped, and values are trimmed. Falls back to `"unknown"`.
fn get_hostname() -> String {
    /// Trims `value` and rejects it when nothing is left.
    fn non_blank(value: String) -> Option<String> {
        let trimmed = value.trim();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed.to_string())
        }
    }

    std::env::var("COMPUTERNAME")
        .ok()
        .and_then(non_blank)
        .or_else(|| std::env::var("HOSTNAME").ok().and_then(non_blank))
        .or_else(|| {
            std::fs::read_to_string("/etc/hostname")
                .ok()
                .and_then(non_blank)
        })
        .unwrap_or_else(|| "unknown".to_string())
}
228
229#[allow(clippy::too_many_arguments)]
231fn walk_root(
232 root: &Path,
233 config: &AppConfig,
234 include_globs: Option<&GlobSet>,
235 exclude_globs: Option<&GlobSet>,
236 enabled_languages: Option<&BTreeSet<Language>>,
237 seen_paths: &mut HashSet<PathBuf>,
238 analyzed: &mut Vec<FileRecord>,
239 skipped: &mut Vec<FileRecord>,
240 warnings: &mut Vec<String>,
241) -> Result<()> {
242 let mut builder = WalkBuilder::new(root);
243 builder
244 .follow_links(config.discovery.follow_symlinks)
245 .hidden(config.discovery.ignore_hidden_files)
246 .ignore(config.discovery.honor_ignore_files)
247 .parents(config.discovery.honor_ignore_files)
248 .git_ignore(config.discovery.honor_ignore_files)
249 .git_global(config.discovery.honor_ignore_files)
250 .git_exclude(config.discovery.honor_ignore_files);
251
252 for entry in builder.build() {
253 let entry = match entry {
254 Ok(entry) => entry,
255 Err(err) => {
256 warnings.push(format!("discovery warning: {err}"));
257 continue;
258 }
259 };
260
261 let path = entry.into_path();
262 if path.is_dir() || !seen_paths.insert(path.clone()) {
263 continue;
264 }
265
266 if let Some(record) = analyze_candidate_file(
267 &path,
268 root,
269 config,
270 include_globs,
271 exclude_globs,
272 enabled_languages,
273 )? {
274 push_record(record, analyzed, skipped, warnings);
275 }
276 }
277
278 Ok(())
279}
280
281fn process_submodules(config: &AppConfig, analyzed: &mut [FileRecord]) -> Vec<SubmoduleSummary> {
283 let root = config.discovery.root_paths[0]
284 .canonicalize()
285 .unwrap_or_else(|_| config.discovery.root_paths[0].clone());
286 let submodules = detect_submodules(&root);
287 if submodules.is_empty() {
288 return Vec::new();
289 }
290
291 for file in analyzed.iter_mut() {
292 for (name, sub_path) in &submodules {
293 let prefix = sub_path.to_string_lossy().replace('\\', "/");
294 let rel = &file.relative_path;
295 if rel == &prefix || rel.starts_with(&format!("{prefix}/")) {
296 file.submodule = Some(name.clone());
297 break;
298 }
299 }
300 }
301
302 build_submodule_summaries(analyzed, &submodules)
303}
304
305fn assemble_run(
307 config: &AppConfig,
308 runtime_mode: &str,
309 analyzed: Vec<FileRecord>,
310 skipped: Vec<FileRecord>,
311 warnings: Vec<String>,
312 submodule_summaries: Vec<SubmoduleSummary>,
313) -> AnalysisRun {
314 let summary = build_summary(&analyzed, &skipped);
315 let language_summaries = build_language_summaries(&analyzed);
316
317 let first_root = config
318 .discovery
319 .root_paths
320 .first()
321 .map(|p| p.canonicalize().unwrap_or_else(|_| p.clone()));
322 let git = first_root
323 .as_deref()
324 .map(detect_git_for_run)
325 .unwrap_or_default();
326
327 let now = Utc::now();
328 let run_id = {
329 let uuid_suffix = Uuid::new_v4().simple().to_string();
330 format!("{}-{}", now.format("%Y%m%d-%H%M"), uuid_suffix)
331 };
332
333 AnalysisRun {
334 tool: ToolMetadata {
335 name: "sloc".into(),
336 version: env!("CARGO_PKG_VERSION").into(),
337 run_id,
338 timestamp_utc: now,
339 },
340 environment: EnvironmentMetadata {
341 operating_system: std::env::consts::OS.into(),
342 architecture: std::env::consts::ARCH.into(),
343 runtime_mode: runtime_mode.into(),
344 initiator_username: get_current_username(),
345 initiator_hostname: get_hostname(),
346 },
347 effective_configuration: config.clone(),
348 input_roots: config
349 .discovery
350 .root_paths
351 .iter()
352 .map(|p| path_to_string(p))
353 .collect(),
354 summary_totals: summary,
355 totals_by_language: language_summaries,
356 per_file_records: analyzed,
357 skipped_file_records: skipped,
358 warnings,
359 submodule_summaries,
360 git_commit_short: git.commit_short,
361 git_commit_long: git.commit_long,
362 git_branch: git.branch,
363 git_commit_author: git.author,
364 git_tags: git.tags,
365 }
366}
367
368#[allow(clippy::too_many_lines)]
373pub fn analyze(config: &AppConfig, runtime_mode: &str) -> Result<AnalysisRun> {
374 config.validate()?;
375
376 if config.discovery.root_paths.is_empty() {
377 anyhow::bail!("no input paths were provided");
378 }
379
380 let include_globs = compile_globset(&config.discovery.include_globs)?;
381 let exclude_globs = compile_globset(&config.discovery.exclude_globs)?;
382 let enabled_languages = parse_enabled_languages(&config.analysis.enabled_languages)?;
383
384 let mut analyzed = Vec::new();
385 let mut skipped = Vec::new();
386 let mut warnings = Vec::new();
387 let mut seen_paths = HashSet::new();
388
389 for root in &config.discovery.root_paths {
390 let root = root.canonicalize().unwrap_or_else(|_| root.clone());
391
392 if root.is_file() {
393 if let Some(record) = analyze_candidate_file(
394 &root,
395 root.parent().unwrap_or_else(|| Path::new(".")),
396 config,
397 include_globs.as_ref(),
398 exclude_globs.as_ref(),
399 enabled_languages.as_ref(),
400 )? {
401 push_record(record, &mut analyzed, &mut skipped, &mut warnings);
402 }
403 continue;
404 }
405
406 walk_root(
407 &root,
408 config,
409 include_globs.as_ref(),
410 exclude_globs.as_ref(),
411 enabled_languages.as_ref(),
412 &mut seen_paths,
413 &mut analyzed,
414 &mut skipped,
415 &mut warnings,
416 )?;
417 }
418
419 analyzed.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
420 skipped.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
421
422 let submodule_summaries = if config.discovery.submodule_breakdown {
424 process_submodules(config, &mut analyzed)
425 } else {
426 Vec::new()
427 };
428
429 Ok(assemble_run(
430 config,
431 runtime_mode,
432 analyzed,
433 skipped,
434 warnings,
435 submodule_summaries,
436 ))
437}
438
439fn push_record(
440 record: FileRecord,
441 analyzed: &mut Vec<FileRecord>,
442 skipped: &mut Vec<FileRecord>,
443 warnings: &mut Vec<String>,
444) {
445 warnings.extend(
446 record
447 .warnings
448 .iter()
449 .map(|warning| format!("{}: {warning}", record.relative_path)),
450 );
451
452 match record.status {
453 FileStatus::AnalyzedExact | FileStatus::AnalyzedBestEffort => analyzed.push(record),
454 _ => skipped.push(record),
455 }
456}
457
458#[allow(clippy::too_many_arguments)]
462fn check_metadata_policy(
463 path: &Path,
464 root: &Path,
465 relative_path: &str,
466 metadata: &fs::Metadata,
467 config: &AppConfig,
468 include_globs: Option<&GlobSet>,
469 exclude_globs: Option<&GlobSet>,
470) -> MetadataPolicyOutcome {
471 if metadata.file_type().is_symlink() && !config.discovery.follow_symlinks {
472 return MetadataPolicyOutcome::Skip(Box::new(skipped_record(
473 path,
474 root,
475 metadata.len(),
476 FileStatus::SkippedByPolicy,
477 vec!["symlink skipped by policy".into()],
478 )));
479 }
480
481 if file_name_eq(path, ".gitignore") {
482 return MetadataPolicyOutcome::Skip(Box::new(skipped_record(
483 path,
484 root,
485 metadata.len(),
486 FileStatus::SkippedByPolicy,
487 vec![".gitignore is always excluded".into()],
488 )));
489 }
490
491 if is_excluded_dir_path(path, &config.discovery.excluded_directories) {
492 return MetadataPolicyOutcome::Skip(Box::new(skipped_record(
493 path,
494 root,
495 metadata.len(),
496 FileStatus::SkippedByPolicy,
497 vec!["path matched excluded directory setting".into()],
498 )));
499 }
500
501 if metadata.len() > config.discovery.max_file_size_bytes {
502 return MetadataPolicyOutcome::Skip(Box::new(skipped_record(
503 path,
504 root,
505 metadata.len(),
506 FileStatus::SkippedByPolicy,
507 vec![format!(
508 "file exceeded max_file_size_bytes ({})",
509 config.discovery.max_file_size_bytes
510 )],
511 )));
512 }
513
514 if let Some(globs) = include_globs {
515 if !globs.is_match(Path::new(relative_path)) && !globs.is_match(path) {
516 return MetadataPolicyOutcome::Exclude;
517 }
518 }
519
520 if let Some(globs) = exclude_globs {
521 if globs.is_match(Path::new(relative_path)) || globs.is_match(path) {
522 return MetadataPolicyOutcome::Skip(Box::new(skipped_record(
523 path,
524 root,
525 metadata.len(),
526 FileStatus::SkippedByPolicy,
527 vec!["path matched exclude glob".into()],
528 )));
529 }
530 }
531
532 if is_known_lockfile(path) && !config.analysis.include_lockfiles {
533 return MetadataPolicyOutcome::Skip(Box::new(skipped_record(
534 path,
535 root,
536 metadata.len(),
537 FileStatus::SkippedByPolicy,
538 vec!["lockfile skipped by default policy".into()],
539 )));
540 }
541
542 MetadataPolicyOutcome::Continue
543}
544
545fn check_content_policy(
549 path: &Path,
550 root: &Path,
551 size_bytes: u64,
552 bytes: &[u8],
553 config: &AppConfig,
554) -> (bool, bool, bool, Option<FileRecord>) {
555 let vendor = is_vendor_path(path);
556 if vendor && config.analysis.vendor_directory_detection {
557 return (
558 vendor,
559 false,
560 false,
561 Some(skipped_record(
562 path,
563 root,
564 size_bytes,
565 FileStatus::SkippedByPolicy,
566 vec!["vendor file skipped by policy".into()],
567 )),
568 );
569 }
570
571 let generated = config.analysis.generated_file_detection && looks_generated(path, bytes);
572 if generated {
573 return (
574 vendor,
575 generated,
576 false,
577 Some(skipped_record(
578 path,
579 root,
580 size_bytes,
581 FileStatus::SkippedByPolicy,
582 vec!["generated file skipped by policy".into()],
583 )),
584 );
585 }
586
587 let minified = config.analysis.minified_file_detection && looks_minified(path, bytes);
588 if minified {
589 return (
590 vendor,
591 generated,
592 minified,
593 Some(skipped_record(
594 path,
595 root,
596 size_bytes,
597 FileStatus::SkippedByPolicy,
598 vec!["minified file skipped by policy".into()],
599 )),
600 );
601 }
602
603 (vendor, generated, minified, None)
604}
605
606fn decode_file_contents(
608 path: &Path,
609 root: &Path,
610 size_bytes: u64,
611 bytes: &[u8],
612 config: &AppConfig,
613) -> Result<Option<(String, String, Vec<String>)>> {
614 if is_binary(bytes) {
615 return match config.analysis.binary_file_behavior {
616 BinaryFileBehavior::Skip => Ok(None),
617 BinaryFileBehavior::Fail => {
618 anyhow::bail!("binary file encountered: {}", path.display())
619 }
620 };
621 }
622
623 match decode_bytes(bytes) {
624 Ok(result) => Ok(Some(result)),
625 Err(err) => match config.analysis.decode_failure_behavior {
626 FailureBehavior::WarnSkip => {
627 let _ = (path, root, size_bytes); Err(anyhow::anyhow!("__decode_warn__: {err}"))
632 }
633 FailureBehavior::Fail => {
634 anyhow::bail!("decode failure for {}: {err}", path.display())
635 }
636 },
637 }
638}
639
/// Runs the full per-file pipeline for `path` and returns its record.
///
/// Steps, in order: read metadata, apply metadata policies, read the bytes,
/// apply content policies, decode, detect the language, check the enabled
/// language set, analyze the text and compute the effective counts.
///
/// Returns `Ok(None)` only when the file fails the include globs; every other
/// rejection yields a skipped record describing the reason.
///
/// # Errors
///
/// Returns an error when decoding reports a hard failure (binary or decode
/// failure with a `Fail` policy).
#[allow(clippy::too_many_lines)]
fn analyze_candidate_file(
    path: &Path,
    root: &Path,
    config: &AppConfig,
    include_globs: Option<&GlobSet>,
    exclude_globs: Option<&GlobSet>,
    enabled_languages: Option<&BTreeSet<Language>>,
) -> Result<Option<FileRecord>> {
    // `symlink_metadata` does not follow links, so symlinks can be detected
    // by the policy check below.
    let metadata = match fs::symlink_metadata(path) {
        Ok(metadata) => metadata,
        Err(err) => {
            return Ok(Some(skipped_record(
                path,
                root,
                0,
                FileStatus::ErrorInternal,
                vec![format!("failed to read metadata: {err}")],
            )));
        }
    };

    let relative_path = relative_path_string(path, root);

    // Policies that need no file contents run first, before any read.
    match check_metadata_policy(
        path,
        root,
        &relative_path,
        &metadata,
        config,
        include_globs,
        exclude_globs,
    ) {
        MetadataPolicyOutcome::Skip(record) => return Ok(Some(*record)),
        MetadataPolicyOutcome::Exclude => return Ok(None),
        MetadataPolicyOutcome::Continue => {}
    }

    let bytes = match fs::read(path) {
        Ok(bytes) => bytes,
        Err(err) => {
            return Ok(Some(skipped_record(
                path,
                root,
                metadata.len(),
                FileStatus::ErrorInternal,
                vec![format!("failed to read file: {err}")],
            )));
        }
    };

    // Vendor / generated / minified checks; the flags are kept for the record.
    let (vendor, generated, minified, skip_record) =
        check_content_policy(path, root, metadata.len(), &bytes, config);
    if let Some(record) = skip_record {
        return Ok(Some(record));
    }

    let (text, encoding, decode_warnings) =
        match decode_file_contents(path, root, metadata.len(), &bytes, config) {
            Ok(Some(result)) => result,
            // `None` means binary content under the skip policy.
            Ok(None) => {
                return Ok(Some(skipped_record(
                    path,
                    root,
                    metadata.len(),
                    FileStatus::SkippedBinary,
                    vec!["binary file skipped by default".into()],
                )));
            }
            Err(err) => {
                // A warn-and-skip decode failure is signalled through a
                // message prefix; anything else is a hard error.
                let msg = err.to_string();
                if let Some(warn_msg) = msg.strip_prefix("__decode_warn__: ") {
                    return Ok(Some(skipped_record(
                        path,
                        root,
                        metadata.len(),
                        FileStatus::SkippedDecodeError,
                        vec![warn_msg.to_string()],
                    )));
                }
                return Err(err);
            }
        };

    // The first line is handed to language detection along with the
    // shebang-detection setting.
    let first_line = text.lines().next();
    let language = detect_language(
        path,
        first_line,
        &config.analysis.extension_overrides,
        config.analysis.shebang_detection,
    );

    let Some(language) = language else {
        return Ok(Some(skipped_record(
            path,
            root,
            metadata.len(),
            FileStatus::SkippedUnsupported,
            vec!["unsupported or undetected language".into()],
        )));
    };

    // `None` means no language filter is configured.
    if let Some(enabled) = enabled_languages {
        if !enabled.contains(&language) {
            return Ok(Some(skipped_record(
                path,
                root,
                metadata.len(),
                FileStatus::SkippedByPolicy,
                vec![format!(
                    "language {} disabled by configuration",
                    language.display_name()
                )],
            )));
        }
    }

    let ieee_opts = AnalysisOptions {
        blank_in_block_comment_as_comment: config.analysis.blank_in_block_comment_policy
            == BlankInBlockCommentPolicy::CountAsComment,
        collapse_continuation_lines: config.analysis.continuation_line_policy
            == ContinuationLinePolicy::CollapseToLogical,
    };
    let analysis = analyze_text(language, &text, ieee_opts);
    let effective_counts = compute_effective_counts(
        &analysis.raw,
        config.analysis.mixed_line_policy,
        config.analysis.python_docstrings_as_comments,
        config.analysis.count_compiler_directives,
    );

    // Decode warnings come first, followed by analyzer warnings.
    let mut warnings = decode_warnings;
    warnings.extend(analysis.warnings.clone());

    Ok(Some(FileRecord {
        path: path_to_string(path),
        relative_path,
        language: Some(language),
        size_bytes: metadata.len(),
        detected_encoding: Some(encoding),
        raw_line_categories: analysis.raw,
        effective_counts,
        status: match analysis.parse_mode {
            ParseMode::Lexical | ParseMode::TreeSitter => FileStatus::AnalyzedExact,
            ParseMode::LexicalBestEffort => FileStatus::AnalyzedBestEffort,
        },
        warnings,
        generated,
        minified,
        vendor,
        parse_mode: Some(analysis.parse_mode),
        submodule: None,
    }))
}
797
798const fn compute_effective_counts(
799 raw: &RawLineCounts,
800 mixed_line_policy: MixedLinePolicy,
801 python_docstrings_as_comments: bool,
802 count_compiler_directives: bool,
803) -> EffectiveCounts {
804 let mut effective = EffectiveCounts {
805 code_lines: raw.code_only_lines,
806 comment_lines: raw.single_comment_only_lines + raw.multi_comment_only_lines,
807 blank_lines: raw.blank_only_lines,
808 mixed_lines_separate: 0,
809 };
810
811 if python_docstrings_as_comments {
812 effective.comment_lines += raw.docstring_comment_lines;
813 } else {
814 effective.code_lines += raw.docstring_comment_lines;
815 }
816
817 let mixed_total = raw.mixed_code_single_comment_lines + raw.mixed_code_multi_comment_lines;
818 match mixed_line_policy {
819 MixedLinePolicy::CodeOnly => effective.code_lines += mixed_total,
820 MixedLinePolicy::CodeAndComment => {
821 effective.code_lines += mixed_total;
822 effective.comment_lines += mixed_total;
823 }
824 MixedLinePolicy::CommentOnly => effective.comment_lines += mixed_total,
825 MixedLinePolicy::SeparateMixedCategory => effective.mixed_lines_separate += mixed_total,
826 }
827
828 if !count_compiler_directives {
831 effective.code_lines = effective
832 .code_lines
833 .saturating_sub(raw.compiler_directive_lines);
834 }
835
836 effective
837}
838
839fn build_summary(analyzed: &[FileRecord], skipped: &[FileRecord]) -> SummaryTotals {
840 let mut summary = SummaryTotals {
841 files_considered: (analyzed.len() + skipped.len()) as u64,
842 files_analyzed: analyzed.len() as u64,
843 files_skipped: skipped.len() as u64,
844 ..Default::default()
845 };
846
847 for record in analyzed {
848 summary.total_physical_lines += record.raw_line_categories.total_physical_lines;
849 summary.code_lines += record.effective_counts.code_lines;
850 summary.comment_lines += record.effective_counts.comment_lines;
851 summary.blank_lines += record.effective_counts.blank_lines;
852 summary.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
853 summary.functions += record.raw_line_categories.functions;
854 summary.classes += record.raw_line_categories.classes;
855 summary.variables += record.raw_line_categories.variables;
856 summary.imports += record.raw_line_categories.imports;
857 }
858
859 summary
860}
861
862fn build_language_summaries(analyzed: &[FileRecord]) -> Vec<LanguageSummary> {
863 let mut by_language: BTreeMap<Language, LanguageSummary> = BTreeMap::new();
864 for record in analyzed {
865 let Some(language) = record.language else {
866 continue;
867 };
868 let entry = by_language.entry(language).or_insert(LanguageSummary {
869 language,
870 files: 0,
871 total_physical_lines: 0,
872 code_lines: 0,
873 comment_lines: 0,
874 blank_lines: 0,
875 mixed_lines_separate: 0,
876 functions: 0,
877 classes: 0,
878 variables: 0,
879 imports: 0,
880 });
881 entry.files += 1;
882 entry.total_physical_lines += record.raw_line_categories.total_physical_lines;
883 entry.code_lines += record.effective_counts.code_lines;
884 entry.comment_lines += record.effective_counts.comment_lines;
885 entry.blank_lines += record.effective_counts.blank_lines;
886 entry.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
887 entry.functions += record.raw_line_categories.functions;
888 entry.classes += record.raw_line_categories.classes;
889 entry.variables += record.raw_line_categories.variables;
890 entry.imports += record.raw_line_categories.imports;
891 }
892
893 by_language.into_values().collect()
894}
895
896fn skipped_record(
897 path: &Path,
898 root: &Path,
899 size_bytes: u64,
900 status: FileStatus,
901 warnings: Vec<String>,
902) -> FileRecord {
903 FileRecord {
904 path: path_to_string(path),
905 relative_path: relative_path_string(path, root),
906 language: None,
907 size_bytes,
908 detected_encoding: None,
909 raw_line_categories: RawLineCounts::default(),
910 effective_counts: EffectiveCounts::default(),
911 status,
912 warnings,
913 generated: false,
914 minified: false,
915 vendor: false,
916 parse_mode: None,
917 submodule: None,
918 }
919}
920
/// Returns `path` relative to `root` as a string with forward slashes.
///
/// When `path` is not located under `root`, the whole path is used.
fn relative_path_string(path: &Path, root: &Path) -> String {
    let relative = match path.strip_prefix(root) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };
    let text = relative.to_string_lossy();
    text.replace('\\', "/")
}
927
/// Renders `path` as a (lossy) string, turning every backslash into a
/// forward slash.
fn path_to_string(path: &Path) -> String {
    let text = path.to_string_lossy();
    text.chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
931
/// Reads `<root>/.gitmodules` and returns `(name, path)` for every submodule
/// section that declares a path.
///
/// Only the `[submodule "<name>"]` headers and the `path = <value>` key are
/// interpreted. Malformed headers and other keys, including keys that merely
/// start with `path`, are ignored. Returns an empty list when the file is
/// missing or unreadable.
#[must_use]
pub fn detect_submodules(root: &Path) -> Vec<(String, PathBuf)> {
    let gitmodules = root.join(".gitmodules");
    if !gitmodules.is_file() {
        return Vec::new();
    }
    let Ok(content) = fs::read_to_string(&gitmodules) else {
        return Vec::new();
    };

    let mut result = Vec::new();
    let mut current_name: Option<String> = None;
    let mut current_path: Option<PathBuf> = None;

    for line in content.lines() {
        let trimmed = line.trim();

        // Stripping prefix and suffix separately cannot panic on a header
        // such as `[submodule "]`, where the two markers overlap.
        let header = trimmed
            .strip_prefix("[submodule \"")
            .and_then(|rest| rest.strip_suffix("\"]"));
        // The key must be exactly `path`: only whitespace may sit between it
        // and the `=`.
        let path_value = trimmed
            .strip_prefix("path")
            .and_then(|rest| rest.trim_start().strip_prefix('='));

        if let Some(name) = header {
            // A new section begins: flush the previous one if it was complete.
            if let (Some(done_name), Some(done_path)) = (current_name.take(), current_path.take())
            {
                result.push((done_name, done_path));
            }
            current_name = Some(name.to_string());
        } else if let Some(value) = path_value {
            current_path = Some(PathBuf::from(value.trim()));
        }
    }
    if let (Some(name), Some(path)) = (current_name, current_path) {
        result.push((name, path));
    }

    result
}
968
969fn build_submodule_summaries(
970 analyzed: &[FileRecord],
971 submodules: &[(String, PathBuf)],
972) -> Vec<SubmoduleSummary> {
973 submodules
974 .iter()
975 .map(|(name, path)| {
976 let files: Vec<&FileRecord> = analyzed
977 .iter()
978 .filter(|f| f.submodule.as_deref() == Some(name.as_str()))
979 .collect();
980
981 let files_analyzed = files.len() as u64;
982 let total_physical_lines = files
983 .iter()
984 .map(|f| f.raw_line_categories.total_physical_lines)
985 .sum();
986 let code_lines = files.iter().map(|f| f.effective_counts.code_lines).sum();
987 let comment_lines = files.iter().map(|f| f.effective_counts.comment_lines).sum();
988 let blank_lines = files.iter().map(|f| f.effective_counts.blank_lines).sum();
989 let language_summaries = build_language_summaries_from_slice(&files);
990
991 SubmoduleSummary {
992 name: name.clone(),
993 relative_path: path.to_string_lossy().replace('\\', "/"),
994 files_analyzed,
995 total_physical_lines,
996 code_lines,
997 comment_lines,
998 blank_lines,
999 language_summaries,
1000 }
1001 })
1002 .filter(|s| s.files_analyzed > 0)
1003 .collect()
1004}
1005
1006fn build_language_summaries_from_slice(files: &[&FileRecord]) -> Vec<LanguageSummary> {
1007 let mut map: BTreeMap<String, LanguageSummary> = BTreeMap::new();
1008 for file in files {
1009 if let Some(lang) = file.language {
1010 let entry = map
1011 .entry(lang.display_name().to_string())
1012 .or_insert_with(|| LanguageSummary {
1013 language: lang,
1014 files: 0,
1015 total_physical_lines: 0,
1016 code_lines: 0,
1017 comment_lines: 0,
1018 blank_lines: 0,
1019 mixed_lines_separate: 0,
1020 functions: 0,
1021 classes: 0,
1022 variables: 0,
1023 imports: 0,
1024 });
1025 entry.files += 1;
1026 let r = &file.raw_line_categories;
1027 entry.total_physical_lines += r.total_physical_lines;
1028 entry.code_lines += file.effective_counts.code_lines;
1029 entry.comment_lines += file.effective_counts.comment_lines;
1030 entry.blank_lines += file.effective_counts.blank_lines;
1031 entry.mixed_lines_separate += file.effective_counts.mixed_lines_separate;
1032 }
1033 }
1034 map.into_values().collect()
1035}
1036
/// Returns `true` when the final component of `path` is exactly `expected`.
///
/// Paths without a final file name, or with a non-UTF-8 one, never match.
fn file_name_eq(path: &Path, expected: &str) -> bool {
    let file_name = path.file_name().and_then(|name| name.to_str());
    file_name == Some(expected)
}
1042
/// Returns `true` when any component of `path` is exactly equal to one of
/// the names in `excluded_dirs`. Non-UTF-8 components never match.
fn is_excluded_dir_path(path: &Path, excluded_dirs: &[String]) -> bool {
    for component in path.components() {
        let Some(part) = component.as_os_str().to_str() else {
            continue;
        };
        if excluded_dirs.iter().any(|excluded| excluded.as_str() == part) {
            return true;
        }
    }
    false
}
1051
/// Returns `true` when `path` contains a component named `vendor`,
/// `node_modules` or `packages`.
fn is_vendor_path(path: &Path) -> bool {
    const VENDOR_DIRS: [&str; 3] = ["vendor", "node_modules", "packages"];

    path.components()
        .filter_map(|component| component.as_os_str().to_str())
        .any(|part| VENDOR_DIRS.contains(&part))
}
1060
/// Returns `true` when the file name of `path` is one of the recognized
/// package-manager lockfiles (case-sensitive).
fn is_known_lockfile(path: &Path) -> bool {
    const LOCKFILES: [&str; 7] = [
        "Cargo.lock",
        "package-lock.json",
        "yarn.lock",
        "pnpm-lock.yaml",
        "Pipfile.lock",
        "poetry.lock",
        "composer.lock",
    ];

    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => LOCKFILES.contains(&name),
        None => false,
    }
}
1077
/// Heuristically decides whether a file is machine-generated.
///
/// A file counts as generated when its name contains `.generated.` or `.g.`,
/// or when the first 1024 bytes contain `@generated` or `generated by`
/// (ASCII case-insensitive).
fn looks_generated(path: &Path, bytes: &[u8]) -> bool {
    let name_says_generated = match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.contains(".generated.") || name.contains(".g."),
        None => false,
    };
    if name_says_generated {
        return true;
    }

    let head = &bytes[..bytes.len().min(1024)];
    let sample = String::from_utf8_lossy(head).to_ascii_lowercase();
    ["@generated", "generated by"]
        .iter()
        .any(|marker| sample.contains(*marker))
}
1090
/// Heuristically decides whether a file is minified.
///
/// A file counts as minified when its name contains `.min.`, or when the
/// first 4096 bytes hold a line longer than 2000 bytes while less than 1% of
/// the sample is whitespace.
fn looks_minified(path: &Path, bytes: &[u8]) -> bool {
    let named_min = path
        .file_name()
        .and_then(|name| name.to_str())
        .map_or(false, |name| name.contains(".min."));
    if named_min {
        return true;
    }

    let head = &bytes[..bytes.len().min(4096)];
    let sample = String::from_utf8_lossy(head);

    let mut longest_line = 0;
    for line in sample.lines() {
        longest_line = longest_line.max(line.len());
    }
    if longest_line <= 2000 {
        return false;
    }

    let whitespace = sample.chars().filter(|ch| ch.is_whitespace()).count();
    whitespace * 100 < sample.len().max(1)
}
1105
/// Heuristically decides whether `bytes` is binary content.
///
/// Content starting with a UTF-8 or UTF-16 byte-order mark is always treated
/// as text. Otherwise the content is binary when a NUL byte occurs within the
/// first 8192 bytes.
fn is_binary(bytes: &[u8]) -> bool {
    const TEXT_BOMS: [&[u8]; 3] = [&[0xEF, 0xBB, 0xBF], &[0xFF, 0xFE], &[0xFE, 0xFF]];

    let has_text_bom = TEXT_BOMS.iter().any(|bom| bytes.starts_with(bom));
    if has_text_bom {
        return false;
    }

    bytes.iter().take(8192).any(|&byte| byte == 0)
}
1117
1118fn decode_bytes(bytes: &[u8]) -> std::result::Result<(String, String, Vec<String>), String> {
1119 if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
1120 let text = String::from_utf8(bytes[3..].to_vec()).map_err(|err| err.to_string())?;
1121 return Ok((text, "utf-8-bom".into(), vec![]));
1122 }
1123
1124 if bytes.starts_with(&[0xFF, 0xFE]) {
1125 let (cow, _, had_errors) = UTF_16LE.decode(&bytes[2..]);
1126 let mut warnings = Vec::new();
1127 if had_errors {
1128 warnings.push("utf-16le decode contained replacement characters".into());
1129 }
1130 return Ok((cow.into_owned(), "utf-16le".into(), warnings));
1131 }
1132
1133 if bytes.starts_with(&[0xFE, 0xFF]) {
1134 let (cow, _, had_errors) = UTF_16BE.decode(&bytes[2..]);
1135 let mut warnings = Vec::new();
1136 if had_errors {
1137 warnings.push("utf-16be decode contained replacement characters".into());
1138 }
1139 return Ok((cow.into_owned(), "utf-16be".into(), warnings));
1140 }
1141
1142 #[allow(clippy::option_if_let_else)]
1144 if let Ok(text) = String::from_utf8(bytes.to_vec()) {
1145 Ok((text, "utf-8".into(), vec![]))
1146 } else {
1147 let (cow, _, had_errors) = WINDOWS_1252.decode(bytes);
1148 let mut warnings = vec!["decoded using windows-1252 fallback".into()];
1149 if had_errors {
1150 warnings.push("fallback decode contained replacement characters".into());
1151 }
1152 Ok((cow.into_owned(), "windows-1252".into(), warnings))
1153 }
1154}
1155
1156fn compile_globset(patterns: &[String]) -> Result<Option<GlobSet>> {
1157 if patterns.is_empty() {
1158 return Ok(None);
1159 }
1160
1161 let mut builder = GlobSetBuilder::new();
1162 for pattern in patterns {
1163 builder
1164 .add(Glob::new(pattern).with_context(|| format!("invalid glob pattern: {pattern}"))?);
1165 }
1166 Ok(Some(
1167 builder.build().context("failed to compile glob filters")?,
1168 ))
1169}
1170
1171fn parse_enabled_languages(enabled: &[String]) -> Result<Option<BTreeSet<Language>>> {
1172 if enabled.is_empty() {
1173 return Ok(None);
1174 }
1175
1176 let supported = supported_languages();
1177 let mut set = BTreeSet::new();
1178 for name in enabled {
1179 let language = Language::from_name(name)
1180 .with_context(|| format!("unsupported language in config: {name}"))?;
1181 if !supported.contains(&language) {
1182 anyhow::bail!("language {name} is not supported in this build");
1183 }
1184 set.insert(language);
1185 }
1186 Ok(Some(set))
1187}
1188
1189pub fn write_json(run: &AnalysisRun, output_path: &Path) -> Result<()> {
1193 let json = serde_json::to_string_pretty(run).context("failed to serialize analysis run")?;
1194 fs::write(output_path, json)
1195 .with_context(|| format!("failed to write JSON output to {}", output_path.display()))
1196}
1197
1198pub fn read_json(path: &Path) -> Result<AnalysisRun> {
1202 let contents = fs::read_to_string(path)
1203 .with_context(|| format!("failed to read result file {}", path.display()))?;
1204 serde_json::from_str(&contents)
1205 .with_context(|| format!("failed to parse JSON result {}", path.display()))
1206}
1207
#[cfg(test)]
mod tests {
    use super::*;

    /// Under `CodeOnly`, mixed lines are added to code; docstrings are
    /// counted as comments because the docstring flag is set.
    #[test]
    fn effective_counts_respect_code_only_policy() {
        let raw = RawLineCounts {
            code_only_lines: 2,
            single_comment_only_lines: 1,
            mixed_code_single_comment_lines: 3,
            docstring_comment_lines: 2,
            ..RawLineCounts::default()
        };
        let counts = compute_effective_counts(&raw, MixedLinePolicy::CodeOnly, true, true);
        assert_eq!(counts.code_lines, 5);
        assert_eq!(counts.comment_lines, 3);
    }

    /// Under `SeparateMixedCategory`, mixed lines go to their own bucket and
    /// leave the code and comment totals untouched.
    #[test]
    fn effective_counts_can_separate_mixed() {
        let raw = RawLineCounts {
            mixed_code_single_comment_lines: 2,
            mixed_code_multi_comment_lines: 1,
            ..RawLineCounts::default()
        };
        let counts =
            compute_effective_counts(&raw, MixedLinePolicy::SeparateMixedCategory, true, true);
        assert_eq!(counts.mixed_lines_separate, 3);
        assert_eq!(counts.code_lines, 0);
        assert_eq!(counts.comment_lines, 0);
    }

    /// Bytes that are not valid UTF-8 fall back to windows-1252.
    #[test]
    fn windows_1252_fallback_decodes() {
        // 0x96 is not valid UTF-8 on its own; in windows-1252 it is EN DASH.
        let bytes = vec![0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x96, 0x57];
        let (text, encoding, warnings) = decode_bytes(&bytes).unwrap();
        assert_eq!(encoding, "windows-1252");
        // Written as an escape so the expectation survives re-encoding of
        // this source file.
        assert!(text.contains('\u{2013}'));
        assert_eq!(text, "Hello\u{2013}W");
        assert!(!warnings.is_empty());
    }
}