1#![allow(clippy::multiple_crate_versions)]
4
5pub mod baseline;
6pub mod coverage;
7pub mod delta;
8pub mod history;
9pub use baseline::{check_against_baseline, resolve_baselines_path, BaselineEntry, BaselineStore};
10pub use coverage::{aggregate_line_coverage, lookup_coverage, parse_lcov, FileCoverage};
11pub use delta::{compute_delta, FileChangeStatus, FileDelta, ScanComparison, SummaryDelta};
12pub use history::{RegistryEntry, ScanRegistry, ScanSummarySnapshot, WatchedDirsStore};
13
14use std::collections::{BTreeMap, BTreeSet, HashSet};
15use std::fs;
16use std::path::{Path, PathBuf};
17use std::sync::atomic::{AtomicBool, Ordering};
18
19use anyhow::{Context, Result};
20use chrono::{DateTime, Utc};
21use encoding_rs::{UTF_16BE, UTF_16LE, WINDOWS_1252};
22use globset::{Glob, GlobSet, GlobSetBuilder};
23use ignore::WalkBuilder;
24use serde::{Deserialize, Serialize};
25use uuid::Uuid;
26
27use sloc_config::{
28 AppConfig, BinaryFileBehavior, BlankInBlockCommentPolicy, ContinuationLinePolicy,
29 FailureBehavior, MixedLinePolicy,
30};
31use sloc_languages::{
32 analyze_text, detect_language, supported_languages, AnalysisOptions, Language, ParseMode,
33 RawLineCounts,
34};
35
/// Upper bound on the number of worker threads used for file analysis.
const MAX_ANALYSIS_THREADS: usize = 16;
/// Worker-thread count used when the available parallelism cannot be queried.
const DEFAULT_ANALYSIS_THREADS: usize = 4;
/// Sample size in bytes for generated-file detection.
/// NOTE(review): the consumer is outside this view — confirm the exact use.
const GENERATED_SAMPLE_BYTES: usize = 1024;
/// Sample size in bytes for minified-file detection.
/// NOTE(review): the consumer is outside this view — confirm the exact use.
const MINIFIED_SAMPLE_BYTES: usize = 4 * 1024;
/// Line-length threshold for minified-file detection.
/// NOTE(review): presumably a per-line byte/char limit — confirm against the detector.
const MINIFIED_LINE_THRESHOLD: usize = 2 * 1000;
/// Sample size in bytes for binary-content detection.
/// NOTE(review): the consumer is outside this view — confirm the exact use.
const BINARY_SAMPLE_BYTES: usize = 8 * 1024;
50
51enum MetadataPolicyOutcome {
53 Skip(Box<FileRecord>),
55 Exclude,
57 Continue,
59}
60
61#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
62#[serde(rename_all = "snake_case")]
63pub enum FileStatus {
64 AnalyzedExact,
65 AnalyzedBestEffort,
66 SkippedBinary,
67 SkippedDecodeError,
68 SkippedUnsupported,
69 SkippedByPolicy,
70 ErrorInternal,
71}
72
73#[derive(Debug, Clone, Serialize, Deserialize, Default)]
74pub struct EffectiveCounts {
75 pub code_lines: u64,
76 pub comment_lines: u64,
77 pub blank_lines: u64,
78 pub mixed_lines_separate: u64,
79}
80
81#[derive(Debug, Clone, Serialize, Deserialize)]
82pub struct ToolMetadata {
83 pub name: String,
84 pub version: String,
85 pub run_id: String,
86 pub timestamp_utc: DateTime<Utc>,
87}
88
89#[derive(Debug, Clone, Serialize, Deserialize)]
90pub struct EnvironmentMetadata {
91 pub operating_system: String,
92 pub architecture: String,
93 pub runtime_mode: String,
94 pub initiator_username: String,
95 pub initiator_hostname: String,
96}
97
98#[derive(Debug, Clone, Serialize, Deserialize, Default)]
99pub struct SummaryTotals {
100 pub files_considered: u64,
101 pub files_analyzed: u64,
102 pub files_skipped: u64,
103 pub total_physical_lines: u64,
104 pub code_lines: u64,
105 pub comment_lines: u64,
106 pub blank_lines: u64,
107 pub mixed_lines_separate: u64,
108 #[serde(default)]
109 pub functions: u64,
110 #[serde(default)]
111 pub classes: u64,
112 #[serde(default)]
113 pub variables: u64,
114 #[serde(default)]
115 pub imports: u64,
116 #[serde(default)]
117 pub test_count: u64,
118 #[serde(default)]
120 pub test_assertion_count: u64,
121 #[serde(default)]
123 pub test_suite_count: u64,
124 #[serde(default)]
126 pub coverage_lines_found: u64,
127 #[serde(default)]
128 pub coverage_lines_hit: u64,
129 #[serde(default)]
130 pub coverage_functions_found: u64,
131 #[serde(default)]
132 pub coverage_functions_hit: u64,
133 #[serde(default)]
134 pub coverage_branches_found: u64,
135 #[serde(default)]
136 pub coverage_branches_hit: u64,
137}
138
139#[derive(Debug, Clone, Serialize, Deserialize)]
140pub struct LanguageSummary {
141 pub language: Language,
142 pub files: u64,
143 pub total_physical_lines: u64,
144 pub code_lines: u64,
145 pub comment_lines: u64,
146 pub blank_lines: u64,
147 pub mixed_lines_separate: u64,
148 #[serde(default)]
149 pub functions: u64,
150 #[serde(default)]
151 pub classes: u64,
152 #[serde(default)]
153 pub variables: u64,
154 #[serde(default)]
155 pub imports: u64,
156 #[serde(default)]
157 pub test_count: u64,
158 #[serde(default)]
159 pub test_assertion_count: u64,
160 #[serde(default)]
161 pub test_suite_count: u64,
162 #[serde(default)]
163 pub coverage_lines_found: u64,
164 #[serde(default)]
165 pub coverage_lines_hit: u64,
166 #[serde(default)]
167 pub coverage_functions_found: u64,
168 #[serde(default)]
169 pub coverage_functions_hit: u64,
170 #[serde(default)]
171 pub coverage_branches_found: u64,
172 #[serde(default)]
173 pub coverage_branches_hit: u64,
174}
175
176#[derive(Debug, Clone, Serialize, Deserialize)]
177pub struct FileRecord {
178 pub path: String,
179 pub relative_path: String,
180 pub language: Option<Language>,
181 pub size_bytes: u64,
182 pub detected_encoding: Option<String>,
183 pub raw_line_categories: RawLineCounts,
184 pub effective_counts: EffectiveCounts,
185 pub status: FileStatus,
186 pub warnings: Vec<String>,
187 pub generated: bool,
188 pub minified: bool,
189 pub vendor: bool,
190 pub parse_mode: Option<ParseMode>,
191 #[serde(skip_serializing_if = "Option::is_none")]
192 pub submodule: Option<String>,
193 #[serde(default, skip_serializing_if = "Option::is_none")]
195 pub coverage: Option<FileCoverage>,
196}
197
198#[derive(Debug, Clone, Serialize, Deserialize)]
200pub struct SubmoduleSummary {
201 pub name: String,
202 pub relative_path: String,
203 pub files_analyzed: u64,
204 pub total_physical_lines: u64,
205 pub code_lines: u64,
206 pub comment_lines: u64,
207 pub blank_lines: u64,
208 pub language_summaries: Vec<LanguageSummary>,
209}
210
211#[derive(Debug, Clone, Serialize, Deserialize)]
212pub struct AnalysisRun {
213 pub tool: ToolMetadata,
214 pub environment: EnvironmentMetadata,
215 pub effective_configuration: AppConfig,
216 pub input_roots: Vec<String>,
217 pub summary_totals: SummaryTotals,
218 pub totals_by_language: Vec<LanguageSummary>,
219 pub per_file_records: Vec<FileRecord>,
220 pub skipped_file_records: Vec<FileRecord>,
221 pub warnings: Vec<String>,
222 #[serde(default, skip_serializing_if = "Vec::is_empty")]
224 pub submodule_summaries: Vec<SubmoduleSummary>,
225 #[serde(default, skip_serializing_if = "Option::is_none")]
227 pub git_commit_short: Option<String>,
228 #[serde(default, skip_serializing_if = "Option::is_none")]
230 pub git_commit_long: Option<String>,
231 #[serde(default, skip_serializing_if = "Option::is_none")]
233 pub git_branch: Option<String>,
234 #[serde(default, skip_serializing_if = "Option::is_none")]
236 pub git_commit_author: Option<String>,
237 #[serde(default, skip_serializing_if = "Option::is_none")]
239 pub git_tags: Option<String>,
240 #[serde(default, skip_serializing_if = "Option::is_none")]
242 pub git_nearest_tag: Option<String>,
243 #[serde(default, skip_serializing_if = "Option::is_none")]
245 pub git_commit_date: Option<String>,
246}
247
/// Runs `git` with `args` inside `dir` and returns its trimmed stdout.
///
/// Yields `None` when the process cannot be started, exits unsuccessfully,
/// prints non-UTF-8 output, or prints nothing but whitespace.
fn run_git_in(dir: &Path, args: &[&str]) -> Option<String> {
    let output = std::process::Command::new("git")
        .args(args)
        .current_dir(dir)
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }

    let stdout = String::from_utf8(output.stdout).ok()?;
    let trimmed = stdout.trim();
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}
259
/// Git details gathered for a run; every field is `None` when unavailable.
#[derive(Default)]
struct GitInfo {
    /// Abbreviated `HEAD` commit hash.
    commit_short: Option<String>,
    /// Full `HEAD` commit hash.
    commit_long: Option<String>,
    /// Currently checked-out branch.
    branch: Option<String>,
    /// Author name of the latest commit.
    author: Option<String>,
    /// Tags pointing at `HEAD`, joined with ", ".
    tags: Option<String>,
    /// Nearest tag reachable from `HEAD`.
    nearest_tag: Option<String>,
    /// Author date of the latest commit (`%aI` format).
    commit_date: Option<String>,
}
270
271fn detect_git_for_run(project_path: &Path) -> GitInfo {
272 GitInfo {
273 commit_short: run_git_in(project_path, &["rev-parse", "--short", "HEAD"]),
274 commit_long: run_git_in(project_path, &["rev-parse", "HEAD"]),
275 branch: run_git_in(project_path, &["branch", "--show-current"]),
276 author: run_git_in(project_path, &["log", "--format=%an", "-1"]),
277 tags: run_git_in(project_path, &["tag", "--points-at", "HEAD"]).map(|t| {
278 t.lines()
279 .filter(|l| !l.is_empty())
280 .collect::<Vec<_>>()
281 .join(", ")
282 }),
283 nearest_tag: run_git_in(project_path, &["describe", "--tags", "--abbrev=0", "HEAD"]),
284 commit_date: run_git_in(project_path, &["log", "--format=%aI", "-1"]),
285 }
286}
287
/// Returns the name of the user running the process.
///
/// Reads `USERNAME` first, then `USER`. A variable that is unset, not valid
/// Unicode, or blank is treated as missing so the next source is tried;
/// `"unknown"` is returned when neither yields a value.
fn get_current_username() -> String {
    ["USERNAME", "USER"]
        .iter()
        .filter_map(|key| std::env::var(key).ok())
        // An exported-but-empty variable must not shadow a usable fallback.
        .find(|value| !value.trim().is_empty())
        .unwrap_or_else(|| "unknown".to_string())
}
293
/// Returns the host name of the machine running the process.
///
/// Sources are tried in order: the `COMPUTERNAME` variable, the `HOSTNAME`
/// variable, then the contents of `/etc/hostname`. Values are trimmed and a
/// blank value is treated as missing; `"unknown"` is returned when every
/// source fails.
fn get_hostname() -> String {
    /// Trims `value`, mapping a blank result to `None`.
    fn non_blank(value: String) -> Option<String> {
        let trimmed = value.trim();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed.to_string())
        }
    }

    ["COMPUTERNAME", "HOSTNAME"]
        .iter()
        .find_map(|key| std::env::var(key).ok().and_then(non_blank))
        .or_else(|| {
            std::fs::read_to_string("/etc/hostname")
                .ok()
                .and_then(non_blank)
        })
        .unwrap_or_else(|| "unknown".to_string())
}
300
301#[allow(clippy::too_many_arguments)]
303fn walk_root(
304 root: &Path,
305 config: &AppConfig,
306 include_globs: Option<&GlobSet>,
307 exclude_globs: Option<&GlobSet>,
308 enabled_languages: Option<&BTreeSet<Language>>,
309 seen_paths: &mut HashSet<PathBuf>,
310 analyzed: &mut Vec<FileRecord>,
311 skipped: &mut Vec<FileRecord>,
312 warnings: &mut Vec<String>,
313 cancel: Option<&AtomicBool>,
314) -> Result<()> {
315 let mut builder = WalkBuilder::new(root);
316 builder
317 .follow_links(config.discovery.follow_symlinks)
318 .hidden(config.discovery.ignore_hidden_files)
319 .ignore(config.discovery.honor_ignore_files)
320 .parents(config.discovery.honor_ignore_files)
321 .git_ignore(config.discovery.honor_ignore_files)
322 .git_global(config.discovery.honor_ignore_files)
323 .git_exclude(config.discovery.honor_ignore_files);
324
325 let paths = collect_walk_paths(&builder, seen_paths, warnings);
326 if paths.is_empty() {
327 return Ok(());
328 }
329
330 let chunk_results = run_parallel_analysis(
331 &paths,
332 root,
333 config,
334 include_globs,
335 exclude_globs,
336 enabled_languages,
337 cancel,
338 )?;
339 merge_chunk_results(chunk_results, analyzed, skipped, warnings)
340}
341
342fn collect_walk_paths(
343 builder: &WalkBuilder,
344 seen_paths: &mut HashSet<PathBuf>,
345 warnings: &mut Vec<String>,
346) -> Vec<PathBuf> {
347 let mut paths = Vec::new();
348 for entry in builder.build() {
349 let entry = match entry {
350 Ok(e) => e,
351 Err(err) => {
352 warnings.push(format!("discovery warning: {err}"));
353 continue;
354 }
355 };
356 let path = entry.into_path();
357 if path.is_dir() || !seen_paths.insert(path.clone()) {
358 continue;
359 }
360 paths.push(path);
361 }
362 paths
363}
364
365#[allow(clippy::too_many_arguments)]
366fn run_parallel_analysis(
367 paths: &[PathBuf],
368 root: &Path,
369 config: &AppConfig,
370 include_globs: Option<&GlobSet>,
371 exclude_globs: Option<&GlobSet>,
372 enabled_languages: Option<&BTreeSet<Language>>,
373 cancel: Option<&AtomicBool>,
374) -> Result<Vec<Vec<Result<Option<FileRecord>>>>> {
375 let thread_count = std::thread::available_parallelism().map_or(DEFAULT_ANALYSIS_THREADS, |n| {
376 n.get().min(MAX_ANALYSIS_THREADS)
377 });
378 let chunk_size = paths.len().div_ceil(thread_count);
379 std::thread::scope(|s| -> Result<Vec<Vec<Result<Option<FileRecord>>>>> {
380 paths
381 .chunks(chunk_size)
382 .map(|chunk| {
383 s.spawn(move || -> Vec<Result<Option<FileRecord>>> {
384 let mut results = Vec::with_capacity(chunk.len());
385 for path in chunk {
386 if cancel.is_some_and(|c| c.load(Ordering::Relaxed)) {
387 results.push(Err(anyhow::anyhow!("analysis cancelled")));
388 break;
389 }
390 results.push(analyze_candidate_file(
391 path,
392 root,
393 config,
394 include_globs,
395 exclude_globs,
396 enabled_languages,
397 ));
398 }
399 results
400 })
401 })
402 .map(|h| {
403 h.join()
404 .map_err(|_| anyhow::anyhow!("analysis thread panicked"))
405 })
406 .collect()
407 })
408}
409
410fn merge_chunk_results(
411 chunk_results: Vec<Vec<Result<Option<FileRecord>>>>,
412 analyzed: &mut Vec<FileRecord>,
413 skipped: &mut Vec<FileRecord>,
414 warnings: &mut Vec<String>,
415) -> Result<()> {
416 for chunk in chunk_results {
417 for result in chunk {
418 if let Some(record) = result? {
419 push_record(record, analyzed, skipped, warnings);
420 }
421 }
422 }
423 Ok(())
424}
425
426fn process_submodules(config: &AppConfig, analyzed: &mut [FileRecord]) -> Vec<SubmoduleSummary> {
428 let root = config.discovery.root_paths[0]
429 .canonicalize()
430 .unwrap_or_else(|_| config.discovery.root_paths[0].clone());
431 let submodules = detect_submodules(&root);
432 if submodules.is_empty() {
433 return Vec::new();
434 }
435
436 for file in analyzed.iter_mut() {
437 for (name, sub_path) in &submodules {
438 let prefix = sub_path.to_string_lossy().replace('\\', "/");
439 let rel = &file.relative_path;
440 if rel == &prefix || rel.starts_with(&format!("{prefix}/")) {
441 file.submodule = Some(name.clone());
442 break;
443 }
444 }
445 }
446
447 build_submodule_summaries(analyzed, &submodules)
448}
449
450fn assemble_run(
452 config: &AppConfig,
453 runtime_mode: &str,
454 analyzed: Vec<FileRecord>,
455 skipped: Vec<FileRecord>,
456 warnings: Vec<String>,
457 submodule_summaries: Vec<SubmoduleSummary>,
458) -> AnalysisRun {
459 let summary = build_summary(&analyzed, &skipped);
460 let language_summaries = build_language_summaries(&analyzed);
461
462 let first_root = config
463 .discovery
464 .root_paths
465 .first()
466 .map(|p| p.canonicalize().unwrap_or_else(|_| p.clone()));
467 let git = first_root
468 .as_deref()
469 .map(detect_git_for_run)
470 .unwrap_or_default();
471
472 let now = Utc::now();
473 let run_id = {
474 let uuid_suffix = Uuid::new_v4().simple().to_string();
475 format!("{}-{}", now.format("%Y%m%d-%H%M"), uuid_suffix)
476 };
477
478 AnalysisRun {
479 tool: ToolMetadata {
480 name: "sloc".into(),
481 version: env!("CARGO_PKG_VERSION").into(),
482 run_id,
483 timestamp_utc: now,
484 },
485 environment: EnvironmentMetadata {
486 operating_system: std::env::consts::OS.into(),
487 architecture: std::env::consts::ARCH.into(),
488 runtime_mode: runtime_mode.into(),
489 initiator_username: get_current_username(),
490 initiator_hostname: get_hostname(),
491 },
492 effective_configuration: config.clone(),
493 input_roots: config
494 .discovery
495 .root_paths
496 .iter()
497 .map(|p| path_to_string(p))
498 .collect(),
499 summary_totals: summary,
500 totals_by_language: language_summaries,
501 per_file_records: analyzed,
502 skipped_file_records: skipped,
503 warnings,
504 submodule_summaries,
505 git_commit_short: git.commit_short,
506 git_commit_long: git.commit_long,
507 git_branch: git.branch,
508 git_commit_author: git.author,
509 git_tags: git.tags,
510 git_nearest_tag: git.nearest_tag,
511 git_commit_date: git.commit_date,
512 }
513}
514
515#[allow(clippy::too_many_lines)]
520pub fn analyze(
521 config: &AppConfig,
522 runtime_mode: &str,
523 cancel: Option<&AtomicBool>,
524) -> Result<AnalysisRun> {
525 config.validate()?;
526
527 if config.discovery.root_paths.is_empty() {
528 anyhow::bail!("no input paths were provided");
529 }
530
531 let include_globs = compile_globset(&config.discovery.include_globs)?;
532 let exclude_globs = compile_globset(&config.discovery.exclude_globs)?;
533 let enabled_languages = parse_enabled_languages(&config.analysis.enabled_languages)?;
534
535 let mut analyzed = Vec::new();
536 let mut skipped = Vec::new();
537 let mut warnings = Vec::new();
538 let mut seen_paths = HashSet::new();
539
540 for root in &config.discovery.root_paths {
541 if cancel.is_some_and(|c| c.load(Ordering::Relaxed)) {
542 anyhow::bail!("analysis cancelled");
543 }
544
545 let root = root.canonicalize().unwrap_or_else(|_| root.clone());
546
547 if root.is_file() {
548 if let Some(record) = analyze_candidate_file(
549 &root,
550 root.parent().unwrap_or_else(|| Path::new(".")),
551 config,
552 include_globs.as_ref(),
553 exclude_globs.as_ref(),
554 enabled_languages.as_ref(),
555 )? {
556 push_record(record, &mut analyzed, &mut skipped, &mut warnings);
557 }
558 continue;
559 }
560
561 walk_root(
562 &root,
563 config,
564 include_globs.as_ref(),
565 exclude_globs.as_ref(),
566 enabled_languages.as_ref(),
567 &mut seen_paths,
568 &mut analyzed,
569 &mut skipped,
570 &mut warnings,
571 cancel,
572 )?;
573 }
574
575 analyzed.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
576 skipped.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
577
578 let submodule_summaries = if config.discovery.submodule_breakdown {
580 process_submodules(config, &mut analyzed)
581 } else {
582 Vec::new()
583 };
584
585 attach_coverage(config, &mut analyzed, &mut warnings);
586
587 Ok(assemble_run(
588 config,
589 runtime_mode,
590 analyzed,
591 skipped,
592 warnings,
593 submodule_summaries,
594 ))
595}
596
597fn attach_coverage(config: &AppConfig, analyzed: &mut [FileRecord], warnings: &mut Vec<String>) {
598 let Some(cov_path) = coverage::resolve_coverage_file(config.analysis.coverage_file.as_deref())
599 else {
600 return;
601 };
602 match fs::read_to_string(&cov_path) {
603 Ok(content) => {
604 let cov_map = coverage::parse_coverage_auto(&cov_path, &content);
605 for record in analyzed.iter_mut() {
606 record.coverage =
607 coverage::lookup_coverage(&cov_map, &record.relative_path).cloned();
608 }
609 }
610 Err(e) => {
611 warnings.push(format!(
612 "coverage file '{}' could not be read: {e}",
613 cov_path.display()
614 ));
615 }
616 }
617}
618
619fn push_record(
620 record: FileRecord,
621 analyzed: &mut Vec<FileRecord>,
622 skipped: &mut Vec<FileRecord>,
623 warnings: &mut Vec<String>,
624) {
625 warnings.extend(
626 record
627 .warnings
628 .iter()
629 .map(|warning| format!("{}: {warning}", record.relative_path)),
630 );
631
632 match record.status {
633 FileStatus::AnalyzedExact | FileStatus::AnalyzedBestEffort => analyzed.push(record),
634 _ => skipped.push(record),
635 }
636}
637
638#[inline]
640fn skip_with_reason(
641 path: &Path,
642 root: &Path,
643 size: u64,
644 reason: impl Into<String>,
645) -> MetadataPolicyOutcome {
646 MetadataPolicyOutcome::Skip(Box::new(skipped_record(
647 path,
648 root,
649 size,
650 FileStatus::SkippedByPolicy,
651 vec![reason.into()],
652 )))
653}
654
655#[allow(clippy::too_many_arguments)]
659fn check_metadata_policy(
660 path: &Path,
661 root: &Path,
662 relative_path: &str,
663 metadata: &fs::Metadata,
664 config: &AppConfig,
665 include_globs: Option<&GlobSet>,
666 exclude_globs: Option<&GlobSet>,
667) -> MetadataPolicyOutcome {
668 let size = metadata.len();
669
670 if metadata.file_type().is_symlink() && !config.discovery.follow_symlinks {
671 return skip_with_reason(path, root, size, "symlink skipped by policy");
672 }
673 if file_name_eq(path, ".gitignore") {
674 return skip_with_reason(path, root, size, ".gitignore is always excluded");
675 }
676 if is_excluded_dir_path(path, &config.discovery.excluded_directories) {
677 return skip_with_reason(path, root, size, "path matched excluded directory setting");
678 }
679 if size > config.discovery.max_file_size_bytes {
680 return skip_with_reason(
681 path,
682 root,
683 size,
684 format!(
685 "file exceeded max_file_size_bytes ({})",
686 config.discovery.max_file_size_bytes
687 ),
688 );
689 }
690 if let Some(globs) = include_globs {
691 if !globs.is_match(Path::new(relative_path)) && !globs.is_match(path) {
692 return MetadataPolicyOutcome::Exclude;
693 }
694 }
695 if let Some(globs) = exclude_globs {
696 if globs.is_match(Path::new(relative_path)) || globs.is_match(path) {
697 return skip_with_reason(path, root, size, "path matched exclude glob");
698 }
699 }
700 if is_known_lockfile(path) && !config.analysis.include_lockfiles {
701 return skip_with_reason(path, root, size, "lockfile skipped by default policy");
702 }
703
704 MetadataPolicyOutcome::Continue
705}
706
707struct ContentPolicyResult {
708 vendor: bool,
709 generated: bool,
710 minified: bool,
711 skip_record: Option<FileRecord>,
712}
713
714fn check_content_policy(
717 path: &Path,
718 root: &Path,
719 size_bytes: u64,
720 bytes: &[u8],
721 config: &AppConfig,
722) -> ContentPolicyResult {
723 let vendor = is_vendor_path(path);
724 if vendor && config.analysis.vendor_directory_detection {
725 return ContentPolicyResult {
726 vendor,
727 generated: false,
728 minified: false,
729 skip_record: Some(skipped_record(
730 path,
731 root,
732 size_bytes,
733 FileStatus::SkippedByPolicy,
734 vec!["vendor file skipped by policy".into()],
735 )),
736 };
737 }
738
739 let generated = config.analysis.generated_file_detection && looks_generated(path, bytes);
740 if generated {
741 return ContentPolicyResult {
742 vendor,
743 generated,
744 minified: false,
745 skip_record: Some(skipped_record(
746 path,
747 root,
748 size_bytes,
749 FileStatus::SkippedByPolicy,
750 vec!["generated file skipped by policy".into()],
751 )),
752 };
753 }
754
755 let minified = config.analysis.minified_file_detection && looks_minified(path, bytes);
756 if minified {
757 return ContentPolicyResult {
758 vendor,
759 generated,
760 minified,
761 skip_record: Some(skipped_record(
762 path,
763 root,
764 size_bytes,
765 FileStatus::SkippedByPolicy,
766 vec!["minified file skipped by policy".into()],
767 )),
768 };
769 }
770
771 ContentPolicyResult {
772 vendor,
773 generated,
774 minified,
775 skip_record: None,
776 }
777}
778
779fn decode_file_contents(
781 path: &Path,
782 root: &Path,
783 size_bytes: u64,
784 bytes: &[u8],
785 config: &AppConfig,
786) -> Result<Option<(String, String, Vec<String>)>> {
787 if is_binary(bytes) {
788 return match config.analysis.binary_file_behavior {
789 BinaryFileBehavior::Skip => Ok(None),
790 BinaryFileBehavior::Fail => {
791 anyhow::bail!("binary file encountered: {}", path.display())
792 }
793 };
794 }
795
796 match decode_bytes(bytes) {
797 Ok(result) => Ok(Some(result)),
798 Err(err) => match config.analysis.decode_failure_behavior {
799 FailureBehavior::WarnSkip => {
800 let _ = (path, root, size_bytes); Err(anyhow::anyhow!("__decode_warn__: {err}"))
805 }
806 FailureBehavior::Fail => {
807 anyhow::bail!("decode failure for {}: {err}", path.display())
808 }
809 },
810 }
811}
812
813#[allow(clippy::too_many_lines)]
814fn analyze_candidate_file(
815 path: &Path,
816 root: &Path,
817 config: &AppConfig,
818 include_globs: Option<&GlobSet>,
819 exclude_globs: Option<&GlobSet>,
820 enabled_languages: Option<&BTreeSet<Language>>,
821) -> Result<Option<FileRecord>> {
822 let metadata = match fs::symlink_metadata(path) {
823 Ok(metadata) => metadata,
824 Err(err) => {
825 return Ok(Some(skipped_record(
826 path,
827 root,
828 0,
829 FileStatus::ErrorInternal,
830 vec![format!("failed to read metadata: {err}")],
831 )));
832 }
833 };
834
835 let relative_path = relative_path_string(path, root);
836
837 match check_metadata_policy(
839 path,
840 root,
841 &relative_path,
842 &metadata,
843 config,
844 include_globs,
845 exclude_globs,
846 ) {
847 MetadataPolicyOutcome::Skip(record) => return Ok(Some(*record)),
848 MetadataPolicyOutcome::Exclude => return Ok(None),
849 MetadataPolicyOutcome::Continue => {}
850 }
851
852 let bytes = match fs::read(path) {
853 Ok(bytes) => bytes,
854 Err(err) => {
855 return Ok(Some(skipped_record(
856 path,
857 root,
858 metadata.len(),
859 FileStatus::ErrorInternal,
860 vec![format!("failed to read file: {err}")],
861 )));
862 }
863 };
864
865 let content_policy = check_content_policy(path, root, metadata.len(), &bytes, config);
867 if let Some(record) = content_policy.skip_record {
868 return Ok(Some(record));
869 }
870 let (vendor, generated, minified) = (
871 content_policy.vendor,
872 content_policy.generated,
873 content_policy.minified,
874 );
875
876 let (text, encoding, decode_warnings) =
878 match decode_file_contents(path, root, metadata.len(), &bytes, config) {
879 Ok(Some(result)) => result,
880 Ok(None) => {
881 return Ok(Some(skipped_record(
882 path,
883 root,
884 metadata.len(),
885 FileStatus::SkippedBinary,
886 vec!["binary file skipped by default".into()],
887 )));
888 }
889 Err(err) => {
890 let msg = err.to_string();
891 if let Some(warn_msg) = msg.strip_prefix("__decode_warn__: ") {
892 return Ok(Some(skipped_record(
893 path,
894 root,
895 metadata.len(),
896 FileStatus::SkippedDecodeError,
897 vec![warn_msg.to_string()],
898 )));
899 }
900 return Err(err);
901 }
902 };
903
904 let first_line = text.lines().next();
905 let language = detect_language(
906 path,
907 first_line,
908 &config.analysis.extension_overrides,
909 config.analysis.shebang_detection,
910 );
911
912 let Some(language) = language else {
913 return Ok(Some(skipped_record(
914 path,
915 root,
916 metadata.len(),
917 FileStatus::SkippedUnsupported,
918 vec!["unsupported or undetected language".into()],
919 )));
920 };
921
922 if let Some(enabled) = enabled_languages {
923 if !enabled.contains(&language) {
924 return Ok(Some(skipped_record(
925 path,
926 root,
927 metadata.len(),
928 FileStatus::SkippedByPolicy,
929 vec![format!(
930 "language {} disabled by configuration",
931 language.display_name()
932 )],
933 )));
934 }
935 }
936
937 let ieee_opts = AnalysisOptions {
938 blank_in_block_comment_as_comment: config.analysis.blank_in_block_comment_policy
939 == BlankInBlockCommentPolicy::CountAsComment,
940 collapse_continuation_lines: config.analysis.continuation_line_policy
941 == ContinuationLinePolicy::CollapseToLogical,
942 };
943 let analysis = analyze_text(language, &text, ieee_opts);
944 let effective_counts = compute_effective_counts(
945 &analysis.raw,
946 config.analysis.mixed_line_policy,
947 config.analysis.python_docstrings_as_comments,
948 config.analysis.count_compiler_directives,
949 );
950
951 let mut warnings = decode_warnings;
952 warnings.extend(analysis.warnings.clone());
953
954 Ok(Some(FileRecord {
955 path: path_to_string(path),
956 relative_path,
957 language: Some(language),
958 size_bytes: metadata.len(),
959 detected_encoding: Some(encoding),
960 raw_line_categories: analysis.raw,
961 effective_counts,
962 status: match analysis.parse_mode {
963 ParseMode::Lexical | ParseMode::TreeSitter => FileStatus::AnalyzedExact,
964 ParseMode::LexicalBestEffort => FileStatus::AnalyzedBestEffort,
965 },
966 warnings,
967 generated,
968 minified,
969 vendor,
970 parse_mode: Some(analysis.parse_mode),
971 submodule: None,
972 coverage: None,
973 }))
974}
975
976const fn compute_effective_counts(
977 raw: &RawLineCounts,
978 mixed_line_policy: MixedLinePolicy,
979 python_docstrings_as_comments: bool,
980 count_compiler_directives: bool,
981) -> EffectiveCounts {
982 let mut effective = EffectiveCounts {
983 code_lines: raw.code_only_lines,
984 comment_lines: raw.single_comment_only_lines + raw.multi_comment_only_lines,
985 blank_lines: raw.blank_only_lines,
986 mixed_lines_separate: 0,
987 };
988
989 if python_docstrings_as_comments {
990 effective.comment_lines += raw.docstring_comment_lines;
991 } else {
992 effective.code_lines += raw.docstring_comment_lines;
993 }
994
995 let mixed_total = raw.mixed_code_single_comment_lines + raw.mixed_code_multi_comment_lines;
996 match mixed_line_policy {
997 MixedLinePolicy::CodeOnly => effective.code_lines += mixed_total,
998 MixedLinePolicy::CodeAndComment => {
999 effective.code_lines += mixed_total;
1000 effective.comment_lines += mixed_total;
1001 }
1002 MixedLinePolicy::CommentOnly => effective.comment_lines += mixed_total,
1003 MixedLinePolicy::SeparateMixedCategory => effective.mixed_lines_separate += mixed_total,
1004 }
1005
1006 if !count_compiler_directives {
1009 effective.code_lines = effective
1010 .code_lines
1011 .saturating_sub(raw.compiler_directive_lines);
1012 }
1013
1014 effective
1015}
1016
1017fn build_summary(analyzed: &[FileRecord], skipped: &[FileRecord]) -> SummaryTotals {
1018 let mut summary = SummaryTotals {
1019 files_considered: (analyzed.len() + skipped.len()) as u64,
1020 files_analyzed: analyzed.len() as u64,
1021 files_skipped: skipped.len() as u64,
1022 ..Default::default()
1023 };
1024
1025 for record in analyzed {
1026 summary.total_physical_lines += record.raw_line_categories.total_physical_lines;
1027 summary.code_lines += record.effective_counts.code_lines;
1028 summary.comment_lines += record.effective_counts.comment_lines;
1029 summary.blank_lines += record.effective_counts.blank_lines;
1030 summary.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
1031 summary.functions += record.raw_line_categories.functions;
1032 summary.classes += record.raw_line_categories.classes;
1033 summary.variables += record.raw_line_categories.variables;
1034 summary.imports += record.raw_line_categories.imports;
1035 summary.test_count += record.raw_line_categories.test_count;
1036 summary.test_assertion_count += record.raw_line_categories.test_assertion_count;
1037 summary.test_suite_count += record.raw_line_categories.test_suite_count;
1038 if let Some(cov) = &record.coverage {
1039 summary.coverage_lines_found += u64::from(cov.lines_found);
1040 summary.coverage_lines_hit += u64::from(cov.lines_hit);
1041 summary.coverage_functions_found += u64::from(cov.functions_found);
1042 summary.coverage_functions_hit += u64::from(cov.functions_hit);
1043 summary.coverage_branches_found += u64::from(cov.branches_found);
1044 summary.coverage_branches_hit += u64::from(cov.branches_hit);
1045 }
1046 }
1047
1048 summary
1049}
1050
1051const fn zeroed_summary(language: Language) -> LanguageSummary {
1053 LanguageSummary {
1054 language,
1055 files: 0,
1056 total_physical_lines: 0,
1057 code_lines: 0,
1058 comment_lines: 0,
1059 blank_lines: 0,
1060 mixed_lines_separate: 0,
1061 functions: 0,
1062 classes: 0,
1063 variables: 0,
1064 imports: 0,
1065 test_count: 0,
1066 test_assertion_count: 0,
1067 test_suite_count: 0,
1068 coverage_lines_found: 0,
1069 coverage_lines_hit: 0,
1070 coverage_functions_found: 0,
1071 coverage_functions_hit: 0,
1072 coverage_branches_found: 0,
1073 coverage_branches_hit: 0,
1074 }
1075}
1076
1077fn accumulate_record_into_summary(entry: &mut LanguageSummary, record: &FileRecord) {
1079 entry.files += 1;
1080 let r = &record.raw_line_categories;
1081 entry.total_physical_lines += r.total_physical_lines;
1082 entry.code_lines += record.effective_counts.code_lines;
1083 entry.comment_lines += record.effective_counts.comment_lines;
1084 entry.blank_lines += record.effective_counts.blank_lines;
1085 entry.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
1086 entry.functions += r.functions;
1087 entry.classes += r.classes;
1088 entry.variables += r.variables;
1089 entry.imports += r.imports;
1090 entry.test_count += r.test_count;
1091 entry.test_assertion_count += r.test_assertion_count;
1092 entry.test_suite_count += r.test_suite_count;
1093 if let Some(cov) = &record.coverage {
1094 entry.coverage_lines_found += u64::from(cov.lines_found);
1095 entry.coverage_lines_hit += u64::from(cov.lines_hit);
1096 entry.coverage_functions_found += u64::from(cov.functions_found);
1097 entry.coverage_functions_hit += u64::from(cov.functions_hit);
1098 entry.coverage_branches_found += u64::from(cov.branches_found);
1099 entry.coverage_branches_hit += u64::from(cov.branches_hit);
1100 }
1101}
1102
1103fn build_language_summaries(analyzed: &[FileRecord]) -> Vec<LanguageSummary> {
1104 let mut by_language: BTreeMap<Language, LanguageSummary> = BTreeMap::new();
1105 for record in analyzed {
1106 let Some(language) = record.language else {
1107 continue;
1108 };
1109 let entry = by_language
1110 .entry(language)
1111 .or_insert_with(|| zeroed_summary(language));
1112 accumulate_record_into_summary(entry, record);
1113 }
1114 by_language.into_values().collect()
1115}
1116
1117fn skipped_record(
1118 path: &Path,
1119 root: &Path,
1120 size_bytes: u64,
1121 status: FileStatus,
1122 warnings: Vec<String>,
1123) -> FileRecord {
1124 FileRecord {
1125 path: path_to_string(path),
1126 relative_path: relative_path_string(path, root),
1127 language: None,
1128 size_bytes,
1129 detected_encoding: None,
1130 raw_line_categories: RawLineCounts::default(),
1131 effective_counts: EffectiveCounts::default(),
1132 status,
1133 warnings,
1134 generated: false,
1135 minified: false,
1136 vendor: false,
1137 parse_mode: None,
1138 submodule: None,
1139 coverage: None,
1140 }
1141}
1142
/// Renders `path` relative to `root` with forward slashes.
///
/// When `path` does not start with `root`, the whole path is rendered.
fn relative_path_string(path: &Path, root: &Path) -> String {
    let relative = match path.strip_prefix(root) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };
    let lossy = relative.to_string_lossy();
    lossy.replace('\\', "/")
}
1149
/// Renders `path` as a (lossy) string with every backslash turned into `/`.
fn path_to_string(path: &Path) -> String {
    let lossy = path.to_string_lossy();
    lossy.replace('\\', "/")
}
1153
/// Reads `<root>/.gitmodules` and returns the `(name, path)` pair of every
/// submodule section that declares a `path`.
///
/// The parse is a line-based best effort:
/// * a line of the form `[submodule "<name>"]` starts a new section;
/// * a line of the form `path = <value>` sets the path that is paired with
///   the most recent section name;
/// * every other line is ignored.
///
/// A section without a `path` line is dropped. A missing or unreadable
/// `.gitmodules` yields an empty list. Malformed lines are skipped; they never
/// cause a panic.
#[must_use]
pub fn detect_submodules(root: &Path) -> Vec<(String, PathBuf)> {
    let gitmodules = root.join(".gitmodules");
    if !gitmodules.is_file() {
        return Vec::new();
    }
    let Ok(content) = fs::read_to_string(&gitmodules) else {
        return Vec::new();
    };

    let mut result = Vec::new();
    let mut current_name: Option<String> = None;
    let mut current_path: Option<PathBuf> = None;

    for line in content.lines() {
        let trimmed = line.trim();

        // Strip the suffix from what is left *after* the prefix, so the two
        // delimiters can never overlap (e.g. on the malformed `[submodule "]`,
        // which used to produce an out-of-order slice range and panic).
        let section_name = trimmed
            .strip_prefix("[submodule \"")
            .and_then(|rest| rest.strip_suffix("\"]"));

        if let Some(section_name) = section_name {
            // A new section begins: flush the previous one if it was complete.
            // Both halves are taken unconditionally so a dangling name or path
            // never leaks into the next section.
            if let (Some(prev_name), Some(prev_path)) = (current_name.take(), current_path.take()) {
                result.push((prev_name, prev_path));
            }
            current_name = Some(section_name.to_string());
        } else if let Some(value) = trimmed
            .strip_prefix("path")
            .and_then(|rest| rest.trim_start().strip_prefix('='))
        {
            // Require `=` right after the key (modulo whitespace) so that
            // other keys that merely start with "path" are not mistaken for it.
            current_path = Some(PathBuf::from(value.trim()));
        }
    }

    // Flush the final section, which has no following header to trigger it.
    if let (Some(name), Some(path)) = (current_name, current_path) {
        result.push((name, path));
    }

    result
}
1190
1191fn build_submodule_summaries(
1192 analyzed: &[FileRecord],
1193 submodules: &[(String, PathBuf)],
1194) -> Vec<SubmoduleSummary> {
1195 submodules
1196 .iter()
1197 .map(|(name, path)| {
1198 let files: Vec<&FileRecord> = analyzed
1199 .iter()
1200 .filter(|f| f.submodule.as_deref() == Some(name.as_str()))
1201 .collect();
1202
1203 let files_analyzed = files.len() as u64;
1204 let total_physical_lines = files
1205 .iter()
1206 .map(|f| f.raw_line_categories.total_physical_lines)
1207 .sum();
1208 let code_lines = files.iter().map(|f| f.effective_counts.code_lines).sum();
1209 let comment_lines = files.iter().map(|f| f.effective_counts.comment_lines).sum();
1210 let blank_lines = files.iter().map(|f| f.effective_counts.blank_lines).sum();
1211 let language_summaries = build_language_summaries_from_slice(&files);
1212
1213 SubmoduleSummary {
1214 name: name.clone(),
1215 relative_path: path.to_string_lossy().replace('\\', "/"),
1216 files_analyzed,
1217 total_physical_lines,
1218 code_lines,
1219 comment_lines,
1220 blank_lines,
1221 language_summaries,
1222 }
1223 })
1224 .filter(|s| s.files_analyzed > 0)
1225 .collect()
1226}
1227
1228fn build_language_summaries_from_slice(files: &[&FileRecord]) -> Vec<LanguageSummary> {
1229 let mut map: BTreeMap<String, LanguageSummary> = BTreeMap::new();
1230 for file in files {
1231 let Some(lang) = file.language else { continue };
1232 let entry = map
1233 .entry(lang.display_name().to_string())
1234 .or_insert_with(|| zeroed_summary(lang));
1235 accumulate_record_into_summary(entry, file);
1236 }
1237 map.into_values().collect()
1238}
1239
/// Returns `true` when the final component of `path` is exactly `expected`.
///
/// Paths without a final file-name component, and file names that are not
/// valid UTF-8, never match.
fn file_name_eq(path: &Path, expected: &str) -> bool {
    let name = path.file_name().and_then(|os_name| os_name.to_str());
    name == Some(expected)
}
1245
/// Returns `true` when any component of `path` is exactly equal to one of the
/// names in `excluded_dirs`.
///
/// Components that are not valid UTF-8 are never considered a match.
fn is_excluded_dir_path(path: &Path, excluded_dirs: &[String]) -> bool {
    for component in path.components() {
        let Some(part) = component.as_os_str().to_str() else {
            continue;
        };
        if excluded_dirs.iter().any(|dir| dir.as_str() == part) {
            return true;
        }
    }
    false
}
1254
/// Returns `true` when any component of `path` is named `vendor`,
/// `node_modules` or `packages`.
fn is_vendor_path(path: &Path) -> bool {
    const VENDOR_DIR_NAMES: [&str; 3] = ["vendor", "node_modules", "packages"];

    path.components()
        .filter_map(|component| component.as_os_str().to_str())
        .any(|part| VENDOR_DIR_NAMES.contains(&part))
}
1263
/// Returns `true` when the file name of `path` is one of the recognized
/// dependency lockfile names.
///
/// The comparison is exact and case-sensitive.
fn is_known_lockfile(path: &Path) -> bool {
    const LOCKFILE_NAMES: [&str; 7] = [
        "Cargo.lock",
        "package-lock.json",
        "yarn.lock",
        "pnpm-lock.yaml",
        "Pipfile.lock",
        "poetry.lock",
        "composer.lock",
    ];

    let file_name = path.file_name().and_then(|os_name| os_name.to_str());
    file_name.is_some_and(|name| LOCKFILE_NAMES.contains(&name))
}
1280
1281fn looks_generated(path: &Path, bytes: &[u8]) -> bool {
1282 let file_name = path
1283 .file_name()
1284 .and_then(|name| name.to_str())
1285 .unwrap_or_default();
1286 if file_name.contains(".generated.") || file_name.contains(".g.") {
1287 return true;
1288 }
1289
1290 let sample = String::from_utf8_lossy(&bytes[..bytes.len().min(GENERATED_SAMPLE_BYTES)])
1291 .to_ascii_lowercase();
1292 sample.contains("@generated") || sample.contains("generated by")
1293}
1294
1295fn looks_minified(path: &Path, bytes: &[u8]) -> bool {
1296 let file_name = path
1297 .file_name()
1298 .and_then(|name| name.to_str())
1299 .unwrap_or_default();
1300 if file_name.contains(".min.") {
1301 return true;
1302 }
1303
1304 let sample = String::from_utf8_lossy(&bytes[..bytes.len().min(MINIFIED_SAMPLE_BYTES)]);
1305 let longest_line = sample.lines().map(str::len).max().unwrap_or(0);
1306 let whitespace = sample.chars().filter(|c| c.is_whitespace()).count();
1307 longest_line > MINIFIED_LINE_THRESHOLD && whitespace * 100 < sample.len().max(1)
1308}
1309
1310fn is_binary(bytes: &[u8]) -> bool {
1311 if bytes.starts_with(&[0xEF, 0xBB, 0xBF])
1312 || bytes.starts_with(&[0xFF, 0xFE])
1313 || bytes.starts_with(&[0xFE, 0xFF])
1314 {
1315 return false;
1316 }
1317
1318 let sample = &bytes[..bytes.len().min(BINARY_SAMPLE_BYTES)];
1319 sample.contains(&0)
1320}
1321
1322fn decode_utf16_bom(
1325 bom_stripped: &[u8],
1326 encoding: &'static encoding_rs::Encoding,
1327 label: &str,
1328) -> (String, String, Vec<String>) {
1329 let (cow, _, had_errors) = encoding.decode(bom_stripped);
1330 let mut warnings = Vec::new();
1331 if had_errors {
1332 warnings.push(format!("{label} decode contained replacement characters"));
1333 }
1334 (cow.into_owned(), label.into(), warnings)
1335}
1336
1337fn decode_bytes(bytes: &[u8]) -> std::result::Result<(String, String, Vec<String>), String> {
1338 if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
1339 let text = String::from_utf8(bytes[3..].to_vec()).map_err(|err| err.to_string())?;
1340 return Ok((text, "utf-8-bom".into(), vec![]));
1341 }
1342 if bytes.starts_with(&[0xFF, 0xFE]) {
1343 return Ok(decode_utf16_bom(&bytes[2..], UTF_16LE, "utf-16le"));
1344 }
1345 if bytes.starts_with(&[0xFE, 0xFF]) {
1346 return Ok(decode_utf16_bom(&bytes[2..], UTF_16BE, "utf-16be"));
1347 }
1348
1349 #[allow(clippy::option_if_let_else)]
1351 if let Ok(text) = String::from_utf8(bytes.to_vec()) {
1352 Ok((text, "utf-8".into(), vec![]))
1353 } else {
1354 let (cow, _, had_errors) = WINDOWS_1252.decode(bytes);
1355 let mut warnings = vec!["decoded using windows-1252 fallback".into()];
1356 if had_errors {
1357 warnings.push("fallback decode contained replacement characters".into());
1358 }
1359 Ok((cow.into_owned(), "windows-1252".into(), warnings))
1360 }
1361}
1362
1363fn compile_globset(patterns: &[String]) -> Result<Option<GlobSet>> {
1364 if patterns.is_empty() {
1365 return Ok(None);
1366 }
1367
1368 let mut builder = GlobSetBuilder::new();
1369 for pattern in patterns {
1370 builder
1371 .add(Glob::new(pattern).with_context(|| format!("invalid glob pattern: {pattern}"))?);
1372 }
1373 Ok(Some(
1374 builder.build().context("failed to compile glob filters")?,
1375 ))
1376}
1377
1378fn parse_enabled_languages(enabled: &[String]) -> Result<Option<BTreeSet<Language>>> {
1379 if enabled.is_empty() {
1380 return Ok(None);
1381 }
1382
1383 let supported = supported_languages();
1384 let mut set = BTreeSet::new();
1385 for name in enabled {
1386 let language = Language::from_name(name)
1387 .with_context(|| format!("unsupported language in config: {name}"))?;
1388 if !supported.contains(&language) {
1389 anyhow::bail!("language {name} is not supported in this build");
1390 }
1391 set.insert(language);
1392 }
1393 Ok(Some(set))
1394}
1395
1396pub fn write_json(run: &AnalysisRun, output_path: &Path) -> Result<()> {
1400 let json = serde_json::to_string_pretty(run).context("failed to serialize analysis run")?;
1401 fs::write(output_path, json)
1402 .with_context(|| format!("failed to write JSON output to {}", output_path.display()))
1403}
1404
1405pub fn read_json(path: &Path) -> Result<AnalysisRun> {
1409 let contents = fs::read_to_string(path)
1410 .with_context(|| format!("failed to read result file {}", path.display()))?;
1411 serde_json::from_str(&contents)
1412 .with_context(|| format!("failed to parse JSON result {}", path.display()))
1413}
1414
#[cfg(test)]
mod tests {
    use super::*;

    /// Under the code-only policy, mixed lines count as code and docstrings
    /// (when enabled) count as comments.
    #[test]
    fn effective_counts_respect_code_only_policy() {
        let raw_counts = RawLineCounts {
            code_only_lines: 2,
            single_comment_only_lines: 1,
            mixed_code_single_comment_lines: 3,
            docstring_comment_lines: 2,
            ..RawLineCounts::default()
        };

        let EffectiveCounts {
            code_lines,
            comment_lines,
            ..
        } = compute_effective_counts(&raw_counts, MixedLinePolicy::CodeOnly, true, true);

        assert_eq!(code_lines, 5);
        assert_eq!(comment_lines, 3);
    }

    /// Under the separate-mixed policy, mixed lines land in their own bucket
    /// and contribute to neither code nor comments.
    #[test]
    fn effective_counts_can_separate_mixed() {
        let raw_counts = RawLineCounts {
            mixed_code_single_comment_lines: 2,
            mixed_code_multi_comment_lines: 1,
            ..RawLineCounts::default()
        };

        let effective = compute_effective_counts(
            &raw_counts,
            MixedLinePolicy::SeparateMixedCategory,
            true,
            true,
        );

        assert_eq!(effective.mixed_lines_separate, 3);
        assert_eq!(effective.code_lines, 0);
        assert_eq!(effective.comment_lines, 0);
    }

    /// Bytes that are not valid UTF-8 fall back to windows-1252, where 0x96
    /// is an en dash, and the fallback is reported through a warning.
    #[test]
    fn windows_1252_fallback_decodes() {
        let input: &[u8] = b"Hello\x96W";

        let (text, encoding, warnings) =
            decode_bytes(input).expect("the windows-1252 fallback never fails");

        assert_eq!(encoding, "windows-1252");
        assert!(text.contains('–'));
        assert!(!warnings.is_empty());
    }
}