1#![allow(clippy::multiple_crate_versions)]
4
5pub mod baseline;
6pub mod coverage;
7pub mod delta;
8pub mod history;
9pub use baseline::{check_against_baseline, resolve_baselines_path, BaselineEntry, BaselineStore};
10pub use coverage::{aggregate_line_coverage, lookup_coverage, parse_lcov, FileCoverage};
11pub use delta::{compute_delta, FileChangeStatus, FileDelta, ScanComparison, SummaryDelta};
12pub use history::{RegistryEntry, ScanRegistry, ScanSummarySnapshot, WatchedDirsStore};
13
14use std::collections::{BTreeMap, BTreeSet, HashSet};
15use std::fs;
16use std::path::{Path, PathBuf};
17use std::sync::atomic::{AtomicBool, Ordering};
18
19use anyhow::{Context, Result};
20use chrono::{DateTime, Utc};
21use encoding_rs::{UTF_16BE, UTF_16LE, WINDOWS_1252};
22use globset::{Glob, GlobSet, GlobSetBuilder};
23use ignore::WalkBuilder;
24use serde::{Deserialize, Serialize};
25use uuid::Uuid;
26
27use sloc_config::{
28 AppConfig, BinaryFileBehavior, BlankInBlockCommentPolicy, ContinuationLinePolicy,
29 FailureBehavior, MixedLinePolicy,
30};
31use sloc_languages::{
32 analyze_text, detect_language, supported_languages, AnalysisOptions, Language, ParseMode,
33 RawLineCounts,
34};
35
/// Upper bound on the number of worker threads `walk_root` uses, no matter
/// how much parallelism the machine reports.
const MAX_ANALYSIS_THREADS: usize = 16;
/// Worker-thread count `walk_root` falls back to when
/// `std::thread::available_parallelism` returns an error.
const DEFAULT_ANALYSIS_THREADS: usize = 4;
// NOTE(review): the four constants below are not referenced in the part of
// the file reviewed here; the descriptions are inferred from their names and
// should be confirmed against the detection helpers that use them.
/// Presumably the number of leading bytes sampled by generated-file detection.
const GENERATED_SAMPLE_BYTES: usize = 1024;
/// Presumably the number of leading bytes sampled by minified-file detection.
const MINIFIED_SAMPLE_BYTES: usize = 4096;
/// Presumably the line length above which a file is treated as minified.
const MINIFIED_LINE_THRESHOLD: usize = 2000;
/// Presumably the number of leading bytes sampled by binary-file detection.
const BINARY_SAMPLE_BYTES: usize = 8192;
50
/// Verdict of `check_metadata_policy`, the checks that run on a file's path
/// and metadata before its contents are read.
enum MetadataPolicyOutcome {
    /// The file is rejected; the boxed record is returned to the caller so the
    /// file is reported as skipped.
    Skip(Box<FileRecord>),
    /// The file did not match the include globs; `analyze_candidate_file`
    /// returns `Ok(None)` for it, so no record is produced at all.
    Exclude,
    /// No metadata policy applied; analysis continues with reading the file.
    Continue,
}
60
/// Final disposition of a file. Serialized in `snake_case`.
///
/// `push_record` files the two `Analyzed*` variants under the analyzed
/// records and every other variant under the skipped records.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FileStatus {
    /// Analyzed with `ParseMode::Lexical` or `ParseMode::TreeSitter`.
    AnalyzedExact,
    /// Analyzed with `ParseMode::LexicalBestEffort`.
    AnalyzedBestEffort,
    /// Content was detected as binary and binary files are configured to be skipped.
    SkippedBinary,
    /// Content could not be decoded and decode failures are configured to warn and skip.
    SkippedDecodeError,
    /// No language could be detected for the file.
    SkippedUnsupported,
    /// Rejected by a configured policy (symlink, excluded directory, size limit,
    /// exclude glob, lockfile, vendor/generated/minified detection, disabled language).
    SkippedByPolicy,
    /// The file's metadata or contents could not be read.
    ErrorInternal,
}
72
/// Line counts for one file after the configured counting policies have been
/// applied to the raw categories (see `compute_effective_counts`).
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct EffectiveCounts {
    /// Code-only lines, plus docstring lines when docstrings are not counted as
    /// comments, plus mixed lines when the mixed-line policy assigns them to
    /// code; compiler-directive lines are subtracted when they are not counted.
    pub code_lines: u64,
    /// Single- and multi-line comment-only lines, plus docstring lines when they
    /// are counted as comments, plus mixed lines when the policy assigns them
    /// to comments.
    pub comment_lines: u64,
    /// Lines that are blank only.
    pub blank_lines: u64,
    /// Mixed code/comment lines; non-zero only under
    /// `MixedLinePolicy::SeparateMixedCategory`.
    pub mixed_lines_separate: u64,
}
80
/// Identifies the tool and the individual run that produced an `AnalysisRun`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolMetadata {
    /// Tool name; `assemble_run` sets this to `"sloc"`.
    pub name: String,
    /// Crate version taken from `CARGO_PKG_VERSION` at compile time.
    pub version: String,
    /// Run identifier of the form `<YYYYMMDD-HHMM>-<uuid v4, simple format>`.
    pub run_id: String,
    /// UTC time at which the run record was assembled.
    pub timestamp_utc: DateTime<Utc>,
}
88
/// Describes the machine and invocation context of a run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnvironmentMetadata {
    /// Value of `std::env::consts::OS`.
    pub operating_system: String,
    /// Value of `std::env::consts::ARCH`.
    pub architecture: String,
    /// Caller-supplied `runtime_mode` string passed to `analyze`.
    pub runtime_mode: String,
    /// User name as resolved by `get_current_username` (`"unknown"` if unavailable).
    pub initiator_username: String,
    /// Host name as resolved by `get_hostname` (`"unknown"` if unavailable).
    pub initiator_hostname: String,
}
97
/// Totals across the whole run, produced by `build_summary`.
///
/// All line, symbol, test and coverage totals are summed over analyzed files
/// only. Fields marked `#[serde(default)]` deserialize to zero when absent
/// from the input.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SummaryTotals {
    /// Number of analyzed plus skipped file records.
    pub files_considered: u64,
    /// Number of analyzed file records.
    pub files_analyzed: u64,
    /// Number of skipped file records.
    pub files_skipped: u64,
    /// Sum of raw physical line counts.
    pub total_physical_lines: u64,
    /// Sum of effective code lines.
    pub code_lines: u64,
    /// Sum of effective comment lines.
    pub comment_lines: u64,
    /// Sum of effective blank lines.
    pub blank_lines: u64,
    /// Sum of mixed lines reported as their own category.
    pub mixed_lines_separate: u64,
    /// Sum of the per-file `functions` counter.
    #[serde(default)]
    pub functions: u64,
    /// Sum of the per-file `classes` counter.
    #[serde(default)]
    pub classes: u64,
    /// Sum of the per-file `variables` counter.
    #[serde(default)]
    pub variables: u64,
    /// Sum of the per-file `imports` counter.
    #[serde(default)]
    pub imports: u64,
    /// Sum of the per-file `test_count` counter.
    #[serde(default)]
    pub test_count: u64,
    /// Sum of the per-file `test_assertion_count` counter.
    #[serde(default)]
    pub test_assertion_count: u64,
    /// Sum of the per-file `test_suite_count` counter.
    #[serde(default)]
    pub test_suite_count: u64,
    /// Sum of `lines_found` over files that have coverage data attached.
    #[serde(default)]
    pub coverage_lines_found: u64,
    /// Sum of `lines_hit` over files that have coverage data attached.
    #[serde(default)]
    pub coverage_lines_hit: u64,
    /// Sum of `functions_found` over files that have coverage data attached.
    #[serde(default)]
    pub coverage_functions_found: u64,
    /// Sum of `functions_hit` over files that have coverage data attached.
    #[serde(default)]
    pub coverage_functions_hit: u64,
    /// Sum of `branches_found` over files that have coverage data attached.
    #[serde(default)]
    pub coverage_branches_found: u64,
    /// Sum of `branches_hit` over files that have coverage data attached.
    #[serde(default)]
    pub coverage_branches_hit: u64,
}
138
/// Totals for all analyzed files of one language, filled in by
/// `accumulate_record_into_summary`.
///
/// Fields marked `#[serde(default)]` deserialize to zero when absent from the
/// input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageSummary {
    /// Language these totals belong to.
    pub language: Language,
    /// Number of file records accumulated into this summary.
    pub files: u64,
    /// Sum of raw physical line counts.
    pub total_physical_lines: u64,
    /// Sum of effective code lines.
    pub code_lines: u64,
    /// Sum of effective comment lines.
    pub comment_lines: u64,
    /// Sum of effective blank lines.
    pub blank_lines: u64,
    /// Sum of mixed lines reported as their own category.
    pub mixed_lines_separate: u64,
    /// Sum of the per-file `functions` counter.
    #[serde(default)]
    pub functions: u64,
    /// Sum of the per-file `classes` counter.
    #[serde(default)]
    pub classes: u64,
    /// Sum of the per-file `variables` counter.
    #[serde(default)]
    pub variables: u64,
    /// Sum of the per-file `imports` counter.
    #[serde(default)]
    pub imports: u64,
    /// Sum of the per-file `test_count` counter.
    #[serde(default)]
    pub test_count: u64,
    /// Sum of the per-file `test_assertion_count` counter.
    #[serde(default)]
    pub test_assertion_count: u64,
    /// Sum of the per-file `test_suite_count` counter.
    #[serde(default)]
    pub test_suite_count: u64,
    /// Sum of `lines_found` over files that have coverage data attached.
    #[serde(default)]
    pub coverage_lines_found: u64,
    /// Sum of `lines_hit` over files that have coverage data attached.
    #[serde(default)]
    pub coverage_lines_hit: u64,
    /// Sum of `functions_found` over files that have coverage data attached.
    #[serde(default)]
    pub coverage_functions_found: u64,
    /// Sum of `functions_hit` over files that have coverage data attached.
    #[serde(default)]
    pub coverage_functions_hit: u64,
    /// Sum of `branches_found` over files that have coverage data attached.
    #[serde(default)]
    pub coverage_branches_found: u64,
    /// Sum of `branches_hit` over files that have coverage data attached.
    #[serde(default)]
    pub coverage_branches_hit: u64,
}
175
/// Everything recorded about a single file, whether analyzed or skipped.
///
/// Skipped records (see `skipped_record`) carry default counts, no language,
/// no encoding and no parse mode.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileRecord {
    /// Path of the file as walked, with backslashes replaced by `/`.
    pub path: String,
    /// Path relative to the root it was discovered under (the full path when it
    /// is not below that root), with backslashes replaced by `/`.
    pub relative_path: String,
    /// Detected language; `None` for skipped records.
    pub language: Option<Language>,
    /// File size in bytes as reported by the file's metadata (0 when the
    /// metadata could not be read).
    pub size_bytes: u64,
    /// Encoding reported by the decoder; `None` for skipped records.
    pub detected_encoding: Option<String>,
    /// Raw per-category line counts produced by the language analyzer.
    pub raw_line_categories: RawLineCounts,
    /// Counts after applying the configured counting policies.
    pub effective_counts: EffectiveCounts,
    /// Final disposition of the file.
    pub status: FileStatus,
    /// Decode and analysis warnings, or the reason the file was skipped.
    pub warnings: Vec<String>,
    /// Result of generated-file detection (always `false` on skipped records).
    pub generated: bool,
    /// Result of minified-file detection (always `false` on skipped records).
    pub minified: bool,
    /// Whether the path was classified as a vendor path (always `false` on
    /// skipped records).
    pub vendor: bool,
    /// Parse mode used by the analyzer; `None` for skipped records.
    pub parse_mode: Option<ParseMode>,
    /// Name of the git submodule that contains the file, set by
    /// `process_submodules`; omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub submodule: Option<String>,
    /// Coverage data matched to this file from the configured coverage file;
    /// omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub coverage: Option<FileCoverage>,
}
197
/// Totals for the analyzed files that belong to one git submodule, produced
/// by `build_submodule_summaries`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SubmoduleSummary {
    /// Submodule name from its `[submodule "<name>"]` header in `.gitmodules`.
    pub name: String,
    /// Submodule `path` value from `.gitmodules`, with backslashes replaced by `/`.
    pub relative_path: String,
    /// Number of analyzed files attributed to this submodule.
    pub files_analyzed: u64,
    /// Sum of raw physical line counts of those files.
    pub total_physical_lines: u64,
    /// Sum of effective code lines of those files.
    pub code_lines: u64,
    /// Sum of effective comment lines of those files.
    pub comment_lines: u64,
    /// Sum of effective blank lines of those files.
    pub blank_lines: u64,
    /// Per-language breakdown of those files.
    pub language_summaries: Vec<LanguageSummary>,
}
210
/// Complete result of one call to `analyze`.
///
/// The `git_*` fields are gathered by running `git` in the first input root;
/// each is `None` (and omitted from serialized output) when the corresponding
/// command fails or prints nothing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnalysisRun {
    /// Tool identity, run id and timestamp.
    pub tool: ToolMetadata,
    /// Machine and invocation context.
    pub environment: EnvironmentMetadata,
    /// Copy of the configuration the run was executed with.
    pub effective_configuration: AppConfig,
    /// Configured root paths, with backslashes replaced by `/`.
    pub input_roots: Vec<String>,
    /// Run-wide totals.
    pub summary_totals: SummaryTotals,
    /// Totals per language over the analyzed files.
    pub totals_by_language: Vec<LanguageSummary>,
    /// Records of analyzed files, sorted by relative path.
    pub per_file_records: Vec<FileRecord>,
    /// Records of skipped files, sorted by relative path.
    pub skipped_file_records: Vec<FileRecord>,
    /// Discovery warnings, coverage-file read failures and per-file warnings
    /// (the latter prefixed with the file's relative path).
    pub warnings: Vec<String>,
    /// Per-submodule totals; empty (and omitted from serialized output) unless
    /// submodule breakdown is enabled and submodules with analyzed files exist.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub submodule_summaries: Vec<SubmoduleSummary>,
    /// Output of `git rev-parse --short HEAD`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_short: Option<String>,
    /// Output of `git rev-parse HEAD`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_long: Option<String>,
    /// Output of `git branch --show-current`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_branch: Option<String>,
    /// Output of `git log --format=%an -1`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_author: Option<String>,
    /// Tags pointing at `HEAD` (`git tag --points-at HEAD`), joined with `", "`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_tags: Option<String>,
    /// Output of `git describe --tags --abbrev=0 HEAD`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_nearest_tag: Option<String>,
    /// Output of `git log --format=%aI -1`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_date: Option<String>,
}
247
/// Runs `git <args>` with `dir` as the working directory and returns the
/// trimmed standard output.
///
/// Returns `None` when the process cannot be started, exits unsuccessfully,
/// writes output that is not valid UTF-8, or prints only whitespace.
fn run_git_in(dir: &Path, args: &[&str]) -> Option<String> {
    let output = std::process::Command::new("git")
        .args(args)
        .current_dir(dir)
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }

    let stdout = String::from_utf8(output.stdout).ok()?;
    let trimmed = stdout.trim();
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}
259
/// Git details collected by `detect_git_for_run`; every field is `None` when
/// the corresponding `git` command fails or prints nothing.
#[derive(Default)]
struct GitInfo {
    /// Output of `git rev-parse --short HEAD`.
    commit_short: Option<String>,
    /// Output of `git rev-parse HEAD`.
    commit_long: Option<String>,
    /// Output of `git branch --show-current`.
    branch: Option<String>,
    /// Output of `git log --format=%an -1`.
    author: Option<String>,
    /// Non-empty lines of `git tag --points-at HEAD`, joined with `", "`.
    tags: Option<String>,
    /// Output of `git describe --tags --abbrev=0 HEAD`.
    nearest_tag: Option<String>,
    /// Output of `git log --format=%aI -1`.
    commit_date: Option<String>,
}
270
271fn detect_git_for_run(project_path: &Path) -> GitInfo {
272 GitInfo {
273 commit_short: run_git_in(project_path, &["rev-parse", "--short", "HEAD"]),
274 commit_long: run_git_in(project_path, &["rev-parse", "HEAD"]),
275 branch: run_git_in(project_path, &["branch", "--show-current"]),
276 author: run_git_in(project_path, &["log", "--format=%an", "-1"]),
277 tags: run_git_in(project_path, &["tag", "--points-at", "HEAD"]).map(|t| {
278 t.lines()
279 .filter(|l| !l.is_empty())
280 .collect::<Vec<_>>()
281 .join(", ")
282 }),
283 nearest_tag: run_git_in(project_path, &["describe", "--tags", "--abbrev=0", "HEAD"]),
284 commit_date: run_git_in(project_path, &["log", "--format=%aI", "-1"]),
285 }
286}
287
/// Returns the invoking user's name from the `USERNAME` environment variable,
/// falling back to `USER`, and to `"unknown"` when neither can be read.
fn get_current_username() -> String {
    for key in ["USERNAME", "USER"] {
        if let Ok(value) = std::env::var(key) {
            return value;
        }
    }
    "unknown".to_string()
}
293
/// Returns the machine's host name.
///
/// Lookup order: the `COMPUTERNAME` environment variable, the `HOSTNAME`
/// environment variable, the trimmed contents of `/etc/hostname`, and finally
/// the literal `"unknown"`.
fn get_hostname() -> String {
    if let Ok(name) = std::env::var("COMPUTERNAME") {
        return name;
    }
    if let Ok(name) = std::env::var("HOSTNAME") {
        return name;
    }
    match std::fs::read_to_string("/etc/hostname") {
        Ok(contents) => contents.trim().to_string(),
        Err(_) => "unknown".to_string(),
    }
}
300
301#[allow(clippy::too_many_arguments)]
303fn walk_root(
304 root: &Path,
306 config: &AppConfig,
307 include_globs: Option<&GlobSet>,
308 exclude_globs: Option<&GlobSet>,
309 enabled_languages: Option<&BTreeSet<Language>>,
310 seen_paths: &mut HashSet<PathBuf>,
311 analyzed: &mut Vec<FileRecord>,
312 skipped: &mut Vec<FileRecord>,
313 warnings: &mut Vec<String>,
314 cancel: Option<&AtomicBool>,
315) -> Result<()> {
316 let mut builder = WalkBuilder::new(root);
317 builder
318 .follow_links(config.discovery.follow_symlinks)
319 .hidden(config.discovery.ignore_hidden_files)
320 .ignore(config.discovery.honor_ignore_files)
321 .parents(config.discovery.honor_ignore_files)
322 .git_ignore(config.discovery.honor_ignore_files)
323 .git_global(config.discovery.honor_ignore_files)
324 .git_exclude(config.discovery.honor_ignore_files);
325
326 let mut paths = Vec::new();
328 for entry in builder.build() {
329 let entry = match entry {
330 Ok(entry) => entry,
331 Err(err) => {
332 warnings.push(format!("discovery warning: {err}"));
333 continue;
334 }
335 };
336 let path = entry.into_path();
337 if path.is_dir() || !seen_paths.insert(path.clone()) {
338 continue;
339 }
340 paths.push(path);
341 }
342
343 if paths.is_empty() {
344 return Ok(());
345 }
346
347 let thread_count = std::thread::available_parallelism().map_or(DEFAULT_ANALYSIS_THREADS, |n| {
350 n.get().min(MAX_ANALYSIS_THREADS)
351 });
352 let chunk_size = paths.len().div_ceil(thread_count);
353
354 let chunk_results: Vec<Vec<Result<Option<FileRecord>>>> =
355 std::thread::scope(|s| -> Result<Vec<Vec<Result<Option<FileRecord>>>>> {
356 paths
357 .chunks(chunk_size)
358 .map(|chunk| {
359 s.spawn(move || -> Vec<Result<Option<FileRecord>>> {
360 let mut results = Vec::with_capacity(chunk.len());
361 for path in chunk {
362 if cancel.is_some_and(|c| c.load(Ordering::Relaxed)) {
363 results.push(Err(anyhow::anyhow!("analysis cancelled")));
364 break;
365 }
366 results.push(analyze_candidate_file(
367 path,
368 root,
369 config,
370 include_globs,
371 exclude_globs,
372 enabled_languages,
373 ));
374 }
375 results
376 })
377 })
378 .map(|h| {
379 h.join()
380 .map_err(|_| anyhow::anyhow!("analysis thread panicked"))
381 })
382 .collect()
383 })?;
384
385 for chunk in chunk_results {
386 for result in chunk {
387 if let Some(record) = result? {
388 push_record(record, analyzed, skipped, warnings);
389 }
390 }
391 }
392
393 Ok(())
394}
395
396fn process_submodules(config: &AppConfig, analyzed: &mut [FileRecord]) -> Vec<SubmoduleSummary> {
398 let root = config.discovery.root_paths[0]
399 .canonicalize()
400 .unwrap_or_else(|_| config.discovery.root_paths[0].clone());
401 let submodules = detect_submodules(&root);
402 if submodules.is_empty() {
403 return Vec::new();
404 }
405
406 for file in analyzed.iter_mut() {
407 for (name, sub_path) in &submodules {
408 let prefix = sub_path.to_string_lossy().replace('\\', "/");
409 let rel = &file.relative_path;
410 if rel == &prefix || rel.starts_with(&format!("{prefix}/")) {
411 file.submodule = Some(name.clone());
412 break;
413 }
414 }
415 }
416
417 build_submodule_summaries(analyzed, &submodules)
418}
419
420fn assemble_run(
422 config: &AppConfig,
423 runtime_mode: &str,
424 analyzed: Vec<FileRecord>,
425 skipped: Vec<FileRecord>,
426 warnings: Vec<String>,
427 submodule_summaries: Vec<SubmoduleSummary>,
428) -> AnalysisRun {
429 let summary = build_summary(&analyzed, &skipped);
430 let language_summaries = build_language_summaries(&analyzed);
431
432 let first_root = config
433 .discovery
434 .root_paths
435 .first()
436 .map(|p| p.canonicalize().unwrap_or_else(|_| p.clone()));
437 let git = first_root
438 .as_deref()
439 .map(detect_git_for_run)
440 .unwrap_or_default();
441
442 let now = Utc::now();
443 let run_id = {
444 let uuid_suffix = Uuid::new_v4().simple().to_string();
445 format!("{}-{}", now.format("%Y%m%d-%H%M"), uuid_suffix)
446 };
447
448 AnalysisRun {
449 tool: ToolMetadata {
450 name: "sloc".into(),
451 version: env!("CARGO_PKG_VERSION").into(),
452 run_id,
453 timestamp_utc: now,
454 },
455 environment: EnvironmentMetadata {
456 operating_system: std::env::consts::OS.into(),
457 architecture: std::env::consts::ARCH.into(),
458 runtime_mode: runtime_mode.into(),
459 initiator_username: get_current_username(),
460 initiator_hostname: get_hostname(),
461 },
462 effective_configuration: config.clone(),
463 input_roots: config
464 .discovery
465 .root_paths
466 .iter()
467 .map(|p| path_to_string(p))
468 .collect(),
469 summary_totals: summary,
470 totals_by_language: language_summaries,
471 per_file_records: analyzed,
472 skipped_file_records: skipped,
473 warnings,
474 submodule_summaries,
475 git_commit_short: git.commit_short,
476 git_commit_long: git.commit_long,
477 git_branch: git.branch,
478 git_commit_author: git.author,
479 git_tags: git.tags,
480 git_nearest_tag: git.nearest_tag,
481 git_commit_date: git.commit_date,
482 }
483}
484
485#[allow(clippy::too_many_lines)]
490pub fn analyze(
491 config: &AppConfig,
493 runtime_mode: &str,
494 cancel: Option<&AtomicBool>,
495) -> Result<AnalysisRun> {
496 config.validate()?;
497
498 if config.discovery.root_paths.is_empty() {
499 anyhow::bail!("no input paths were provided");
500 }
501
502 let include_globs = compile_globset(&config.discovery.include_globs)?;
503 let exclude_globs = compile_globset(&config.discovery.exclude_globs)?;
504 let enabled_languages = parse_enabled_languages(&config.analysis.enabled_languages)?;
505
506 let mut analyzed = Vec::new();
507 let mut skipped = Vec::new();
508 let mut warnings = Vec::new();
509 let mut seen_paths = HashSet::new();
510
511 for root in &config.discovery.root_paths {
512 if cancel.is_some_and(|c| c.load(Ordering::Relaxed)) {
513 anyhow::bail!("analysis cancelled");
514 }
515
516 let root = root.canonicalize().unwrap_or_else(|_| root.clone());
517
518 if root.is_file() {
519 if let Some(record) = analyze_candidate_file(
520 &root,
521 root.parent().unwrap_or_else(|| Path::new(".")),
522 config,
523 include_globs.as_ref(),
524 exclude_globs.as_ref(),
525 enabled_languages.as_ref(),
526 )? {
527 push_record(record, &mut analyzed, &mut skipped, &mut warnings);
528 }
529 continue;
530 }
531
532 walk_root(
533 &root,
534 config,
535 include_globs.as_ref(),
536 exclude_globs.as_ref(),
537 enabled_languages.as_ref(),
538 &mut seen_paths,
539 &mut analyzed,
540 &mut skipped,
541 &mut warnings,
542 cancel,
543 )?;
544 }
545
546 analyzed.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
547 skipped.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
548
549 let submodule_summaries = if config.discovery.submodule_breakdown {
551 process_submodules(config, &mut analyzed)
552 } else {
553 Vec::new()
554 };
555
556 if let Some(cov_path) =
559 coverage::resolve_coverage_file(config.analysis.coverage_file.as_deref())
560 {
561 match fs::read_to_string(&cov_path) {
562 Ok(content) => {
563 let cov_map = coverage::parse_coverage_auto(&cov_path, &content);
564 for record in &mut analyzed {
565 record.coverage =
566 coverage::lookup_coverage(&cov_map, &record.relative_path).cloned();
567 }
568 }
569 Err(e) => {
570 warnings.push(format!(
571 "coverage file '{}' could not be read: {e}",
572 cov_path.display()
573 ));
574 }
575 }
576 }
577
578 Ok(assemble_run(
579 config,
580 runtime_mode,
581 analyzed,
582 skipped,
583 warnings,
584 submodule_summaries,
585 ))
586}
587
588fn push_record(
589 record: FileRecord,
590 analyzed: &mut Vec<FileRecord>,
591 skipped: &mut Vec<FileRecord>,
592 warnings: &mut Vec<String>,
593) {
594 warnings.extend(
595 record
596 .warnings
597 .iter()
598 .map(|warning| format!("{}: {warning}", record.relative_path)),
599 );
600
601 match record.status {
602 FileStatus::AnalyzedExact | FileStatus::AnalyzedBestEffort => analyzed.push(record),
603 _ => skipped.push(record),
604 }
605}
606
607#[inline]
609fn skip_with_reason(
610 path: &Path,
611 root: &Path,
612 size: u64,
613 reason: impl Into<String>,
614) -> MetadataPolicyOutcome {
615 MetadataPolicyOutcome::Skip(Box::new(skipped_record(
616 path,
617 root,
618 size,
619 FileStatus::SkippedByPolicy,
620 vec![reason.into()],
621 )))
622}
623
624#[allow(clippy::too_many_arguments)]
628fn check_metadata_policy(
629 path: &Path,
630 root: &Path,
631 relative_path: &str,
632 metadata: &fs::Metadata,
633 config: &AppConfig,
634 include_globs: Option<&GlobSet>,
635 exclude_globs: Option<&GlobSet>,
636) -> MetadataPolicyOutcome {
637 let size = metadata.len();
638
639 if metadata.file_type().is_symlink() && !config.discovery.follow_symlinks {
640 return skip_with_reason(path, root, size, "symlink skipped by policy");
641 }
642 if file_name_eq(path, ".gitignore") {
643 return skip_with_reason(path, root, size, ".gitignore is always excluded");
644 }
645 if is_excluded_dir_path(path, &config.discovery.excluded_directories) {
646 return skip_with_reason(path, root, size, "path matched excluded directory setting");
647 }
648 if size > config.discovery.max_file_size_bytes {
649 return skip_with_reason(
650 path,
651 root,
652 size,
653 format!(
654 "file exceeded max_file_size_bytes ({})",
655 config.discovery.max_file_size_bytes
656 ),
657 );
658 }
659 if let Some(globs) = include_globs {
660 if !globs.is_match(Path::new(relative_path)) && !globs.is_match(path) {
661 return MetadataPolicyOutcome::Exclude;
662 }
663 }
664 if let Some(globs) = exclude_globs {
665 if globs.is_match(Path::new(relative_path)) || globs.is_match(path) {
666 return skip_with_reason(path, root, size, "path matched exclude glob");
667 }
668 }
669 if is_known_lockfile(path) && !config.analysis.include_lockfiles {
670 return skip_with_reason(path, root, size, "lockfile skipped by default policy");
671 }
672
673 MetadataPolicyOutcome::Continue
674}
675
/// Outcome of `check_content_policy`: the classification flags for a file and,
/// when a policy rejects it, the record to report instead of analyzing it.
struct ContentPolicyResult {
    /// Whether the path was classified as a vendor path.
    vendor: bool,
    /// Whether generated-file detection is enabled and flagged the file.
    generated: bool,
    /// Whether minified-file detection is enabled and flagged the file.
    minified: bool,
    /// `Some` when the file must be skipped; `None` when analysis should continue.
    skip_record: Option<FileRecord>,
}
682
683fn check_content_policy(
686 path: &Path,
687 root: &Path,
688 size_bytes: u64,
689 bytes: &[u8],
690 config: &AppConfig,
691) -> ContentPolicyResult {
692 let vendor = is_vendor_path(path);
693 if vendor && config.analysis.vendor_directory_detection {
694 return ContentPolicyResult {
695 vendor,
696 generated: false,
697 minified: false,
698 skip_record: Some(skipped_record(
699 path,
700 root,
701 size_bytes,
702 FileStatus::SkippedByPolicy,
703 vec!["vendor file skipped by policy".into()],
704 )),
705 };
706 }
707
708 let generated = config.analysis.generated_file_detection && looks_generated(path, bytes);
709 if generated {
710 return ContentPolicyResult {
711 vendor,
712 generated,
713 minified: false,
714 skip_record: Some(skipped_record(
715 path,
716 root,
717 size_bytes,
718 FileStatus::SkippedByPolicy,
719 vec!["generated file skipped by policy".into()],
720 )),
721 };
722 }
723
724 let minified = config.analysis.minified_file_detection && looks_minified(path, bytes);
725 if minified {
726 return ContentPolicyResult {
727 vendor,
728 generated,
729 minified,
730 skip_record: Some(skipped_record(
731 path,
732 root,
733 size_bytes,
734 FileStatus::SkippedByPolicy,
735 vec!["minified file skipped by policy".into()],
736 )),
737 };
738 }
739
740 ContentPolicyResult {
741 vendor,
742 generated,
743 minified,
744 skip_record: None,
745 }
746}
747
748fn decode_file_contents(
750 path: &Path,
751 root: &Path,
752 size_bytes: u64,
753 bytes: &[u8],
754 config: &AppConfig,
755) -> Result<Option<(String, String, Vec<String>)>> {
756 if is_binary(bytes) {
757 return match config.analysis.binary_file_behavior {
758 BinaryFileBehavior::Skip => Ok(None),
759 BinaryFileBehavior::Fail => {
760 anyhow::bail!("binary file encountered: {}", path.display())
761 }
762 };
763 }
764
765 match decode_bytes(bytes) {
766 Ok(result) => Ok(Some(result)),
767 Err(err) => match config.analysis.decode_failure_behavior {
768 FailureBehavior::WarnSkip => {
769 let _ = (path, root, size_bytes); Err(anyhow::anyhow!("__decode_warn__: {err}"))
774 }
775 FailureBehavior::Fail => {
776 anyhow::bail!("decode failure for {}: {err}", path.display())
777 }
778 },
779 }
780}
781
782#[allow(clippy::too_many_lines)]
783fn analyze_candidate_file(
784 path: &Path,
786 root: &Path,
787 config: &AppConfig,
788 include_globs: Option<&GlobSet>,
789 exclude_globs: Option<&GlobSet>,
790 enabled_languages: Option<&BTreeSet<Language>>,
791) -> Result<Option<FileRecord>> {
792 let metadata = match fs::symlink_metadata(path) {
793 Ok(metadata) => metadata,
794 Err(err) => {
795 return Ok(Some(skipped_record(
796 path,
797 root,
798 0,
799 FileStatus::ErrorInternal,
800 vec![format!("failed to read metadata: {err}")],
801 )));
802 }
803 };
804
805 let relative_path = relative_path_string(path, root);
806
807 match check_metadata_policy(
809 path,
810 root,
811 &relative_path,
812 &metadata,
813 config,
814 include_globs,
815 exclude_globs,
816 ) {
817 MetadataPolicyOutcome::Skip(record) => return Ok(Some(*record)),
818 MetadataPolicyOutcome::Exclude => return Ok(None),
819 MetadataPolicyOutcome::Continue => {}
820 }
821
822 let bytes = match fs::read(path) {
823 Ok(bytes) => bytes,
824 Err(err) => {
825 return Ok(Some(skipped_record(
826 path,
827 root,
828 metadata.len(),
829 FileStatus::ErrorInternal,
830 vec![format!("failed to read file: {err}")],
831 )));
832 }
833 };
834
835 let content_policy = check_content_policy(path, root, metadata.len(), &bytes, config);
837 if let Some(record) = content_policy.skip_record {
838 return Ok(Some(record));
839 }
840 let (vendor, generated, minified) = (
841 content_policy.vendor,
842 content_policy.generated,
843 content_policy.minified,
844 );
845
846 let (text, encoding, decode_warnings) =
848 match decode_file_contents(path, root, metadata.len(), &bytes, config) {
849 Ok(Some(result)) => result,
850 Ok(None) => {
851 return Ok(Some(skipped_record(
852 path,
853 root,
854 metadata.len(),
855 FileStatus::SkippedBinary,
856 vec!["binary file skipped by default".into()],
857 )));
858 }
859 Err(err) => {
860 let msg = err.to_string();
861 if let Some(warn_msg) = msg.strip_prefix("__decode_warn__: ") {
862 return Ok(Some(skipped_record(
863 path,
864 root,
865 metadata.len(),
866 FileStatus::SkippedDecodeError,
867 vec![warn_msg.to_string()],
868 )));
869 }
870 return Err(err);
871 }
872 };
873
874 let first_line = text.lines().next();
875 let language = detect_language(
876 path,
877 first_line,
878 &config.analysis.extension_overrides,
879 config.analysis.shebang_detection,
880 );
881
882 let Some(language) = language else {
883 return Ok(Some(skipped_record(
884 path,
885 root,
886 metadata.len(),
887 FileStatus::SkippedUnsupported,
888 vec!["unsupported or undetected language".into()],
889 )));
890 };
891
892 if let Some(enabled) = enabled_languages {
893 if !enabled.contains(&language) {
894 return Ok(Some(skipped_record(
895 path,
896 root,
897 metadata.len(),
898 FileStatus::SkippedByPolicy,
899 vec![format!(
900 "language {} disabled by configuration",
901 language.display_name()
902 )],
903 )));
904 }
905 }
906
907 let ieee_opts = AnalysisOptions {
908 blank_in_block_comment_as_comment: config.analysis.blank_in_block_comment_policy
909 == BlankInBlockCommentPolicy::CountAsComment,
910 collapse_continuation_lines: config.analysis.continuation_line_policy
911 == ContinuationLinePolicy::CollapseToLogical,
912 };
913 let analysis = analyze_text(language, &text, ieee_opts);
914 let effective_counts = compute_effective_counts(
915 &analysis.raw,
916 config.analysis.mixed_line_policy,
917 config.analysis.python_docstrings_as_comments,
918 config.analysis.count_compiler_directives,
919 );
920
921 let mut warnings = decode_warnings;
922 warnings.extend(analysis.warnings.clone());
923
924 Ok(Some(FileRecord {
925 path: path_to_string(path),
926 relative_path,
927 language: Some(language),
928 size_bytes: metadata.len(),
929 detected_encoding: Some(encoding),
930 raw_line_categories: analysis.raw,
931 effective_counts,
932 status: match analysis.parse_mode {
933 ParseMode::Lexical | ParseMode::TreeSitter => FileStatus::AnalyzedExact,
934 ParseMode::LexicalBestEffort => FileStatus::AnalyzedBestEffort,
935 },
936 warnings,
937 generated,
938 minified,
939 vendor,
940 parse_mode: Some(analysis.parse_mode),
941 submodule: None,
942 coverage: None,
943 }))
944}
945
946const fn compute_effective_counts(
947 raw: &RawLineCounts,
948 mixed_line_policy: MixedLinePolicy,
949 python_docstrings_as_comments: bool,
950 count_compiler_directives: bool,
951) -> EffectiveCounts {
952 let mut effective = EffectiveCounts {
953 code_lines: raw.code_only_lines,
954 comment_lines: raw.single_comment_only_lines + raw.multi_comment_only_lines,
955 blank_lines: raw.blank_only_lines,
956 mixed_lines_separate: 0,
957 };
958
959 if python_docstrings_as_comments {
960 effective.comment_lines += raw.docstring_comment_lines;
961 } else {
962 effective.code_lines += raw.docstring_comment_lines;
963 }
964
965 let mixed_total = raw.mixed_code_single_comment_lines + raw.mixed_code_multi_comment_lines;
966 match mixed_line_policy {
967 MixedLinePolicy::CodeOnly => effective.code_lines += mixed_total,
968 MixedLinePolicy::CodeAndComment => {
969 effective.code_lines += mixed_total;
970 effective.comment_lines += mixed_total;
971 }
972 MixedLinePolicy::CommentOnly => effective.comment_lines += mixed_total,
973 MixedLinePolicy::SeparateMixedCategory => effective.mixed_lines_separate += mixed_total,
974 }
975
976 if !count_compiler_directives {
979 effective.code_lines = effective
980 .code_lines
981 .saturating_sub(raw.compiler_directive_lines);
982 }
983
984 effective
985}
986
987fn build_summary(analyzed: &[FileRecord], skipped: &[FileRecord]) -> SummaryTotals {
988 let mut summary = SummaryTotals {
989 files_considered: (analyzed.len() + skipped.len()) as u64,
990 files_analyzed: analyzed.len() as u64,
991 files_skipped: skipped.len() as u64,
992 ..Default::default()
993 };
994
995 for record in analyzed {
996 summary.total_physical_lines += record.raw_line_categories.total_physical_lines;
997 summary.code_lines += record.effective_counts.code_lines;
998 summary.comment_lines += record.effective_counts.comment_lines;
999 summary.blank_lines += record.effective_counts.blank_lines;
1000 summary.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
1001 summary.functions += record.raw_line_categories.functions;
1002 summary.classes += record.raw_line_categories.classes;
1003 summary.variables += record.raw_line_categories.variables;
1004 summary.imports += record.raw_line_categories.imports;
1005 summary.test_count += record.raw_line_categories.test_count;
1006 summary.test_assertion_count += record.raw_line_categories.test_assertion_count;
1007 summary.test_suite_count += record.raw_line_categories.test_suite_count;
1008 if let Some(cov) = &record.coverage {
1009 summary.coverage_lines_found += u64::from(cov.lines_found);
1010 summary.coverage_lines_hit += u64::from(cov.lines_hit);
1011 summary.coverage_functions_found += u64::from(cov.functions_found);
1012 summary.coverage_functions_hit += u64::from(cov.functions_hit);
1013 summary.coverage_branches_found += u64::from(cov.branches_found);
1014 summary.coverage_branches_hit += u64::from(cov.branches_hit);
1015 }
1016 }
1017
1018 summary
1019}
1020
/// Creates an empty `LanguageSummary` for `language` with every counter set
/// to zero, ready to be filled by `accumulate_record_into_summary`.
const fn zeroed_summary(language: Language) -> LanguageSummary {
    // Spelled out field by field: `LanguageSummary` does not derive `Default`.
    LanguageSummary {
        language,
        files: 0,
        total_physical_lines: 0,
        code_lines: 0,
        comment_lines: 0,
        blank_lines: 0,
        mixed_lines_separate: 0,
        functions: 0,
        classes: 0,
        variables: 0,
        imports: 0,
        test_count: 0,
        test_assertion_count: 0,
        test_suite_count: 0,
        coverage_lines_found: 0,
        coverage_lines_hit: 0,
        coverage_functions_found: 0,
        coverage_functions_hit: 0,
        coverage_branches_found: 0,
        coverage_branches_hit: 0,
    }
}
1046
1047fn accumulate_record_into_summary(entry: &mut LanguageSummary, record: &FileRecord) {
1049 entry.files += 1;
1050 let r = &record.raw_line_categories;
1051 entry.total_physical_lines += r.total_physical_lines;
1052 entry.code_lines += record.effective_counts.code_lines;
1053 entry.comment_lines += record.effective_counts.comment_lines;
1054 entry.blank_lines += record.effective_counts.blank_lines;
1055 entry.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
1056 entry.functions += r.functions;
1057 entry.classes += r.classes;
1058 entry.variables += r.variables;
1059 entry.imports += r.imports;
1060 entry.test_count += r.test_count;
1061 entry.test_assertion_count += r.test_assertion_count;
1062 entry.test_suite_count += r.test_suite_count;
1063 if let Some(cov) = &record.coverage {
1064 entry.coverage_lines_found += u64::from(cov.lines_found);
1065 entry.coverage_lines_hit += u64::from(cov.lines_hit);
1066 entry.coverage_functions_found += u64::from(cov.functions_found);
1067 entry.coverage_functions_hit += u64::from(cov.functions_hit);
1068 entry.coverage_branches_found += u64::from(cov.branches_found);
1069 entry.coverage_branches_hit += u64::from(cov.branches_hit);
1070 }
1071}
1072
1073fn build_language_summaries(analyzed: &[FileRecord]) -> Vec<LanguageSummary> {
1074 let mut by_language: BTreeMap<Language, LanguageSummary> = BTreeMap::new();
1075 for record in analyzed {
1076 let Some(language) = record.language else {
1077 continue;
1078 };
1079 let entry = by_language
1080 .entry(language)
1081 .or_insert_with(|| zeroed_summary(language));
1082 accumulate_record_into_summary(entry, record);
1083 }
1084 by_language.into_values().collect()
1085}
1086
1087fn skipped_record(
1088 path: &Path,
1089 root: &Path,
1090 size_bytes: u64,
1091 status: FileStatus,
1092 warnings: Vec<String>,
1093) -> FileRecord {
1094 FileRecord {
1095 path: path_to_string(path),
1096 relative_path: relative_path_string(path, root),
1097 language: None,
1098 size_bytes,
1099 detected_encoding: None,
1100 raw_line_categories: RawLineCounts::default(),
1101 effective_counts: EffectiveCounts::default(),
1102 status,
1103 warnings,
1104 generated: false,
1105 minified: false,
1106 vendor: false,
1107 parse_mode: None,
1108 submodule: None,
1109 coverage: None,
1110 }
1111}
1112
/// Returns `path` relative to `root` as a string with backslashes replaced by
/// `/`. When `path` is not below `root`, the whole path is returned instead.
/// Non-UTF-8 components are converted lossily.
fn relative_path_string(path: &Path, root: &Path) -> String {
    let relative = match path.strip_prefix(root) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };
    let text = relative.to_string_lossy();
    text.replace('\\', "/")
}
1119
/// Renders `path` as a string with every backslash replaced by `/`.
/// Non-UTF-8 components are converted lossily.
fn path_to_string(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
1123
/// Reads `<root>/.gitmodules` and returns the `(name, path)` pair of every
/// submodule that declares both.
///
/// A section starts at a line of the form `[submodule "<name>"]`. Its path is
/// taken from a `path = <value>` line; if several are present the last one
/// wins. Sections without a path are dropped. Pairs are returned in file
/// order. Returns an empty vector when the file is missing or unreadable.
///
/// Malformed header lines such as `[submodule "]` are ignored, and only the
/// key `path` itself (not keys that merely start with `path`) sets the path.
#[must_use]
pub fn detect_submodules(root: &Path) -> Vec<(String, PathBuf)> {
    let gitmodules = root.join(".gitmodules");
    if !gitmodules.is_file() {
        return Vec::new();
    }
    let Ok(content) = fs::read_to_string(&gitmodules) else {
        return Vec::new();
    };

    let mut result = Vec::new();
    let mut current_name: Option<String> = None;
    let mut current_path: Option<PathBuf> = None;

    for line in content.lines() {
        let trimmed = line.trim();

        // Strip prefix and suffix one after the other so that they can never
        // overlap; slicing by fixed offsets panics on `[submodule "]`.
        let header = trimmed
            .strip_prefix("[submodule \"")
            .and_then(|rest| rest.strip_suffix("\"]"));

        if let Some(name) = header {
            // A new section begins: emit the previous one if it was complete.
            if let (Some(prev_name), Some(prev_path)) = (current_name.take(), current_path.take())
            {
                result.push((prev_name, prev_path));
            }
            current_name = Some(name.to_string());
            continue;
        }

        // Accept `path=...` and `path = ...`, but not other keys that merely
        // begin with "path".
        let path_value = trimmed
            .strip_prefix("path")
            .and_then(|rest| rest.trim_start().strip_prefix('='));
        if let Some(value) = path_value {
            current_path = Some(PathBuf::from(value.trim()));
        }
    }

    if let (Some(name), Some(path)) = (current_name, current_path) {
        result.push((name, path));
    }

    result
}
1160
1161fn build_submodule_summaries(
1162 analyzed: &[FileRecord],
1163 submodules: &[(String, PathBuf)],
1164) -> Vec<SubmoduleSummary> {
1165 submodules
1166 .iter()
1167 .map(|(name, path)| {
1168 let files: Vec<&FileRecord> = analyzed
1169 .iter()
1170 .filter(|f| f.submodule.as_deref() == Some(name.as_str()))
1171 .collect();
1172
1173 let files_analyzed = files.len() as u64;
1174 let total_physical_lines = files
1175 .iter()
1176 .map(|f| f.raw_line_categories.total_physical_lines)
1177 .sum();
1178 let code_lines = files.iter().map(|f| f.effective_counts.code_lines).sum();
1179 let comment_lines = files.iter().map(|f| f.effective_counts.comment_lines).sum();
1180 let blank_lines = files.iter().map(|f| f.effective_counts.blank_lines).sum();
1181 let language_summaries = build_language_summaries_from_slice(&files);
1182
1183 SubmoduleSummary {
1184 name: name.clone(),
1185 relative_path: path.to_string_lossy().replace('\\', "/"),
1186 files_analyzed,
1187 total_physical_lines,
1188 code_lines,
1189 comment_lines,
1190 blank_lines,
1191 language_summaries,
1192 }
1193 })
1194 .filter(|s| s.files_analyzed > 0)
1195 .collect()
1196}
1197
1198fn build_language_summaries_from_slice(files: &[&FileRecord]) -> Vec<LanguageSummary> {
1199 let mut map: BTreeMap<String, LanguageSummary> = BTreeMap::new();
1200 for file in files {
1201 let Some(lang) = file.language else { continue };
1202 let entry = map
1203 .entry(lang.display_name().to_string())
1204 .or_insert_with(|| zeroed_summary(lang));
1205 accumulate_record_into_summary(entry, file);
1206 }
1207 map.into_values().collect()
1208}
1209
/// Returns `true` when the final component of `path` is valid UTF-8 and
/// equals `expected` exactly (case-sensitive).
fn file_name_eq(path: &Path, expected: &str) -> bool {
    let actual = path.file_name().and_then(|os_name| os_name.to_str());
    actual == Some(expected)
}
1215
/// Returns `true` when any component of `path` exactly matches one of the
/// names in `excluded_dirs`.
///
/// Every component is checked, including the final one. Components that are
/// not valid UTF-8 never match.
fn is_excluded_dir_path(path: &Path, excluded_dirs: &[String]) -> bool {
    for component in path.components() {
        let Some(part) = component.as_os_str().to_str() else {
            continue;
        };
        if excluded_dirs.iter().any(|excluded| excluded == part) {
            return true;
        }
    }
    false
}
1224
/// Returns `true` when any component of `path` is named `vendor`,
/// `node_modules` or `packages`.
///
/// The comparison is exact and case-sensitive; components that are not valid
/// UTF-8 never match.
fn is_vendor_path(path: &Path) -> bool {
    const VENDOR_DIR_NAMES: [&str; 3] = ["vendor", "node_modules", "packages"];

    path.components()
        .filter_map(|component| component.as_os_str().to_str())
        .any(|part| VENDOR_DIR_NAMES.contains(&part))
}
1233
/// Returns `true` when the file name of `path` is one of the dependency
/// lockfile names listed in `LOCKFILE_NAMES`.
///
/// Only the final path component is inspected, and the match is exact and
/// case-sensitive.
fn is_known_lockfile(path: &Path) -> bool {
    const LOCKFILE_NAMES: [&str; 7] = [
        "Cargo.lock",
        "package-lock.json",
        "yarn.lock",
        "pnpm-lock.yaml",
        "Pipfile.lock",
        "poetry.lock",
        "composer.lock",
    ];

    match path.file_name().and_then(|os_name| os_name.to_str()) {
        Some(name) => LOCKFILE_NAMES.contains(&name),
        None => false,
    }
}
1250
1251fn looks_generated(path: &Path, bytes: &[u8]) -> bool {
1252 let file_name = path
1253 .file_name()
1254 .and_then(|name| name.to_str())
1255 .unwrap_or_default();
1256 if file_name.contains(".generated.") || file_name.contains(".g.") {
1257 return true;
1258 }
1259
1260 let sample = String::from_utf8_lossy(&bytes[..bytes.len().min(GENERATED_SAMPLE_BYTES)])
1261 .to_ascii_lowercase();
1262 sample.contains("@generated") || sample.contains("generated by")
1263}
1264
1265fn looks_minified(path: &Path, bytes: &[u8]) -> bool {
1266 let file_name = path
1267 .file_name()
1268 .and_then(|name| name.to_str())
1269 .unwrap_or_default();
1270 if file_name.contains(".min.") {
1271 return true;
1272 }
1273
1274 let sample = String::from_utf8_lossy(&bytes[..bytes.len().min(MINIFIED_SAMPLE_BYTES)]);
1275 let longest_line = sample.lines().map(str::len).max().unwrap_or(0);
1276 let whitespace = sample.chars().filter(|c| c.is_whitespace()).count();
1277 longest_line > MINIFIED_LINE_THRESHOLD && whitespace * 100 < sample.len().max(1)
1278}
1279
1280fn is_binary(bytes: &[u8]) -> bool {
1281 if bytes.starts_with(&[0xEF, 0xBB, 0xBF])
1282 || bytes.starts_with(&[0xFF, 0xFE])
1283 || bytes.starts_with(&[0xFE, 0xFF])
1284 {
1285 return false;
1286 }
1287
1288 let sample = &bytes[..bytes.len().min(BINARY_SAMPLE_BYTES)];
1289 sample.contains(&0)
1290}
1291
1292fn decode_utf16_bom(
1295 bom_stripped: &[u8],
1296 encoding: &'static encoding_rs::Encoding,
1297 label: &str,
1298) -> (String, String, Vec<String>) {
1299 let (cow, _, had_errors) = encoding.decode(bom_stripped);
1300 let mut warnings = Vec::new();
1301 if had_errors {
1302 warnings.push(format!("{label} decode contained replacement characters"));
1303 }
1304 (cow.into_owned(), label.into(), warnings)
1305}
1306
1307fn decode_bytes(bytes: &[u8]) -> std::result::Result<(String, String, Vec<String>), String> {
1308 if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
1309 let text = String::from_utf8(bytes[3..].to_vec()).map_err(|err| err.to_string())?;
1310 return Ok((text, "utf-8-bom".into(), vec![]));
1311 }
1312 if bytes.starts_with(&[0xFF, 0xFE]) {
1313 return Ok(decode_utf16_bom(&bytes[2..], UTF_16LE, "utf-16le"));
1314 }
1315 if bytes.starts_with(&[0xFE, 0xFF]) {
1316 return Ok(decode_utf16_bom(&bytes[2..], UTF_16BE, "utf-16be"));
1317 }
1318
1319 #[allow(clippy::option_if_let_else)]
1321 if let Ok(text) = String::from_utf8(bytes.to_vec()) {
1322 Ok((text, "utf-8".into(), vec![]))
1323 } else {
1324 let (cow, _, had_errors) = WINDOWS_1252.decode(bytes);
1325 let mut warnings = vec!["decoded using windows-1252 fallback".into()];
1326 if had_errors {
1327 warnings.push("fallback decode contained replacement characters".into());
1328 }
1329 Ok((cow.into_owned(), "windows-1252".into(), warnings))
1330 }
1331}
1332
1333fn compile_globset(patterns: &[String]) -> Result<Option<GlobSet>> {
1334 if patterns.is_empty() {
1335 return Ok(None);
1336 }
1337
1338 let mut builder = GlobSetBuilder::new();
1339 for pattern in patterns {
1340 builder
1341 .add(Glob::new(pattern).with_context(|| format!("invalid glob pattern: {pattern}"))?);
1342 }
1343 Ok(Some(
1344 builder.build().context("failed to compile glob filters")?,
1345 ))
1346}
1347
1348fn parse_enabled_languages(enabled: &[String]) -> Result<Option<BTreeSet<Language>>> {
1349 if enabled.is_empty() {
1350 return Ok(None);
1351 }
1352
1353 let supported = supported_languages();
1354 let mut set = BTreeSet::new();
1355 for name in enabled {
1356 let language = Language::from_name(name)
1357 .with_context(|| format!("unsupported language in config: {name}"))?;
1358 if !supported.contains(&language) {
1359 anyhow::bail!("language {name} is not supported in this build");
1360 }
1361 set.insert(language);
1362 }
1363 Ok(Some(set))
1364}
1365
1366pub fn write_json(run: &AnalysisRun, output_path: &Path) -> Result<()> {
1370 let json = serde_json::to_string_pretty(run).context("failed to serialize analysis run")?;
1371 fs::write(output_path, json)
1372 .with_context(|| format!("failed to write JSON output to {}", output_path.display()))
1373}
1374
1375pub fn read_json(path: &Path) -> Result<AnalysisRun> {
1379 let contents = fs::read_to_string(path)
1380 .with_context(|| format!("failed to read result file {}", path.display()))?;
1381 serde_json::from_str(&contents)
1382 .with_context(|| format!("failed to parse JSON result {}", path.display()))
1383}
1384
#[cfg(test)]
mod tests {
    use super::*;

    /// Under `MixedLinePolicy::CodeOnly` the test expects 5 code lines
    /// (2 code-only + 3 mixed) and 3 comment lines (1 comment-only +
    /// 2 docstring).
    #[test]
    fn effective_counts_respect_code_only_policy() {
        let raw_counts = RawLineCounts {
            docstring_comment_lines: 2,
            mixed_code_single_comment_lines: 3,
            single_comment_only_lines: 1,
            code_only_lines: 2,
            ..RawLineCounts::default()
        };

        let effective =
            compute_effective_counts(&raw_counts, MixedLinePolicy::CodeOnly, true, true);

        assert_eq!(effective.code_lines, 5);
        assert_eq!(effective.comment_lines, 3);
    }

    /// Under `MixedLinePolicy::SeparateMixedCategory` all three mixed lines
    /// are expected in their own bucket, with code and comment totals at zero.
    #[test]
    fn effective_counts_can_separate_mixed() {
        let raw_counts = RawLineCounts {
            mixed_code_multi_comment_lines: 1,
            mixed_code_single_comment_lines: 2,
            ..RawLineCounts::default()
        };

        let effective = compute_effective_counts(
            &raw_counts,
            MixedLinePolicy::SeparateMixedCategory,
            true,
            true,
        );

        assert_eq!(effective.mixed_lines_separate, 3);
        assert_eq!(effective.code_lines, 0);
        assert_eq!(effective.comment_lines, 0);
    }

    /// `Hello`, then byte 0x96, then `W`: not valid UTF-8, so the
    /// windows-1252 fallback is expected, yielding an en dash and a warning.
    #[test]
    fn windows_1252_fallback_decodes() {
        let input: [u8; 7] = [0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x96, 0x57];

        let (decoded, encoding_label, notes) = decode_bytes(&input).unwrap();

        assert_eq!(encoding_label, "windows-1252");
        assert!(decoded.contains('–'));
        assert!(!notes.is_empty());
    }
}