1pub mod delta;
5pub mod history;
6pub use delta::{compute_delta, FileChangeStatus, FileDelta, ScanComparison, SummaryDelta};
7pub use history::{RegistryEntry, ScanRegistry, ScanSummarySnapshot};
8
9use std::collections::{BTreeMap, BTreeSet, HashSet};
10use std::fs;
11use std::path::{Path, PathBuf};
12
13use anyhow::{Context, Result};
14use chrono::{DateTime, Utc};
15use encoding_rs::{UTF_16BE, UTF_16LE, WINDOWS_1252};
16use globset::{Glob, GlobSet, GlobSetBuilder};
17use ignore::WalkBuilder;
18use serde::{Deserialize, Serialize};
19use uuid::Uuid;
20
21use sloc_config::{
22 AppConfig, BinaryFileBehavior, BlankInBlockCommentPolicy, ContinuationLinePolicy,
23 FailureBehavior, MixedLinePolicy,
24};
25use sloc_languages::{
26 analyze_text, detect_language, supported_languages, AnalysisOptions, Language, ParseMode,
27 RawLineCounts,
28};
29
/// Final outcome recorded for each file the scanner looked at.
///
/// Serialized in `snake_case` (for example `analyzed_exact`).
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FileStatus {
    /// Analyzed with a parse mode of `Lexical` or `TreeSitter`.
    AnalyzedExact,
    /// Analyzed with a parse mode of `LexicalBestEffort`.
    AnalyzedBestEffort,
    /// Content looked binary and the binary-file behavior was `Skip`.
    SkippedBinary,
    /// Decoding failed and the decode-failure behavior was `WarnSkip`.
    SkippedDecodeError,
    /// No language could be detected for the file.
    SkippedUnsupported,
    /// Excluded by a policy rule: symlink, `.gitignore`, excluded directory,
    /// size limit, exclude glob, lockfile, vendor/generated/minified
    /// detection, or a language disabled in the configuration.
    SkippedByPolicy,
    /// Metadata or file contents could not be read.
    ErrorInternal,
}
41
/// Line counts for one file after the configured counting policies
/// (mixed-line policy, docstring handling, compiler directives) are applied.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct EffectiveCounts {
    /// Lines counted as code.
    pub code_lines: u64,
    /// Lines counted as comments.
    pub comment_lines: u64,
    /// Blank lines.
    pub blank_lines: u64,
    /// Mixed code+comment lines; only non-zero under
    /// `MixedLinePolicy::SeparateMixedCategory`.
    pub mixed_lines_separate: u64,
}
49
/// Identifies the tool build and the specific run that produced a report.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolMetadata {
    /// Tool name; `analyze` sets this to `"sloc"`.
    pub name: String,
    /// Crate version taken from `CARGO_PKG_VERSION` at compile time.
    pub version: String,
    /// Run identifier: `YYYYMMDD-HHMM` of the run time followed by `-` and a
    /// random v4 UUID in simple (undashed) form.
    pub run_id: String,
    /// Time at which the run record was assembled (UTC).
    pub timestamp_utc: DateTime<Utc>,
}
57
/// Describes the machine and invocation context of a run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnvironmentMetadata {
    /// Value of `std::env::consts::OS`.
    pub operating_system: String,
    /// Value of `std::env::consts::ARCH`.
    pub architecture: String,
    /// Caller-supplied label passed to `analyze` as `runtime_mode`.
    pub runtime_mode: String,
    /// User name from the `USERNAME` or `USER` environment variable, or
    /// `"unknown"` when neither is set.
    pub initiator_username: String,
    /// Host name from `COMPUTERNAME`, `HOSTNAME`, or `/etc/hostname`, or
    /// `"unknown"` when none is available.
    pub initiator_hostname: String,
}
66
/// Aggregate totals across an entire run.
///
/// Line and symbol counts are summed over analyzed files only; skipped files
/// contribute to the file counters alone.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SummaryTotals {
    /// Analyzed plus skipped files.
    pub files_considered: u64,
    /// Number of files that were analyzed.
    pub files_analyzed: u64,
    /// Number of files that were skipped or failed.
    pub files_skipped: u64,
    /// Sum of raw physical line counts.
    pub total_physical_lines: u64,
    /// Sum of effective code lines.
    pub code_lines: u64,
    /// Sum of effective comment lines.
    pub comment_lines: u64,
    /// Sum of effective blank lines.
    pub blank_lines: u64,
    /// Sum of mixed lines reported as a separate category.
    pub mixed_lines_separate: u64,
    /// Sum of per-file function counts; defaults to 0 when absent on input.
    #[serde(default)]
    pub functions: u64,
    /// Sum of per-file class counts; defaults to 0 when absent on input.
    #[serde(default)]
    pub classes: u64,
    /// Sum of per-file variable counts; defaults to 0 when absent on input.
    #[serde(default)]
    pub variables: u64,
    /// Sum of per-file import counts; defaults to 0 when absent on input.
    #[serde(default)]
    pub imports: u64,
}
86
/// Totals for all analyzed files of a single language.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageSummary {
    /// Language these totals belong to.
    pub language: Language,
    /// Number of analyzed files in this language.
    pub files: u64,
    /// Sum of raw physical line counts.
    pub total_physical_lines: u64,
    /// Sum of effective code lines.
    pub code_lines: u64,
    /// Sum of effective comment lines.
    pub comment_lines: u64,
    /// Sum of effective blank lines.
    pub blank_lines: u64,
    /// Sum of mixed lines reported as a separate category.
    pub mixed_lines_separate: u64,
    /// Sum of per-file function counts; defaults to 0 when absent on input.
    #[serde(default)]
    pub functions: u64,
    /// Sum of per-file class counts; defaults to 0 when absent on input.
    #[serde(default)]
    pub classes: u64,
    /// Sum of per-file variable counts; defaults to 0 when absent on input.
    #[serde(default)]
    pub variables: u64,
    /// Sum of per-file import counts; defaults to 0 when absent on input.
    #[serde(default)]
    pub imports: u64,
}
105
/// Everything recorded about a single file, whether analyzed or skipped.
///
/// Skipped records carry default counts and `None` for language, encoding
/// and parse mode.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileRecord {
    /// Path as discovered, with backslashes normalized to `/`.
    pub path: String,
    /// Path relative to the scan root (or the full path when it is not under
    /// the root), with backslashes normalized to `/`.
    pub relative_path: String,
    /// Detected language, if any.
    pub language: Option<Language>,
    /// File size in bytes as reported by file metadata.
    pub size_bytes: u64,
    /// Encoding label chosen while decoding (`utf-8`, `utf-8-bom`,
    /// `utf-16le`, `utf-16be`, or `windows-1252`).
    pub detected_encoding: Option<String>,
    /// Raw per-category line counts produced by the language analyzer.
    pub raw_line_categories: RawLineCounts,
    /// Counts after the configured policies are applied.
    pub effective_counts: EffectiveCounts,
    /// Outcome for this file.
    pub status: FileStatus,
    /// Decode and analysis warnings (or the skip reason for skipped files).
    pub warnings: Vec<String>,
    /// Whether the generated-file heuristic matched.
    pub generated: bool,
    /// Whether the minified-file heuristic matched.
    pub minified: bool,
    /// Whether the path contains a vendor-style directory component.
    pub vendor: bool,
    /// Parse mode reported by the analyzer, when the file was analyzed.
    pub parse_mode: Option<ParseMode>,
    /// Name of the git submodule containing this file; omitted from the
    /// serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub submodule: Option<String>,
}
124
/// Totals for the analyzed files that belong to one git submodule.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SubmoduleSummary {
    /// Submodule name as written in the `[submodule "..."]` header.
    pub name: String,
    /// Submodule `path` value from `.gitmodules`, with `/` separators.
    pub relative_path: String,
    /// Number of analyzed files tagged with this submodule.
    pub files_analyzed: u64,
    /// Sum of raw physical line counts.
    pub total_physical_lines: u64,
    /// Sum of effective code lines.
    pub code_lines: u64,
    /// Sum of effective comment lines.
    pub comment_lines: u64,
    /// Sum of effective blank lines.
    pub blank_lines: u64,
    /// Per-language breakdown of the same files.
    pub language_summaries: Vec<LanguageSummary>,
}
137
/// Complete result of one analysis run; this is the document written by
/// `write_json` and read back by `read_json`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnalysisRun {
    /// Tool identity and run identifier.
    pub tool: ToolMetadata,
    /// Host and invocation context.
    pub environment: EnvironmentMetadata,
    /// Copy of the configuration the run was executed with.
    pub effective_configuration: AppConfig,
    /// Configured root paths, as strings with `/` separators.
    pub input_roots: Vec<String>,
    /// Totals across all files.
    pub summary_totals: SummaryTotals,
    /// Totals grouped by language.
    pub totals_by_language: Vec<LanguageSummary>,
    /// Records for analyzed files, sorted by relative path.
    pub per_file_records: Vec<FileRecord>,
    /// Records for skipped or failed files, sorted by relative path.
    pub skipped_file_records: Vec<FileRecord>,
    /// Discovery warnings plus per-file warnings prefixed with the file's
    /// relative path.
    pub warnings: Vec<String>,
    /// Per-submodule totals; omitted from the output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub submodule_summaries: Vec<SubmoduleSummary>,
    /// Output of `git rev-parse --short HEAD` in the first root, if available.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_short: Option<String>,
    /// Output of `git rev-parse HEAD` in the first root, if available.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_long: Option<String>,
    /// Output of `git branch --show-current` in the first root, if available.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_branch: Option<String>,
    /// Author name of the latest commit (`git log --format=%an -1`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_commit_author: Option<String>,
    /// Tags pointing at `HEAD`, joined with `", "`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git_tags: Option<String>,
}
168
/// Runs `git` with `args` inside `dir` and returns its trimmed stdout.
///
/// Returns `None` when the process cannot be started, exits unsuccessfully,
/// prints non-UTF-8 output, or prints nothing but whitespace.
fn run_git_in(dir: &Path, args: &[&str]) -> Option<String> {
    let output = std::process::Command::new("git")
        .args(args)
        .current_dir(dir)
        .output()
        .ok()?;

    if !output.status.success() {
        return None;
    }

    let stdout = String::from_utf8(output.stdout).ok()?;
    let trimmed = stdout.trim();
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}
180
/// Git details gathered for a run; every field is `None` when the matching
/// git command produced no usable output.
#[derive(Default)]
struct GitInfo {
    /// Abbreviated hash of `HEAD`.
    commit_short: Option<String>,
    /// Full hash of `HEAD`.
    commit_long: Option<String>,
    /// Currently checked-out branch name.
    branch: Option<String>,
    /// Author name of the latest commit.
    author: Option<String>,
    /// Tags pointing at `HEAD`, joined with `", "`.
    tags: Option<String>,
}
189
190fn detect_git_for_run(project_path: &Path) -> GitInfo {
191 GitInfo {
192 commit_short: run_git_in(project_path, &["rev-parse", "--short", "HEAD"]),
193 commit_long: run_git_in(project_path, &["rev-parse", "HEAD"]),
194 branch: run_git_in(project_path, &["branch", "--show-current"]),
195 author: run_git_in(project_path, &["log", "--format=%an", "-1"]),
196 tags: run_git_in(project_path, &["tag", "--points-at", "HEAD"]).map(|t| {
197 t.lines()
198 .filter(|l| !l.is_empty())
199 .collect::<Vec<_>>()
200 .join(", ")
201 }),
202 }
203}
204
/// Returns the invoking user's name from `USERNAME`, then `USER`, falling
/// back to `"unknown"` when neither variable can be read.
fn get_current_username() -> String {
    if let Ok(name) = std::env::var("USERNAME") {
        return name;
    }
    match std::env::var("USER") {
        Ok(name) => name,
        Err(_) => "unknown".to_string(),
    }
}
210
/// Returns the machine's host name.
///
/// Sources are tried in order: the `COMPUTERNAME` variable, the `HOSTNAME`
/// variable, then the trimmed contents of `/etc/hostname`. When all of them
/// fail the result is `"unknown"`.
fn get_hostname() -> String {
    if let Ok(name) = std::env::var("COMPUTERNAME") {
        return name;
    }
    if let Ok(name) = std::env::var("HOSTNAME") {
        return name;
    }
    match std::fs::read_to_string("/etc/hostname") {
        Ok(contents) => contents.trim().to_string(),
        Err(_) => "unknown".to_string(),
    }
}
217
218pub fn analyze(config: &AppConfig, runtime_mode: &str) -> Result<AnalysisRun> {
219 config.validate()?;
220
221 if config.discovery.root_paths.is_empty() {
222 anyhow::bail!("no input paths were provided");
223 }
224
225 let include_globs = compile_globset(&config.discovery.include_globs)?;
226 let exclude_globs = compile_globset(&config.discovery.exclude_globs)?;
227 let enabled_languages = parse_enabled_languages(&config.analysis.enabled_languages)?;
228
229 let mut analyzed = Vec::new();
230 let mut skipped = Vec::new();
231 let mut warnings = Vec::new();
232 let mut seen_paths = HashSet::new();
233
234 for root in &config.discovery.root_paths {
235 let root = root.canonicalize().unwrap_or_else(|_| root.to_path_buf());
236
237 if root.is_file() {
238 if let Some(record) = analyze_candidate_file(
239 &root,
240 root.parent().unwrap_or(Path::new(".")),
241 config,
242 &include_globs,
243 &exclude_globs,
244 &enabled_languages,
245 )? {
246 push_record(record, &mut analyzed, &mut skipped, &mut warnings);
247 }
248 continue;
249 }
250
251 let mut builder = WalkBuilder::new(&root);
252 builder
253 .follow_links(config.discovery.follow_symlinks)
254 .hidden(config.discovery.ignore_hidden_files)
255 .ignore(config.discovery.honor_ignore_files)
256 .parents(config.discovery.honor_ignore_files)
257 .git_ignore(config.discovery.honor_ignore_files)
258 .git_global(config.discovery.honor_ignore_files)
259 .git_exclude(config.discovery.honor_ignore_files);
260
261 for entry in builder.build() {
262 let entry = match entry {
263 Ok(entry) => entry,
264 Err(err) => {
265 warnings.push(format!("discovery warning: {err}"));
266 continue;
267 }
268 };
269
270 let path = entry.into_path();
271 if path.is_dir() {
272 continue;
273 }
274 if !seen_paths.insert(path.clone()) {
275 continue;
276 }
277
278 if let Some(record) = analyze_candidate_file(
279 &path,
280 &root,
281 config,
282 &include_globs,
283 &exclude_globs,
284 &enabled_languages,
285 )? {
286 push_record(record, &mut analyzed, &mut skipped, &mut warnings);
287 }
288 }
289 }
290
291 analyzed.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
292 skipped.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
293
294 let submodule_summaries = if config.discovery.submodule_breakdown {
296 let root = config.discovery.root_paths[0]
297 .canonicalize()
298 .unwrap_or_else(|_| config.discovery.root_paths[0].clone());
299 let submodules = detect_submodules(&root);
300 if !submodules.is_empty() {
301 for file in &mut analyzed {
302 for (name, sub_path) in &submodules {
303 let prefix = sub_path.to_string_lossy().replace('\\', "/");
304 let rel = &file.relative_path;
305 if rel == &prefix || rel.starts_with(&format!("{prefix}/")) {
306 file.submodule = Some(name.clone());
307 break;
308 }
309 }
310 }
311 build_submodule_summaries(&analyzed, &submodules)
312 } else {
313 Vec::new()
314 }
315 } else {
316 Vec::new()
317 };
318
319 let summary = build_summary(&analyzed, &skipped);
320 let language_summaries = build_language_summaries(&analyzed);
321
322 let first_root = config
324 .discovery
325 .root_paths
326 .first()
327 .map(|p| p.canonicalize().unwrap_or_else(|_| p.to_path_buf()));
328 let git = first_root
329 .as_deref()
330 .map(detect_git_for_run)
331 .unwrap_or_default();
332
333 let now = Utc::now();
334 let run_id = {
335 let uuid_suffix = Uuid::new_v4().simple().to_string();
336 format!("{}-{}", now.format("%Y%m%d-%H%M"), uuid_suffix)
337 };
338
339 Ok(AnalysisRun {
340 tool: ToolMetadata {
341 name: "sloc".into(),
342 version: env!("CARGO_PKG_VERSION").into(),
343 run_id,
344 timestamp_utc: now,
345 },
346 environment: EnvironmentMetadata {
347 operating_system: std::env::consts::OS.into(),
348 architecture: std::env::consts::ARCH.into(),
349 runtime_mode: runtime_mode.into(),
350 initiator_username: get_current_username(),
351 initiator_hostname: get_hostname(),
352 },
353 effective_configuration: config.clone(),
354 input_roots: config
355 .discovery
356 .root_paths
357 .iter()
358 .map(|p| path_to_string(p))
359 .collect(),
360 summary_totals: summary,
361 totals_by_language: language_summaries,
362 per_file_records: analyzed,
363 skipped_file_records: skipped,
364 warnings,
365 submodule_summaries,
366 git_commit_short: git.commit_short,
367 git_commit_long: git.commit_long,
368 git_branch: git.branch,
369 git_commit_author: git.author,
370 git_tags: git.tags,
371 })
372}
373
374fn push_record(
375 record: FileRecord,
376 analyzed: &mut Vec<FileRecord>,
377 skipped: &mut Vec<FileRecord>,
378 warnings: &mut Vec<String>,
379) {
380 warnings.extend(
381 record
382 .warnings
383 .iter()
384 .map(|warning| format!("{}: {warning}", record.relative_path)),
385 );
386
387 match record.status {
388 FileStatus::AnalyzedExact | FileStatus::AnalyzedBestEffort => analyzed.push(record),
389 _ => skipped.push(record),
390 }
391}
392
393fn analyze_candidate_file(
394 path: &Path,
395 root: &Path,
396 config: &AppConfig,
397 include_globs: &Option<GlobSet>,
398 exclude_globs: &Option<GlobSet>,
399 enabled_languages: &Option<BTreeSet<Language>>,
400) -> Result<Option<FileRecord>> {
401 let metadata = match fs::symlink_metadata(path) {
402 Ok(metadata) => metadata,
403 Err(err) => {
404 return Ok(Some(skipped_record(
405 path,
406 root,
407 0,
408 FileStatus::ErrorInternal,
409 vec![format!("failed to read metadata: {err}")],
410 )));
411 }
412 };
413
414 if metadata.file_type().is_symlink() && !config.discovery.follow_symlinks {
415 return Ok(Some(skipped_record(
416 path,
417 root,
418 metadata.len(),
419 FileStatus::SkippedByPolicy,
420 vec!["symlink skipped by policy".into()],
421 )));
422 }
423
424 let relative_path = relative_path_string(path, root);
425
426 if file_name_eq(path, ".gitignore") {
427 return Ok(Some(skipped_record(
428 path,
429 root,
430 metadata.len(),
431 FileStatus::SkippedByPolicy,
432 vec![".gitignore is always excluded".into()],
433 )));
434 }
435
436 if is_excluded_dir_path(path, &config.discovery.excluded_directories) {
437 return Ok(Some(skipped_record(
438 path,
439 root,
440 metadata.len(),
441 FileStatus::SkippedByPolicy,
442 vec!["path matched excluded directory setting".into()],
443 )));
444 }
445
446 if metadata.len() > config.discovery.max_file_size_bytes {
447 return Ok(Some(skipped_record(
448 path,
449 root,
450 metadata.len(),
451 FileStatus::SkippedByPolicy,
452 vec![format!(
453 "file exceeded max_file_size_bytes ({})",
454 config.discovery.max_file_size_bytes
455 )],
456 )));
457 }
458
459 if let Some(globs) = include_globs {
460 if !globs.is_match(Path::new(&relative_path)) && !globs.is_match(path) {
461 return Ok(None);
462 }
463 }
464
465 if let Some(globs) = exclude_globs {
466 if globs.is_match(Path::new(&relative_path)) || globs.is_match(path) {
467 return Ok(Some(skipped_record(
468 path,
469 root,
470 metadata.len(),
471 FileStatus::SkippedByPolicy,
472 vec!["path matched exclude glob".into()],
473 )));
474 }
475 }
476
477 if is_known_lockfile(path) && !config.analysis.include_lockfiles {
478 return Ok(Some(skipped_record(
479 path,
480 root,
481 metadata.len(),
482 FileStatus::SkippedByPolicy,
483 vec!["lockfile skipped by default policy".into()],
484 )));
485 }
486
487 let bytes = match fs::read(path) {
488 Ok(bytes) => bytes,
489 Err(err) => {
490 return Ok(Some(skipped_record(
491 path,
492 root,
493 metadata.len(),
494 FileStatus::ErrorInternal,
495 vec![format!("failed to read file: {err}")],
496 )));
497 }
498 };
499
500 let vendor = is_vendor_path(path);
501 if vendor && config.analysis.vendor_directory_detection {
502 return Ok(Some(skipped_record(
503 path,
504 root,
505 metadata.len(),
506 FileStatus::SkippedByPolicy,
507 vec!["vendor file skipped by policy".into()],
508 )));
509 }
510
511 let generated = config.analysis.generated_file_detection && looks_generated(path, &bytes);
512 if generated {
513 return Ok(Some(skipped_record(
514 path,
515 root,
516 metadata.len(),
517 FileStatus::SkippedByPolicy,
518 vec!["generated file skipped by policy".into()],
519 )));
520 }
521
522 let minified = config.analysis.minified_file_detection && looks_minified(path, &bytes);
523 if minified {
524 return Ok(Some(skipped_record(
525 path,
526 root,
527 metadata.len(),
528 FileStatus::SkippedByPolicy,
529 vec!["minified file skipped by policy".into()],
530 )));
531 }
532
533 if is_binary(&bytes) {
534 return match config.analysis.binary_file_behavior {
535 BinaryFileBehavior::Skip => Ok(Some(skipped_record(
536 path,
537 root,
538 metadata.len(),
539 FileStatus::SkippedBinary,
540 vec!["binary file skipped by default".into()],
541 ))),
542 BinaryFileBehavior::Fail => {
543 anyhow::bail!("binary file encountered: {}", path.display())
544 }
545 };
546 }
547
548 let (text, encoding, decode_warnings) = match decode_bytes(&bytes) {
549 Ok(result) => result,
550 Err(err) => {
551 return match config.analysis.decode_failure_behavior {
552 FailureBehavior::WarnSkip => Ok(Some(skipped_record(
553 path,
554 root,
555 metadata.len(),
556 FileStatus::SkippedDecodeError,
557 vec![err],
558 ))),
559 FailureBehavior::Fail => {
560 anyhow::bail!("decode failure for {}: {err}", path.display())
561 }
562 };
563 }
564 };
565
566 let first_line = text.lines().next();
567 let language = detect_language(
568 path,
569 first_line,
570 &config.analysis.extension_overrides,
571 config.analysis.shebang_detection,
572 );
573
574 let Some(language) = language else {
575 return Ok(Some(skipped_record(
576 path,
577 root,
578 metadata.len(),
579 FileStatus::SkippedUnsupported,
580 vec!["unsupported or undetected language".into()],
581 )));
582 };
583
584 if let Some(enabled) = enabled_languages {
585 if !enabled.contains(&language) {
586 return Ok(Some(skipped_record(
587 path,
588 root,
589 metadata.len(),
590 FileStatus::SkippedByPolicy,
591 vec![format!(
592 "language {} disabled by configuration",
593 language.display_name()
594 )],
595 )));
596 }
597 }
598
599 let ieee_opts = AnalysisOptions {
600 blank_in_block_comment_as_comment: config.analysis.blank_in_block_comment_policy
601 == BlankInBlockCommentPolicy::CountAsComment,
602 collapse_continuation_lines: config.analysis.continuation_line_policy
603 == ContinuationLinePolicy::CollapseToLogical,
604 };
605 let analysis = analyze_text(language, &text, ieee_opts);
606 let effective_counts = compute_effective_counts(
607 &analysis.raw,
608 config.analysis.mixed_line_policy,
609 config.analysis.python_docstrings_as_comments,
610 config.analysis.count_compiler_directives,
611 );
612
613 let mut warnings = decode_warnings;
614 warnings.extend(analysis.warnings.clone());
615
616 Ok(Some(FileRecord {
617 path: path_to_string(path),
618 relative_path,
619 language: Some(language),
620 size_bytes: metadata.len(),
621 detected_encoding: Some(encoding),
622 raw_line_categories: analysis.raw,
623 effective_counts,
624 status: match analysis.parse_mode {
625 ParseMode::Lexical => FileStatus::AnalyzedExact,
626 ParseMode::LexicalBestEffort => FileStatus::AnalyzedBestEffort,
627 ParseMode::TreeSitter => FileStatus::AnalyzedExact,
628 },
629 warnings,
630 generated,
631 minified,
632 vendor,
633 parse_mode: Some(analysis.parse_mode),
634 submodule: None,
635 }))
636}
637
638fn compute_effective_counts(
639 raw: &RawLineCounts,
640 mixed_line_policy: MixedLinePolicy,
641 python_docstrings_as_comments: bool,
642 count_compiler_directives: bool,
643) -> EffectiveCounts {
644 let mut effective = EffectiveCounts {
645 code_lines: raw.code_only_lines,
646 comment_lines: raw.single_comment_only_lines + raw.multi_comment_only_lines,
647 blank_lines: raw.blank_only_lines,
648 mixed_lines_separate: 0,
649 };
650
651 if python_docstrings_as_comments {
652 effective.comment_lines += raw.docstring_comment_lines;
653 } else {
654 effective.code_lines += raw.docstring_comment_lines;
655 }
656
657 let mixed_total = raw.mixed_code_single_comment_lines + raw.mixed_code_multi_comment_lines;
658 match mixed_line_policy {
659 MixedLinePolicy::CodeOnly => effective.code_lines += mixed_total,
660 MixedLinePolicy::CodeAndComment => {
661 effective.code_lines += mixed_total;
662 effective.comment_lines += mixed_total;
663 }
664 MixedLinePolicy::CommentOnly => effective.comment_lines += mixed_total,
665 MixedLinePolicy::SeparateMixedCategory => effective.mixed_lines_separate += mixed_total,
666 }
667
668 if !count_compiler_directives {
671 effective.code_lines = effective
672 .code_lines
673 .saturating_sub(raw.compiler_directive_lines);
674 }
675
676 effective
677}
678
679fn build_summary(analyzed: &[FileRecord], skipped: &[FileRecord]) -> SummaryTotals {
680 let mut summary = SummaryTotals {
681 files_considered: (analyzed.len() + skipped.len()) as u64,
682 files_analyzed: analyzed.len() as u64,
683 files_skipped: skipped.len() as u64,
684 ..Default::default()
685 };
686
687 for record in analyzed {
688 summary.total_physical_lines += record.raw_line_categories.total_physical_lines;
689 summary.code_lines += record.effective_counts.code_lines;
690 summary.comment_lines += record.effective_counts.comment_lines;
691 summary.blank_lines += record.effective_counts.blank_lines;
692 summary.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
693 summary.functions += record.raw_line_categories.functions;
694 summary.classes += record.raw_line_categories.classes;
695 summary.variables += record.raw_line_categories.variables;
696 summary.imports += record.raw_line_categories.imports;
697 }
698
699 summary
700}
701
702fn build_language_summaries(analyzed: &[FileRecord]) -> Vec<LanguageSummary> {
703 let mut by_language: BTreeMap<Language, LanguageSummary> = BTreeMap::new();
704 for record in analyzed {
705 let Some(language) = record.language else {
706 continue;
707 };
708 let entry = by_language.entry(language).or_insert(LanguageSummary {
709 language,
710 files: 0,
711 total_physical_lines: 0,
712 code_lines: 0,
713 comment_lines: 0,
714 blank_lines: 0,
715 mixed_lines_separate: 0,
716 functions: 0,
717 classes: 0,
718 variables: 0,
719 imports: 0,
720 });
721 entry.files += 1;
722 entry.total_physical_lines += record.raw_line_categories.total_physical_lines;
723 entry.code_lines += record.effective_counts.code_lines;
724 entry.comment_lines += record.effective_counts.comment_lines;
725 entry.blank_lines += record.effective_counts.blank_lines;
726 entry.mixed_lines_separate += record.effective_counts.mixed_lines_separate;
727 entry.functions += record.raw_line_categories.functions;
728 entry.classes += record.raw_line_categories.classes;
729 entry.variables += record.raw_line_categories.variables;
730 entry.imports += record.raw_line_categories.imports;
731 }
732
733 by_language.into_values().collect()
734}
735
736fn skipped_record(
737 path: &Path,
738 root: &Path,
739 size_bytes: u64,
740 status: FileStatus,
741 warnings: Vec<String>,
742) -> FileRecord {
743 FileRecord {
744 path: path_to_string(path),
745 relative_path: relative_path_string(path, root),
746 language: None,
747 size_bytes,
748 detected_encoding: None,
749 raw_line_categories: RawLineCounts::default(),
750 effective_counts: EffectiveCounts::default(),
751 status,
752 warnings,
753 generated: false,
754 minified: false,
755 vendor: false,
756 parse_mode: None,
757 submodule: None,
758 }
759}
760
/// Renders `path` relative to `root` using `/` as the separator.
///
/// When `path` does not start with `root`, the whole path is rendered
/// instead. Non-UTF-8 sequences are replaced lossily.
fn relative_path_string(path: &Path, root: &Path) -> String {
    let shown = match path.strip_prefix(root) {
        Ok(inside_root) => inside_root,
        Err(_) => path,
    };
    let text = shown.to_string_lossy();
    text.replace('\\', "/")
}
767
/// Renders `path` as a string with every backslash turned into `/`.
/// Non-UTF-8 sequences are replaced lossily.
fn path_to_string(path: &Path) -> String {
    let text = path.to_string_lossy();
    text.chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
771
/// Reads `<root>/.gitmodules` and returns `(name, path)` for every submodule
/// section that declares a `path`.
///
/// The parser is deliberately small: it recognizes `[submodule "NAME"]`
/// headers and `path = VALUE` entries and ignores every other line. A
/// section without a `path` entry is dropped. A missing or unreadable
/// `.gitmodules` yields an empty list.
///
/// Malformed headers (for example `[submodule "]`) are ignored rather than
/// causing a panic, and only the key `path` itself is accepted — other keys
/// that merely begin with those letters are not mistaken for it.
pub fn detect_submodules(root: &Path) -> Vec<(String, PathBuf)> {
    let Ok(content) = fs::read_to_string(root.join(".gitmodules")) else {
        return Vec::new();
    };

    let mut result = Vec::new();
    let mut current_name: Option<String> = None;
    let mut current_path: Option<PathBuf> = None;

    for line in content.lines() {
        let trimmed = line.trim();

        // Strip prefix and suffix independently so that a line too short to
        // hold both (they would overlap on the quote) is simply not a header.
        let header = trimmed
            .strip_prefix("[submodule \"")
            .and_then(|rest| rest.strip_suffix("\"]"));

        if let Some(name) = header {
            // A new section begins: flush the previous one if it is complete.
            if let (Some(name), Some(path)) = (current_name.take(), current_path.take()) {
                result.push((name, path));
            }
            current_name = Some(name.to_string());
        } else if let Some(value) = trimmed
            .strip_prefix("path")
            .and_then(|rest| rest.trim_start().strip_prefix('='))
        {
            current_path = Some(PathBuf::from(value.trim()));
        }
    }

    // Flush the final section.
    if let (Some(name), Some(path)) = (current_name, current_path) {
        result.push((name, path));
    }

    result
}
808
809fn build_submodule_summaries(
810 analyzed: &[FileRecord],
811 submodules: &[(String, PathBuf)],
812) -> Vec<SubmoduleSummary> {
813 submodules
814 .iter()
815 .map(|(name, path)| {
816 let files: Vec<&FileRecord> = analyzed
817 .iter()
818 .filter(|f| f.submodule.as_deref() == Some(name.as_str()))
819 .collect();
820
821 let files_analyzed = files.len() as u64;
822 let total_physical_lines = files
823 .iter()
824 .map(|f| f.raw_line_categories.total_physical_lines)
825 .sum();
826 let code_lines = files.iter().map(|f| f.effective_counts.code_lines).sum();
827 let comment_lines = files.iter().map(|f| f.effective_counts.comment_lines).sum();
828 let blank_lines = files.iter().map(|f| f.effective_counts.blank_lines).sum();
829 let language_summaries = build_language_summaries_from_slice(&files);
830
831 SubmoduleSummary {
832 name: name.clone(),
833 relative_path: path.to_string_lossy().replace('\\', "/"),
834 files_analyzed,
835 total_physical_lines,
836 code_lines,
837 comment_lines,
838 blank_lines,
839 language_summaries,
840 }
841 })
842 .filter(|s| s.files_analyzed > 0)
843 .collect()
844}
845
846fn build_language_summaries_from_slice(files: &[&FileRecord]) -> Vec<LanguageSummary> {
847 let mut map: BTreeMap<String, LanguageSummary> = BTreeMap::new();
848 for file in files {
849 if let Some(lang) = file.language {
850 let entry = map
851 .entry(lang.display_name().to_string())
852 .or_insert_with(|| LanguageSummary {
853 language: lang,
854 files: 0,
855 total_physical_lines: 0,
856 code_lines: 0,
857 comment_lines: 0,
858 blank_lines: 0,
859 mixed_lines_separate: 0,
860 functions: 0,
861 classes: 0,
862 variables: 0,
863 imports: 0,
864 });
865 entry.files += 1;
866 let r = &file.raw_line_categories;
867 entry.total_physical_lines += r.total_physical_lines;
868 entry.code_lines += file.effective_counts.code_lines;
869 entry.comment_lines += file.effective_counts.comment_lines;
870 entry.blank_lines += file.effective_counts.blank_lines;
871 entry.mixed_lines_separate += file.effective_counts.mixed_lines_separate;
872 }
873 }
874 map.into_values().collect()
875}
876
/// Returns `true` when the final component of `path` is exactly `expected`.
///
/// Paths without a final component, or whose name is not valid UTF-8,
/// never match.
fn file_name_eq(path: &Path, expected: &str) -> bool {
    let actual = path.file_name().and_then(|name| name.to_str());
    actual == Some(expected)
}
883
/// Returns `true` when any component of `path` equals one of the names in
/// `excluded_dirs`.
///
/// Comparison is exact and per component; components that are not valid
/// UTF-8 are skipped.
fn is_excluded_dir_path(path: &Path, excluded_dirs: &[String]) -> bool {
    for component in path.components() {
        let Some(part) = component.as_os_str().to_str() else {
            continue;
        };
        if excluded_dirs.iter().any(|excluded| excluded == part) {
            return true;
        }
    }
    false
}
893
/// Returns `true` when any component of `path` is named `vendor`,
/// `node_modules`, or `packages`.
///
/// Components that are not valid UTF-8 are skipped.
fn is_vendor_path(path: &Path) -> bool {
    path.components()
        .filter_map(|component| component.as_os_str().to_str())
        .any(|part| part == "vendor" || part == "node_modules" || part == "packages")
}
903
/// Returns `true` when the file name of `path` is one of the recognized
/// dependency lockfiles. Matching is exact and case-sensitive.
fn is_known_lockfile(path: &Path) -> bool {
    const LOCKFILE_NAMES: [&str; 7] = [
        "Cargo.lock",
        "package-lock.json",
        "yarn.lock",
        "pnpm-lock.yaml",
        "Pipfile.lock",
        "poetry.lock",
        "composer.lock",
    ];

    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => LOCKFILE_NAMES.contains(&name),
        None => false,
    }
}
921
/// Heuristic: does this file look machine-generated?
///
/// A file matches when its name contains `.generated.` or `.g.`, or when
/// the first 1024 bytes (decoded lossily, compared case-insensitively for
/// ASCII) contain `@generated` or `generated by`.
fn looks_generated(path: &Path, bytes: &[u8]) -> bool {
    let name_matches = match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.contains(".generated.") || name.contains(".g."),
        None => false,
    };
    if name_matches {
        return true;
    }

    let head_len = bytes.len().min(1024);
    let head = String::from_utf8_lossy(&bytes[..head_len]).to_ascii_lowercase();
    ["@generated", "generated by"]
        .iter()
        .any(|marker| head.contains(*marker))
}
934
/// Heuristic: does this file look minified?
///
/// A file matches when its name contains `.min.`, or when — within the
/// first 4096 bytes, decoded lossily — the longest line exceeds 2000 bytes
/// and whitespace characters make up less than 1% of the sample length.
fn looks_minified(path: &Path, bytes: &[u8]) -> bool {
    if let Some(name) = path.file_name().and_then(|name| name.to_str()) {
        if name.contains(".min.") {
            return true;
        }
    }

    let window = &bytes[..bytes.len().min(4096)];
    let sample = String::from_utf8_lossy(window);

    let mut longest_line = 0usize;
    for line in sample.lines() {
        longest_line = longest_line.max(line.len());
    }
    if longest_line <= 2000 {
        return false;
    }

    let whitespace = sample.chars().filter(|c| c.is_whitespace()).count();
    whitespace * 100 < sample.len().max(1)
}
949
/// Heuristic: does this content look binary?
///
/// Content that starts with a UTF-8 or UTF-16 byte-order mark is always
/// treated as text. Otherwise the content is binary when a NUL byte occurs
/// within the first 8192 bytes.
fn is_binary(bytes: &[u8]) -> bool {
    const TEXT_BOMS: [&[u8]; 3] = [&[0xEF, 0xBB, 0xBF], &[0xFF, 0xFE], &[0xFE, 0xFF]];

    if TEXT_BOMS.iter().any(|bom| bytes.starts_with(bom)) {
        return false;
    }

    bytes.iter().take(8192).any(|&byte| byte == 0)
}
961
962fn decode_bytes(bytes: &[u8]) -> std::result::Result<(String, String, Vec<String>), String> {
963 if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
964 let text = String::from_utf8(bytes[3..].to_vec()).map_err(|err| err.to_string())?;
965 return Ok((text, "utf-8-bom".into(), vec![]));
966 }
967
968 if bytes.starts_with(&[0xFF, 0xFE]) {
969 let (cow, _, had_errors) = UTF_16LE.decode(&bytes[2..]);
970 let mut warnings = Vec::new();
971 if had_errors {
972 warnings.push("utf-16le decode contained replacement characters".into());
973 }
974 return Ok((cow.into_owned(), "utf-16le".into(), warnings));
975 }
976
977 if bytes.starts_with(&[0xFE, 0xFF]) {
978 let (cow, _, had_errors) = UTF_16BE.decode(&bytes[2..]);
979 let mut warnings = Vec::new();
980 if had_errors {
981 warnings.push("utf-16be decode contained replacement characters".into());
982 }
983 return Ok((cow.into_owned(), "utf-16be".into(), warnings));
984 }
985
986 match String::from_utf8(bytes.to_vec()) {
987 Ok(text) => Ok((text, "utf-8".into(), vec![])),
988 Err(_) => {
989 let (cow, _, had_errors) = WINDOWS_1252.decode(bytes);
990 let mut warnings = vec!["decoded using windows-1252 fallback".into()];
991 if had_errors {
992 warnings.push("fallback decode contained replacement characters".into());
993 }
994 Ok((cow.into_owned(), "windows-1252".into(), warnings))
995 }
996 }
997}
998
999fn compile_globset(patterns: &[String]) -> Result<Option<GlobSet>> {
1000 if patterns.is_empty() {
1001 return Ok(None);
1002 }
1003
1004 let mut builder = GlobSetBuilder::new();
1005 for pattern in patterns {
1006 builder
1007 .add(Glob::new(pattern).with_context(|| format!("invalid glob pattern: {pattern}"))?);
1008 }
1009 Ok(Some(
1010 builder.build().context("failed to compile glob filters")?,
1011 ))
1012}
1013
1014fn parse_enabled_languages(enabled: &[String]) -> Result<Option<BTreeSet<Language>>> {
1015 if enabled.is_empty() {
1016 return Ok(None);
1017 }
1018
1019 let supported = supported_languages();
1020 let mut set = BTreeSet::new();
1021 for name in enabled {
1022 let language = Language::from_name(name)
1023 .with_context(|| format!("unsupported language in config: {name}"))?;
1024 if !supported.contains(&language) {
1025 anyhow::bail!("language {name} is not supported in this build");
1026 }
1027 set.insert(language);
1028 }
1029 Ok(Some(set))
1030}
1031
1032pub fn write_json(run: &AnalysisRun, output_path: &Path) -> Result<()> {
1033 let json = serde_json::to_string_pretty(run).context("failed to serialize analysis run")?;
1034 fs::write(output_path, json)
1035 .with_context(|| format!("failed to write JSON output to {}", output_path.display()))
1036}
1037
1038pub fn read_json(path: &Path) -> Result<AnalysisRun> {
1039 let contents = fs::read_to_string(path)
1040 .with_context(|| format!("failed to read result file {}", path.display()))?;
1041 serde_json::from_str(&contents)
1042 .with_context(|| format!("failed to parse JSON result {}", path.display()))
1043}
1044
#[cfg(test)]
mod tests {
    use super::*;

    /// Under `CodeOnly`, mixed lines are added to code; docstrings are
    /// counted as comments because the docstring flag is set.
    #[test]
    fn effective_counts_respect_code_only_policy() {
        let raw_counts = RawLineCounts {
            code_only_lines: 2,
            single_comment_only_lines: 1,
            mixed_code_single_comment_lines: 3,
            docstring_comment_lines: 2,
            ..RawLineCounts::default()
        };

        let effective =
            compute_effective_counts(&raw_counts, MixedLinePolicy::CodeOnly, true, true);

        // 2 code-only + 3 mixed.
        assert_eq!(effective.code_lines, 5);
        // 1 comment-only + 2 docstring.
        assert_eq!(effective.comment_lines, 3);
    }

    /// Under `SeparateMixedCategory`, mixed lines land only in their own
    /// bucket.
    #[test]
    fn effective_counts_can_separate_mixed() {
        let raw_counts = RawLineCounts {
            mixed_code_single_comment_lines: 2,
            mixed_code_multi_comment_lines: 1,
            ..RawLineCounts::default()
        };

        let effective = compute_effective_counts(
            &raw_counts,
            MixedLinePolicy::SeparateMixedCategory,
            true,
            true,
        );

        assert_eq!(effective.mixed_lines_separate, 3);
        assert_eq!(effective.code_lines, 0);
        assert_eq!(effective.comment_lines, 0);
    }

    /// Bytes that are not valid UTF-8 fall back to windows-1252, where 0x96
    /// is an en dash, and the fallback is reported as a warning.
    #[test]
    fn windows_1252_fallback_decodes() {
        let input = vec![0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x96, 0x57];

        let (decoded, label, notes) = decode_bytes(&input).unwrap();

        assert_eq!(label, "windows-1252");
        assert!(decoded.contains('–'));
        assert!(!notes.is_empty());
    }
}