1use crate::assembly;
5use crate::cache::{
6 CACHE_DIR_ENV_VAR, CacheConfig, IncrementalManifest, IncrementalManifestEntry,
7 build_collection_exclude_patterns, incremental_manifest_path, load_incremental_manifest,
8 manifest_entry_matches_path, metadata_fingerprint, write_incremental_manifest,
9};
10use crate::cli::{Cli, ProcessMode};
11use crate::license_detection::LicenseDetectionEngine;
12use crate::license_detection::dataset::export_embedded_license_dataset;
13use crate::license_detection::license_cache::LicenseCacheConfig;
14use crate::models::{FileInfo, FileType, Sha256Digest};
15use crate::output::{OutputWriteConfig, write_output_file};
16use crate::post_processing::{
17 CreateOutputContext, CreateOutputOptions, DEFAULT_LICENSEDB_URL_TEMPLATE,
18 apply_license_policy_from_file, apply_package_reference_following, build_facet_rules,
19 collect_top_level_license_detections, collect_top_level_license_references, create_output,
20};
21use crate::progress::{ProgressMode, ScanProgress, format_default_scan_error};
22use crate::scan_result_shaping::{
23 SelectedPath, apply_cli_path_selection_filter, apply_ignore_resource_filter, apply_mark_source,
24 apply_only_findings_filter, apply_user_path_filters_to_collected, filter_redundant_clues,
25 filter_redundant_clues_with_rules, load_and_merge_json_inputs, normalize_paths,
26 normalize_top_level_output_paths, populate_info_resource_counts,
27 prepare_filter_clue_rule_lookup, resolve_native_scan_inputs, resolve_paths_file_entries,
28 trim_preloaded_assembly_to_files,
29};
30use crate::scanner::{
31 CollectionFrontier, LicenseScanOptions, TextDetectionOptions, collect_paths,
32 collect_selected_paths, process_collected_with_memory_limit,
33 process_collected_with_memory_limit_sequential, scan_options_fingerprint,
34};
35use crate::time::format_scancode_timestamp;
36use crate::utils::hash::calculate_sha256;
37use anyhow::{Result, anyhow};
38use chrono::Utc;
39use clap::Parser;
40use regex::Regex;
41use std::collections::{BTreeMap, HashMap};
42use std::env;
43use std::fs;
44use std::io::Read;
45use std::path::{Path, PathBuf};
46use std::sync::Arc;
47use std::time::Instant;
48
49pub fn run() -> Result<()> {
50 #[cfg(feature = "golden-tests")]
51 touch_license_golden_symbols();
52
53 let cli = Cli::parse();
54
55 validate_scan_option_compatibility(&cli)?;
56
57 if cli.show_attribution {
58 print!("{}", include_str!("../../../NOTICE"));
59 return Ok(());
60 }
61
62 if let Some(export_dir) = cli.export_license_dataset.as_deref() {
63 export_embedded_license_dataset(Path::new(export_dir))?;
64 return Ok(());
65 }
66
67 let start_time = Utc::now();
68 let progress = Arc::new(ScanProgress::new(progress_mode_from_cli(&cli)));
69 progress.set_processes(cli.processes);
70 progress.set_scan_names(configured_scan_names(&cli));
71 progress.init_logging_bridge();
72 let mut shared_license_cache_config: Option<LicenseCacheConfig> = None;
73
74 progress.start_setup();
75 let facet_rules = build_facet_rules(&cli.facet)?;
76
77 let ignore_author_patterns = compile_regex_patterns("--ignore-author", &cli.ignore_author)?;
78 let ignore_copyright_holder_patterns =
79 compile_regex_patterns("--ignore-copyright-holder", &cli.ignore_copyright_holder)?;
80 progress.finish_setup();
81
82 progress.start_discovery();
83
84 let mut shared_cache_config = if cli.from_json {
85 let cache_config = prepare_cache_config(None, &cli)?;
86 shared_license_cache_config = Some(build_license_cache_config(&cache_config, &cli));
87 Some(cache_config)
88 } else {
89 None
90 };
91
92 let (
93 mut scan_result,
94 total_dirs,
95 mut preloaded_assembly,
96 preloaded_license_detections,
97 preloaded_license_references,
98 preloaded_license_rule_references,
99 preloaded_extra_errors,
100 extra_warnings,
101 imported_spdx_license_list_version,
102 imported_license_index_provenance,
103 mut active_license_engine,
104 ) = if cli.from_json {
105 let loaded = load_and_merge_json_inputs(&cli.dir_path, cli.strip_root, cli.full_root)?;
106 let directories_count = loaded.directory_count();
107 let files_count = loaded.file_count();
108 let size_count = loaded.file_size_count();
109 progress.finish_discovery(
110 files_count,
111 directories_count,
112 size_count,
113 loaded.excluded_count,
114 );
115 let (
116 process_result,
117 assembly_result,
118 license_detections,
119 license_references,
120 license_rule_references,
121 extra_errors,
122 imported_spdx_license_list_version,
123 imported_license_index_provenance,
124 ) = loaded.into_parts()?;
125 (
126 process_result,
127 directories_count,
128 assembly_result,
129 license_detections,
130 license_references,
131 license_rule_references,
132 extra_errors,
133 Vec::new(),
134 imported_spdx_license_list_version,
135 imported_license_index_provenance,
136 None,
137 )
138 } else {
139 let NativeScanSelection {
140 scan_path,
141 selected_paths,
142 collection_frontier,
143 missing_entries: missing_paths_file_entries,
144 } = resolve_native_scan_selection(&cli)?;
145 let paths_file_warnings = build_paths_file_warning_messages(&missing_paths_file_entries);
146 for warning in &paths_file_warnings {
147 progress.output_written(warning);
148 }
149
150 let cache_config = prepare_cache_config(Some(Path::new(&scan_path)), &cli)?;
151 shared_license_cache_config = Some(build_license_cache_config(&cache_config, &cli));
152 shared_cache_config = Some(cache_config.clone());
153 let collection_exclude_patterns =
154 build_collection_exclude_patterns(Path::new(&scan_path), cache_config.root_dir());
155
156 let mut collected = if cli.paths_file.is_empty() {
157 collect_paths(&scan_path, cli.max_depth, &collection_exclude_patterns)
158 } else {
159 collect_selected_paths(
160 Path::new(&scan_path),
161 &collection_frontier,
162 cli.max_depth,
163 &collection_exclude_patterns,
164 )
165 };
166 let user_excluded_count = apply_user_path_filters_to_collected(
167 &mut collected,
168 Path::new(&scan_path),
169 &selected_paths,
170 &cli.include,
171 &cli.exclude,
172 );
173 let total_files = collected.file_count();
174 let total_dirs = collected.directory_count();
175 let total_size = collected.total_file_bytes;
176 let excluded_count = collected.excluded_count + user_excluded_count;
177 let all_collected_files = collected.files.clone();
178 let ordered_file_paths: Vec<PathBuf> = collected
179 .files
180 .iter()
181 .map(|(path, _)| path.clone())
182 .collect();
183 let runtime_errors = collected
184 .collection_errors
185 .iter()
186 .map(|(path, err)| format_default_scan_error(path, err))
187 .collect();
188 for (path, err) in &collected.collection_errors {
189 progress.record_runtime_error(path, err);
190 }
191 progress.finish_discovery(total_files, total_dirs, total_size, excluded_count);
192 if !cli.quiet {
193 progress.output_written(&format!(
194 "Found {} files in {} directories ({} items excluded)",
195 total_files, total_dirs, excluded_count
196 ));
197 }
198
199 let license_engine = if cli.license {
200 progress.start_setup();
201 progress.start_license_detection_engine_creation();
202 let engine = init_license_engine(
203 shared_cache_config
204 .as_ref()
205 .expect("cache config should be prepared before license engine init"),
206 &cli,
207 )?;
208 progress.finish_license_detection_engine_creation("setup_scan:licenses");
209 progress.finish_setup();
210 progress.output_written(&describe_license_engine_source(
211 &engine,
212 cli.license_dataset_path.as_deref(),
213 ));
214 Some(engine)
215 } else {
216 None
217 };
218
219 let enable_application_packages = cli.package || cli.package_only;
220 let enable_system_packages = cli.system_package || cli.package_only;
221 let enable_packages =
222 enable_application_packages || enable_system_packages || cli.package_in_compiled;
223 let (detect_copyrights, detect_emails, detect_urls, detect_generated) = if cli.package_only
224 {
225 (false, cli.email, cli.url, cli.generated)
226 } else {
227 (cli.copyright, cli.email, cli.url, cli.generated)
228 };
229 let process_mode = cli.processes;
230
231 let text_options = TextDetectionOptions {
232 collect_info: cli.info,
233 detect_packages: enable_packages,
234 detect_application_packages: enable_application_packages,
235 detect_system_packages: enable_system_packages,
236 detect_packages_in_compiled: cli.package_in_compiled,
237 detect_copyrights,
238 detect_generated,
239 detect_emails,
240 detect_urls,
241 max_emails: cli.max_email,
242 max_urls: cli.max_url,
243 timeout_seconds: effective_timeout_seconds(process_mode, cli.timeout),
244 };
245
246 let license_options = LicenseScanOptions {
247 include_text: cli.license_text,
248 include_text_diagnostics: cli.license_text_diagnostics,
249 include_diagnostics: cli.license_diagnostics,
250 unknown_licenses: cli.unknown_licenses,
251 min_score: cli.license_score,
252 };
253 let options_fingerprint =
254 scan_options_fingerprint(&text_options, license_options, license_engine.as_deref());
255
256 if cli.incremental {
257 let manifest_path = incremental_manifest_path(
258 cache_config.root_dir(),
259 &incremental_manifest_key(Path::new(&scan_path), &options_fingerprint),
260 );
261 let previous_manifest =
262 load_incremental_manifest(&manifest_path, &options_fingerprint)?;
263 let reused_files = partition_incremental_files(
264 &mut collected.files,
265 Path::new(&scan_path),
266 previous_manifest.as_ref(),
267 );
268 progress.record_incremental_reused(reused_files.len());
269 }
270
271 if let Some(message) = process_mode_message(process_mode) {
272 progress.output_written(message);
273 }
274 progress.start_scan(collected.file_count());
275 let mut result = match process_mode {
276 ProcessMode::Parallel(thread_count) => run_with_thread_pool(thread_count, || {
277 Ok(process_collected_with_memory_limit(
278 &collected,
279 Arc::clone(&progress),
280 license_engine.clone(),
281 license_options,
282 &text_options,
283 cli.max_in_memory,
284 ))
285 })?,
286 ProcessMode::SequentialWithTimeouts | ProcessMode::SequentialWithoutTimeouts => {
287 process_collected_with_memory_limit_sequential(
288 &collected,
289 Arc::clone(&progress),
290 license_engine.clone(),
291 license_options,
292 &text_options,
293 cli.max_in_memory,
294 )
295 }
296 };
297
298 if cli.incremental {
299 let manifest_path = incremental_manifest_path(
300 cache_config.root_dir(),
301 &incremental_manifest_key(Path::new(&scan_path), &options_fingerprint),
302 );
303 let reused_files = partition_incremental_files(
304 &mut all_collected_files.clone(),
305 Path::new(&scan_path),
306 load_incremental_manifest(&manifest_path, &options_fingerprint)?.as_ref(),
307 );
308 result.files =
309 merge_incremental_file_results(result.files, reused_files, &ordered_file_paths);
310
311 let manifest = build_incremental_manifest(
312 Path::new(&scan_path),
313 &all_collected_files,
314 &result.files,
315 &options_fingerprint,
316 );
317 write_incremental_manifest(cache_config.root_dir(), &manifest_path, &manifest)?;
318 }
319
320 result.excluded_count = excluded_count;
321 progress.finish_scan();
322
323 (
324 result,
325 total_dirs,
326 assembly::AssemblyResult {
327 packages: Vec::new(),
328 dependencies: Vec::new(),
329 },
330 Vec::new(),
331 Vec::new(),
332 Vec::new(),
333 runtime_errors,
334 paths_file_warnings,
335 None,
336 None,
337 license_engine,
338 )
339 };
340
341 progress.start_post_scan();
342
343 if cli.filter_clues {
344 progress.post_scan_step("Filtering redundant clues...");
345 let clue_rule_lookup = record_detail_timing(&progress, "post-scan:filter-clues", || {
346 prepare_filter_clue_rule_lookup(
347 &scan_result.files,
348 active_license_engine.as_deref(),
349 cli.license_dataset_path.as_deref(),
350 shared_license_cache_config.as_ref(),
351 )
352 })?;
353 if let Some(clue_rule_lookup) = clue_rule_lookup.as_ref() {
354 filter_redundant_clues_with_rules(&mut scan_result.files, Some(clue_rule_lookup));
355 } else {
356 filter_redundant_clues(&mut scan_result.files);
357 }
358 }
359
360 if !ignore_author_patterns.is_empty() || !ignore_copyright_holder_patterns.is_empty() {
361 progress.post_scan_step("Applying ignore-resource filters...");
362 record_detail_timing(&progress, "post-scan:ignore-resource", || {
363 apply_ignore_resource_filter(
364 &mut scan_result.files,
365 &ignore_copyright_holder_patterns,
366 &ignore_author_patterns,
367 );
368 });
369 }
370
371 if cli.from_json && (!cli.include.is_empty() || !cli.exclude.is_empty()) {
372 progress.post_scan_step("Applying path selection filters...");
373 record_detail_timing(&progress, "output-filter:path-selection", || {
374 apply_cli_path_selection_filter(&mut scan_result.files, &cli.include, &cli.exclude);
375 });
376 }
377
378 if cli.only_findings {
379 progress.post_scan_step("Filtering to files with findings...");
380 record_detail_timing(&progress, "output-filter:only-findings", || {
381 apply_only_findings_for_mode(&mut scan_result.files, cli.from_json);
382 });
383 }
384
385 if cli.info && cli.mark_source {
386 progress.post_scan_step("Marking source files...");
387 record_detail_timing(&progress, "post-scan:mark-source", || {
388 apply_mark_source(&mut scan_result.files);
389 });
390 }
391
392 if should_include_info_surface(&scan_result.files, &cli) {
393 progress.post_scan_step("Populating info resource counts...");
394 record_detail_timing(&progress, "post-scan:info-resource-counts", || {
395 populate_info_resource_counts(&mut scan_result.files);
396 });
397 }
398
399 progress.post_scan_step("Backfilling license provenance...");
400 record_detail_timing(&progress, "post-scan:license-provenance", || {
401 for file in &mut scan_result.files {
402 file.backfill_license_provenance();
403 }
404 });
405
406 if cli.from_json {
407 for err in &preloaded_extra_errors {
408 progress.record_additional_error(err);
409 }
410 }
411
412 let mut extra_errors = preloaded_extra_errors;
413 if let Some(policy_path) = cli.license_policy.as_deref() {
414 progress.post_scan_step("Applying license policy...");
415 let license_policy_errors =
416 record_detail_timing(&progress, "post-scan:license-policy", || {
417 apply_license_policy_from_file(&mut scan_result.files, Path::new(policy_path))
418 })?;
419 for err in &license_policy_errors {
420 progress.record_additional_error(err);
421 }
422 extra_errors.extend(license_policy_errors);
423 }
424
425 if cli.from_json {
426 progress.post_scan_step("Trimming preloaded assembly to filtered files...");
427 record_detail_timing(&progress, "post-scan:trim-preloaded-assembly", || {
428 trim_preloaded_assembly_to_files(
429 &scan_result.files,
430 &mut preloaded_assembly.packages,
431 &mut preloaded_assembly.dependencies,
432 );
433 });
434 }
435
436 progress.finish_post_scan();
437
438 let manifests_seen = scan_result
439 .files
440 .iter()
441 .map(|file| file.package_data.len())
442 .sum();
443 let skip_assembly = cli.no_assemble || cli.package_only;
444
445 let mut assembly_result = if skip_assembly {
446 assembly::AssemblyResult {
447 packages: Vec::new(),
448 dependencies: Vec::new(),
449 }
450 } else {
451 progress.start_assembly();
452
453 let mut result = if cli.from_json
454 && (!preloaded_assembly.packages.is_empty()
455 || !preloaded_assembly.dependencies.is_empty())
456 {
457 progress.assembly_step("Using preloaded assembly...");
458 preloaded_assembly
459 } else {
460 assembly::assemble(&mut scan_result.files)
461 };
462
463 progress.assembly_step("Backfilling package license provenance...");
464 record_detail_timing(&progress, "assembly:package-license-provenance", || {
465 for package in &mut result.packages {
466 package.backfill_license_provenance();
467 }
468 });
469
470 progress.assembly_step("Applying package reference following...");
471 record_detail_timing(&progress, "assembly:package-reference-following", || {
472 apply_package_reference_following(&mut scan_result.files, &mut result.packages);
473 });
474
475 progress.finish_assembly(result.packages.len(), manifests_seen);
476 result
477 };
478
479 progress.start_finalize();
480
481 if !cli.from_json && (cli.strip_root || cli.full_root) {
482 let root_path = cli
483 .dir_path
484 .first()
485 .ok_or_else(|| anyhow!("No input path available for path normalization"))?;
486 progress.finalize_step("Normalizing paths...");
487 record_detail_timing(&progress, "finalize:path-normalization", || {
488 normalize_paths(
489 &mut scan_result.files,
490 root_path,
491 cli.strip_root,
492 cli.full_root,
493 );
494 normalize_top_level_output_paths(
495 &mut assembly_result.packages,
496 &mut assembly_result.dependencies,
497 root_path,
498 cli.strip_root,
499 );
500 });
501 }
502
503 progress.finalize_step("Collecting license detections...");
504 let license_detections = record_detail_timing(&progress, "finalize:license-detections", || {
505 let preserve_preloaded_top_level_detections = cli.from_json
506 && (cli.only_findings || !cli.include.is_empty() || !cli.exclude.is_empty());
507 collect_top_level_license_detections_for_mode(
508 &scan_result.files,
509 preloaded_license_detections,
510 preserve_preloaded_top_level_detections,
511 cli.from_json && cli.dir_path.len() > 1,
512 )
513 });
514
515 let should_recompute_license_references = cli.from_json
516 && (!preloaded_license_references.is_empty()
517 || !preloaded_license_rule_references.is_empty()
518 || cli.license_references
519 || (cli.license_url_template != DEFAULT_LICENSEDB_URL_TEMPLATE
520 && !preloaded_license_references.is_empty()));
521
522 if should_recompute_license_references && active_license_engine.is_none() {
523 progress.start_license_detection_engine_creation();
524 active_license_engine = Some(init_license_engine(
525 shared_cache_config
526 .as_ref()
527 .expect("cache config should be prepared before license engine init"),
528 &cli,
529 )?);
530 progress.finish_license_detection_engine_creation("finalize:license-engine-creation");
531 }
532
533 progress.finalize_step("Collecting license references...");
534 let (license_references, license_rule_references) =
535 record_detail_timing(&progress, "finalize:license-references", || {
536 if cli.from_json && !should_recompute_license_references {
537 (
538 preloaded_license_references,
539 preloaded_license_rule_references,
540 )
541 } else if cli.license_references || should_recompute_license_references {
542 if let Some(engine) = active_license_engine.as_deref() {
543 collect_top_level_license_references(
544 &scan_result.files,
545 &assembly_result.packages,
546 engine.index(),
547 &cli.license_url_template,
548 )
549 } else {
550 (Vec::new(), Vec::new())
551 }
552 } else {
553 (Vec::new(), Vec::new())
554 }
555 });
556
557 let end_time = Utc::now();
558 let spdx_license_list_version = active_license_engine
559 .as_ref()
560 .and_then(|engine| engine.spdx_license_list_version().map(ToOwned::to_owned))
561 .or(imported_spdx_license_list_version)
562 .unwrap_or(LicenseDetectionEngine::embedded_spdx_license_list_version()?);
563 let license_index_provenance = active_license_engine
564 .as_ref()
565 .and_then(|engine| engine.license_index_provenance().cloned())
566 .or(imported_license_index_provenance);
567
568 progress.finalize_step("Preparing output...");
569 let output = record_detail_timing(&progress, "finalize:output-prepare", || {
570 create_output(
571 start_time,
572 end_time,
573 scan_result,
574 CreateOutputContext {
575 total_dirs,
576 assembly_result,
577 license_detections,
578 license_references,
579 license_rule_references,
580 spdx_license_list_version,
581 license_index_provenance,
582 extra_errors,
583 extra_warnings,
584 header_options: cli.output_header_options(),
585 options: CreateOutputOptions {
586 facet_rules: &facet_rules,
587 include_classify: cli.classify,
588 include_summary: cli.summary,
589 include_license_clarity_score: cli.license_clarity_score,
590 include_tallies: cli.tallies,
591 include_tallies_of_key_files: cli.tallies_key_files,
592 include_tallies_with_details: cli.tallies_with_details,
593 include_tallies_by_facet: cli.tallies_by_facet,
594 include_generated: cli.generated,
595 verbose: cli.verbose,
596 },
597 },
598 )
599 });
600 progress.finish_finalize();
601
602 let output_schema_output = crate::output_schema::Output::from(&output);
603 progress.start_output();
604 for target in cli.output_targets() {
605 let output_config = OutputWriteConfig {
606 format: target.format,
607 custom_template: target.custom_template.clone(),
608 scanned_path: if cli.dir_path.len() == 1 {
609 cli.dir_path.first().cloned()
610 } else {
611 None
612 },
613 };
614
615 let timing_name = format!("output:{:?}", target.format).to_lowercase();
616 record_detail_timing(&progress, timing_name, || {
617 write_output_file(&target.file, &output_schema_output, &output_config)
618 })?;
619 progress.output_written(&format!(
620 "{:?} output written to {}",
621 target.format, target.file
622 ));
623 }
624 progress.record_final_counts(&output.files);
625 progress.record_final_header_counts(&output.headers);
626 progress.finish_output();
627
628 let summary_end = Utc::now();
629 progress.display_summary(
630 &format_scancode_timestamp(&start_time),
631 &format_scancode_timestamp(&summary_end),
632 );
633
634 Ok(())
635}
636
637fn apply_only_findings_for_mode(files: &mut Vec<FileInfo>, from_json: bool) {
638 if from_json {
639 files.clear();
640 } else {
641 apply_only_findings_filter(files);
642 }
643}
644
645fn collect_top_level_license_detections_for_mode(
646 files: &[FileInfo],
647 preloaded: Vec<crate::models::TopLevelLicenseDetection>,
648 preserve_preloaded: bool,
649 clear_for_multi_input_replay: bool,
650) -> Vec<crate::models::TopLevelLicenseDetection> {
651 if clear_for_multi_input_replay {
652 Vec::new()
653 } else if preserve_preloaded {
654 preloaded
655 } else {
656 collect_top_level_license_detections(files)
657 }
658}
659
/// Takes a reference to each golden-test helper without calling any of them.
///
/// Only compiled when the `golden-tests` feature is enabled; `run` invokes it
/// first thing under the same feature gate.
// NOTE(review): presumably this exists so the helpers count as "used" in
// non-test builds of the feature — confirm against the lint configuration.
#[cfg(feature = "golden-tests")]
fn touch_license_golden_symbols() {
    let _ = crate::license_detection::golden_utils::read_golden_input_content;
    let _ = crate::license_detection::golden_utils::detect_matches_for_golden;
    let _ = crate::license_detection::golden_utils::detect_license_expressions_for_golden;
    let _ = crate::license_detection::LicenseDetectionEngine::detect_matches_with_kind;
}
667
/// Resolved inputs for a native (non `--from-json`) scan, as produced by
/// `resolve_native_scan_selection`.
#[derive(Debug)]
struct NativeScanSelection {
    /// Root path the scan is collected from.
    scan_path: String,
    /// Explicit selections, later handed to the user path filter step.
    selected_paths: Vec<SelectedPath>,
    /// Starting points for selective collection; empty when no `--paths-file`
    /// was given.
    collection_frontier: Vec<CollectionFrontier>,
    /// `--paths-file` entries that could not be resolved; reported to the user
    /// as warnings.
    missing_entries: Vec<String>,
}
675
676fn resolve_native_scan_selection(cli: &Cli) -> Result<NativeScanSelection> {
677 if cli.paths_file.is_empty() {
678 let (scan_path, selected_paths) = resolve_native_scan_inputs(&cli.dir_path)?;
679 return Ok(NativeScanSelection {
680 scan_path,
681 selected_paths,
682 collection_frontier: Vec::new(),
683 missing_entries: Vec::new(),
684 });
685 }
686
687 let scan_path = cli
688 .dir_path
689 .first()
690 .cloned()
691 .ok_or_else(|| anyhow!("--paths-file requires one positional scan root"))?;
692 let path_file_entries = load_paths_file_entries(&cli.paths_file)?;
693 let resolved = resolve_paths_file_entries(Path::new(&scan_path), &path_file_entries)?;
694 if resolved.selections.is_empty() {
695 return Err(anyhow!(
696 "--paths-file did not resolve to any existing files or directories under {:?}",
697 Path::new(&scan_path)
698 ));
699 }
700
701 Ok(NativeScanSelection {
702 scan_path,
703 selected_paths: resolved.selections,
704 collection_frontier: resolved.frontier,
705 missing_entries: resolved.missing_entries,
706 })
707}
708
709fn load_paths_file_entries(paths_files: &[String]) -> Result<Vec<String>> {
710 let mut entries = Vec::new();
711 for paths_file in paths_files {
712 let content = read_paths_file_content(paths_file)?;
713 entries.extend(content.lines().map(ToOwned::to_owned));
714 }
715 Ok(entries)
716}
717
718fn read_paths_file_content(paths_file: &str) -> Result<String> {
719 if paths_file == "-" {
720 let mut content = String::new();
721 std::io::stdin()
722 .read_to_string(&mut content)
723 .map_err(|err| anyhow!("Failed to read --paths-file from stdin: {err}"))?;
724 return Ok(content);
725 }
726
727 fs::read_to_string(paths_file)
728 .map_err(|err| anyhow!("Failed to read --paths-file {:?}: {err}", paths_file))
729}
730
/// Produces one user-facing warning line per unresolved `--paths-file` entry,
/// preserving the order of `missing_entries`.
fn build_paths_file_warning_messages(missing_entries: &[String]) -> Vec<String> {
    let mut warnings = Vec::with_capacity(missing_entries.len());
    for entry in missing_entries {
        warnings.push(format!("Skipping missing --paths-file entry: {entry}"));
    }
    warnings
}
737
738fn validate_scan_option_compatibility(cli: &Cli) -> Result<()> {
739 if cli.show_attribution {
740 return Ok(());
741 }
742
743 if cli.export_license_dataset.is_some() {
744 if !cli.dir_path.is_empty() || !cli.paths_file.is_empty() {
745 return Err(anyhow!(
746 "--export-license-dataset does not accept scan input paths or --paths-file"
747 ));
748 }
749
750 if cli.from_json
751 || cli.license
752 || cli.package
753 || cli.system_package
754 || cli.package_in_compiled
755 || cli.package_only
756 || cli.copyright
757 || cli.email
758 || cli.url
759 || cli.generated
760 || cli.info
761 || cli.incremental
762 || cli.reindex
763 || cli.no_license_index_cache
764 || cli.license_dataset_path.is_some()
765 {
766 return Err(anyhow!(
767 "--export-license-dataset is a standalone mode and cannot be combined with scan or license-index flags"
768 ));
769 }
770
771 return Ok(());
772 }
773
774 if cli.from_json
775 && (cli.package
776 || cli.system_package
777 || cli.package_in_compiled
778 || cli.package_only
779 || cli.copyright
780 || cli.email
781 || cli.url
782 || cli.generated)
783 {
784 return Err(anyhow!(
785 "When using --from-json, file scan options like --package/--copyright/--email/--url/--generated are not allowed"
786 ));
787 }
788
789 if cli.from_json && !cli.paths_file.is_empty() {
790 return Err(anyhow!(
791 "--paths-file is only supported for native scan mode, not --from-json"
792 ));
793 }
794
795 if cli.from_json && cli.incremental {
796 return Err(anyhow!(
797 "--incremental is only supported for directory scan mode, not --from-json"
798 ));
799 }
800
801 if !cli.paths_file.is_empty() && cli.dir_path.len() != 1 {
802 return Err(anyhow!(
803 "--paths-file requires exactly one positional scan root"
804 ));
805 }
806
807 if !cli.from_json && cli.dir_path.is_empty() {
808 return Err(anyhow!("Directory path is required for scan operations"));
809 }
810
811 if cli.tallies_by_facet && cli.facet.is_empty() {
812 return Err(anyhow!(
813 "--tallies-by-facet requires at least one --facet <facet>=<pattern> definition"
814 ));
815 }
816
817 if cli.mark_source && !cli.info {
818 return Err(anyhow!("--mark-source requires --info"));
819 }
820
821 Ok(())
822}
823
824fn prepare_cache_config(scan_root: Option<&Path>, cli: &Cli) -> Result<CacheConfig> {
825 let env_cache_dir = env::var_os(CACHE_DIR_ENV_VAR).map(PathBuf::from);
826 let config = CacheConfig::from_overrides(
827 scan_root,
828 cli.cache_dir.as_deref().map(Path::new),
829 env_cache_dir.as_deref(),
830 cli.incremental,
831 );
832
833 if cli.cache_clear {
834 crate::cache::locking::with_exclusive_cache_lock(config.root_dir(), || {
835 config.clear_contents()
836 })?;
837 }
838
839 if config.incremental_enabled() {
840 config.ensure_dirs()?;
841 }
842
843 Ok(config)
844}
845
846fn build_license_cache_config(cache_root: &CacheConfig, cli: &Cli) -> LicenseCacheConfig {
847 LicenseCacheConfig::new(
848 cache_root.root_dir().to_path_buf(),
849 cli.reindex,
850 !cli.no_license_index_cache,
851 )
852}
853
854fn partition_incremental_files(
855 collected_files: &mut Vec<(PathBuf, fs::Metadata)>,
856 scan_root: &Path,
857 manifest: Option<&IncrementalManifest>,
858) -> Vec<FileInfo> {
859 let Some(manifest) = manifest else {
860 return Vec::new();
861 };
862
863 let mut files_to_scan = Vec::new();
864 let mut reused_files = Vec::new();
865
866 for (path, metadata) in collected_files.drain(..) {
867 let relative_path = normalize_relative_scan_path(&path, scan_root);
868 let Some(entry) = manifest.entry(&relative_path) else {
869 files_to_scan.push((path, metadata));
870 continue;
871 };
872
873 match manifest_entry_matches_path(entry, &path, &metadata) {
874 Ok(true) => reused_files.push(entry.file_info.clone()),
875 Ok(false) | Err(_) => files_to_scan.push((path, metadata)),
876 }
877 }
878
879 *collected_files = files_to_scan;
880 reused_files
881}
882
883fn merge_incremental_file_results(
884 processed_files: Vec<FileInfo>,
885 reused_files: Vec<FileInfo>,
886 ordered_file_paths: &[PathBuf],
887) -> Vec<FileInfo> {
888 let mut processed_file_entries = HashMap::new();
889 let mut directory_entries = Vec::new();
890 for file in processed_files {
891 if file.file_type == FileType::File {
892 processed_file_entries.insert(file.path.clone(), file);
893 } else {
894 directory_entries.push(file);
895 }
896 }
897
898 let mut reused_file_entries: HashMap<_, _> = reused_files
899 .into_iter()
900 .map(|file| (file.path.clone(), file))
901 .collect();
902
903 let mut merged_files = Vec::new();
904 for path in ordered_file_paths {
905 let path_string = path.to_string_lossy().to_string();
906 if let Some(file) = processed_file_entries.remove(&path_string) {
907 merged_files.push(file);
908 continue;
909 }
910
911 if let Some(file) = reused_file_entries.remove(&path_string) {
912 merged_files.push(file);
913 }
914 }
915
916 merged_files.extend(processed_file_entries.into_values());
917 merged_files.extend(reused_file_entries.into_values());
918 merged_files.extend(directory_entries);
919 merged_files
920}
921
922fn build_incremental_manifest(
923 scan_root: &Path,
924 collected_files: &[(PathBuf, fs::Metadata)],
925 files: &[FileInfo],
926 options_fingerprint: &str,
927) -> IncrementalManifest {
928 let files_by_relative_path: HashMap<_, _> = files
929 .iter()
930 .filter(|file| file.file_type == FileType::File)
931 .map(|file| {
932 (
933 normalize_relative_scan_path(Path::new(&file.path), scan_root),
934 file.clone(),
935 )
936 })
937 .collect();
938
939 let entries = collected_files
940 .iter()
941 .filter_map(|(path, metadata)| {
942 let relative_path = normalize_relative_scan_path(path, scan_root);
943 let state = metadata_fingerprint(metadata)?;
944 let file_info = files_by_relative_path.get(&relative_path)?.clone();
945 let content_sha256 = file_info.sha256.unwrap_or_else(|| {
946 fs::read(path)
947 .map(|bytes| calculate_sha256(&bytes))
948 .unwrap_or_else(|_| {
949 Sha256Digest::from_hex(
950 "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
951 )
952 .unwrap()
953 })
954 });
955 Some((
956 relative_path,
957 IncrementalManifestEntry {
958 state,
959 content_sha256,
960 file_info,
961 },
962 ))
963 })
964 .collect::<BTreeMap<_, _>>();
965
966 IncrementalManifest::new(options_fingerprint.to_string(), entries)
967}
968
969fn incremental_manifest_key(scan_root: &Path, options_fingerprint: &str) -> String {
970 let canonical_root = fs::canonicalize(scan_root).unwrap_or_else(|_| scan_root.to_path_buf());
971 calculate_sha256(
972 format!(
973 "{}\n{options_fingerprint}",
974 canonical_root.to_string_lossy()
975 )
976 .as_bytes(),
977 )
978 .as_hex()
979}
980
/// Renders `path` relative to `scan_root` as a string with forward slashes.
///
/// If `path` does not start with `scan_root` the whole path is used. Every
/// backslash in the lossy string form is replaced by `/`.
fn normalize_relative_scan_path(path: &Path, scan_root: &Path) -> String {
    let relative = match path.strip_prefix(scan_root) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };
    let rendered = relative.to_string_lossy();
    rendered.replace('\\', "/")
}
987
988fn compile_regex_patterns(option_name: &str, patterns: &[String]) -> Result<Vec<Regex>> {
989 patterns
990 .iter()
991 .map(|pattern| {
992 Regex::new(pattern).map_err(|err| {
993 anyhow!("Invalid regex for {option_name} pattern \"{pattern}\": {err}")
994 })
995 })
996 .collect()
997}
998
999fn effective_timeout_seconds(process_mode: ProcessMode, timeout_seconds: f64) -> f64 {
1000 match process_mode {
1001 ProcessMode::SequentialWithoutTimeouts => 0.0,
1002 ProcessMode::Parallel(_) | ProcessMode::SequentialWithTimeouts => timeout_seconds,
1003 }
1004}
1005
1006fn process_mode_message(process_mode: ProcessMode) -> Option<&'static str> {
1007 match process_mode {
1008 ProcessMode::SequentialWithTimeouts => Some("Disabling multi-processing for debugging."),
1009 ProcessMode::SequentialWithoutTimeouts => {
1010 Some("Disabling multi-processing and multi-threading for debugging.")
1011 }
1012 ProcessMode::Parallel(_) => None,
1013 }
1014}
1015
1016fn progress_mode_from_cli(cli: &Cli) -> ProgressMode {
1017 if cli.quiet {
1018 ProgressMode::Quiet
1019 } else if cli.verbose {
1020 ProgressMode::Verbose
1021 } else {
1022 ProgressMode::Default
1023 }
1024}
1025
1026fn configured_scan_names(cli: &Cli) -> String {
1027 let mut names = Vec::new();
1028 if cli.license {
1029 names.push("licenses");
1030 }
1031 if cli.info {
1032 names.push("info");
1033 }
1034 if cli.package {
1035 names.push("packages");
1036 }
1037 if (cli.system_package || cli.package_in_compiled || cli.package_only)
1038 && !names.contains(&"packages")
1039 {
1040 names.push("packages");
1041 }
1042 if cli.copyright {
1043 names.push("copyrights");
1044 }
1045 if cli.email {
1046 names.push("emails");
1047 }
1048 if cli.url {
1049 names.push("urls");
1050 }
1051 names.join(", ")
1052}
1053
1054fn should_include_info_surface(files: &[crate::models::FileInfo], cli: &Cli) -> bool {
1055 cli.info
1056 || files.iter().any(|file| {
1057 file.date.is_some()
1058 || file.sha1.is_some()
1059 || file.md5.is_some()
1060 || file.sha256.is_some()
1061 || file.sha1_git.is_some()
1062 || file.mime_type.is_some()
1063 || file.file_type_label.is_some()
1064 || file.programming_language.is_some()
1065 || file.is_binary.is_some()
1066 || file.is_text.is_some()
1067 || file.is_archive.is_some()
1068 || file.is_media.is_some()
1069 || file.is_source.is_some()
1070 || file.is_script.is_some()
1071 || file.files_count.is_some()
1072 || file.dirs_count.is_some()
1073 || file.size_count.is_some()
1074 })
1075}
1076
1077fn record_detail_timing<T, F>(progress: &Arc<ScanProgress>, name: impl Into<String>, f: F) -> T
1078where
1079 F: FnOnce() -> T,
1080{
1081 let started = Instant::now();
1082 let result = f();
1083 progress.record_detail_timing(name.into(), started.elapsed().as_secs_f64());
1084 result
1085}
1086
1087fn run_with_thread_pool<T, F>(threads: usize, f: F) -> Result<T>
1088where
1089 F: FnOnce() -> Result<T> + Send,
1090 T: Send,
1091{
1092 let pool = rayon::ThreadPoolBuilder::new()
1093 .num_threads(threads.max(1))
1094 .build()?;
1095 pool.install(f)
1096}
1097
1098fn init_license_engine(cache_root: &CacheConfig, cli: &Cli) -> Result<Arc<LicenseDetectionEngine>> {
1099 let cache_config = build_license_cache_config(cache_root, cli);
1100
1101 match &cli.license_dataset_path {
1102 Some(p) => {
1103 let path = PathBuf::from(p);
1104 if !path.exists() {
1105 return Err(anyhow!("License dataset path does not exist: {:?}", path));
1106 }
1107 let engine = LicenseDetectionEngine::from_directory_with_cache(&path, &cache_config)?;
1108 Ok(Arc::new(engine))
1109 }
1110 None => {
1111 let engine = LicenseDetectionEngine::from_embedded_with_cache(&cache_config)?;
1112 Ok(Arc::new(engine))
1113 }
1114 }
1115}
1116
1117fn describe_license_engine_source(
1118 engine: &LicenseDetectionEngine,
1119 rules_path: Option<&str>,
1120) -> String {
1121 match rules_path {
1122 Some(path) => format!(
1123 "License detection engine initialized with {} rules from custom dataset {}",
1124 engine.index().rules_by_rid.len(),
1125 path
1126 ),
1127 None => format!(
1128 "License detection engine initialized with {} rules from embedded artifact",
1129 engine.index().rules_by_rid.len()
1130 ),
1131 }
1132}
1133
1134#[cfg(test)]
1135mod tests;