1use crate::assembly;
5use crate::cache::{
6 CACHE_DIR_ENV_VAR, CacheConfig, IncrementalManifest, IncrementalManifestEntry,
7 build_collection_exclude_patterns, incremental_manifest_path, load_incremental_manifest,
8 manifest_entry_matches_path, metadata_fingerprint, write_incremental_manifest,
9};
10use crate::cli::{Cli, Command, ProcessMode, ScanArgs};
11use crate::compare::compare_json_files;
12use crate::license_detection::LicenseDetectionEngine;
13use crate::license_detection::dataset::export_embedded_license_dataset;
14use crate::license_detection::license_cache::LicenseCacheConfig;
15use crate::models::{FileInfo, FileType, Sha256Digest};
16use crate::output::{OutputWriteConfig, write_output_file};
17use crate::post_processing::{
18 CreateOutputContext, CreateOutputOptions, DEFAULT_LICENSEDB_URL_TEMPLATE,
19 apply_license_policy_from_file, apply_package_reference_following, build_facet_rules,
20 collect_top_level_license_detections, collect_top_level_license_references, create_output,
21};
22use crate::progress::{ProgressMode, ScanProgress, format_default_scan_error};
23use crate::scan_result_shaping::{
24 SelectedPath, apply_cli_path_selection_filter, apply_ignore_resource_filter, apply_mark_source,
25 apply_only_findings_filter, apply_user_path_filters_to_collected, filter_redundant_clues,
26 filter_redundant_clues_with_rules, load_and_merge_json_inputs, normalize_paths,
27 normalize_top_level_output_paths, populate_info_resource_counts,
28 prepare_filter_clue_rule_lookup, resolve_native_scan_inputs, resolve_paths_file_entries,
29 trim_preloaded_assembly_to_files,
30};
31use crate::scanner::{
32 CollectionFrontier, LicenseScanOptions, TextDetectionOptions, collect_paths,
33 collect_selected_paths, process_collected_with_memory_limit,
34 process_collected_with_memory_limit_sequential, scan_options_fingerprint,
35};
36use crate::time::format_scancode_timestamp;
37use crate::utils::hash::calculate_sha256;
38use anyhow::{Result, anyhow};
39use chrono::Utc;
40use regex::Regex;
41use std::collections::{BTreeMap, HashMap};
42use std::env;
43use std::fs;
44use std::io::Read;
45use std::path::{Path, PathBuf};
46use std::sync::Arc;
47use std::time::Instant;
48
49pub fn run() -> Result<()> {
50 #[cfg(feature = "golden-tests")]
51 touch_license_golden_symbols();
52
53 let cli = Cli::parse();
54 match &cli.command {
55 Command::ShowAttribution => {
56 print!("{}", include_str!("../../../NOTICE"));
57 return Ok(());
58 }
59 Command::Compare(args) => {
60 let result = compare_json_files(
61 &args.scancode_json,
62 &args.provenant_json,
63 args.artifact_dir.as_deref(),
64 )?;
65 println!("Comparison status: {}", result.comparison_status);
66 println!("Artifacts:");
67 println!(" Artifact directory: {}", result.artifact_dir.display());
68 println!(" Run manifest: {}", result.manifest_path.display());
69 println!(" Raw ScanCode JSON: {}", result.scancode_json.display());
70 println!(" Raw Provenant JSON: {}", result.provenant_json.display());
71 println!(" Summary JSON: {}", result.summary_json.display());
72 println!(" Summary TSV: {}", result.summary_tsv.display());
73 println!(" Sample artifacts: {}", result.samples_dir.display());
74 return Ok(());
75 }
76 Command::ExportLicenseDataset(args) => {
77 export_embedded_license_dataset(Path::new(&args.dir))?;
78 return Ok(());
79 }
80 Command::Scan(_) => {}
81 }
82
83 let cli = cli
84 .scan_args()
85 .expect("scan arguments should exist after command dispatch");
86
87 validate_scan_option_compatibility(cli)?;
88
89 let start_time = Utc::now();
90 let progress = Arc::new(ScanProgress::new(progress_mode_from_cli(cli)));
91 progress.set_processes(cli.processes);
92 progress.set_scan_names(configured_scan_names(cli));
93 progress.init_logging_bridge();
94 let mut shared_license_cache_config: Option<LicenseCacheConfig> = None;
95
96 progress.start_setup();
97 let facet_rules = build_facet_rules(&cli.facet)?;
98
99 let ignore_author_patterns = compile_regex_patterns("--ignore-author", &cli.ignore_author)?;
100 let ignore_copyright_holder_patterns =
101 compile_regex_patterns("--ignore-copyright-holder", &cli.ignore_copyright_holder)?;
102 progress.finish_setup();
103
104 progress.start_discovery();
105
106 let mut shared_cache_config = if cli.from_json {
107 let cache_config = prepare_cache_config(None, cli)?;
108 shared_license_cache_config = Some(build_license_cache_config(&cache_config, cli));
109 Some(cache_config)
110 } else {
111 None
112 };
113
114 let (
115 mut scan_result,
116 total_dirs,
117 mut preloaded_assembly,
118 preloaded_license_detections,
119 preloaded_license_references,
120 preloaded_license_rule_references,
121 preloaded_extra_errors,
122 extra_warnings,
123 imported_spdx_license_list_version,
124 imported_license_index_provenance,
125 mut active_license_engine,
126 ) = if cli.from_json {
127 let loaded = load_and_merge_json_inputs(&cli.dir_path, cli.strip_root, cli.full_root)?;
128 let directories_count = loaded.directory_count();
129 let files_count = loaded.file_count();
130 let size_count = loaded.file_size_count();
131 progress.finish_discovery(
132 files_count,
133 directories_count,
134 size_count,
135 loaded.excluded_count,
136 );
137 let (
138 process_result,
139 assembly_result,
140 license_detections,
141 license_references,
142 license_rule_references,
143 extra_errors,
144 imported_spdx_license_list_version,
145 imported_license_index_provenance,
146 ) = loaded.into_parts()?;
147 (
148 process_result,
149 directories_count,
150 assembly_result,
151 license_detections,
152 license_references,
153 license_rule_references,
154 extra_errors,
155 Vec::new(),
156 imported_spdx_license_list_version,
157 imported_license_index_provenance,
158 None,
159 )
160 } else {
161 let NativeScanSelection {
162 scan_path,
163 selected_paths,
164 collection_frontier,
165 missing_entries: missing_paths_file_entries,
166 } = resolve_native_scan_selection(cli)?;
167 let paths_file_warnings = build_paths_file_warning_messages(&missing_paths_file_entries);
168 for warning in &paths_file_warnings {
169 progress.output_written(warning);
170 }
171
172 let cache_config = prepare_cache_config(Some(Path::new(&scan_path)), cli)?;
173 shared_license_cache_config = Some(build_license_cache_config(&cache_config, cli));
174 shared_cache_config = Some(cache_config.clone());
175 let collection_exclude_patterns =
176 build_collection_exclude_patterns(Path::new(&scan_path), cache_config.root_dir());
177
178 let mut collected = if cli.paths_file.is_empty() {
179 collect_paths(&scan_path, cli.max_depth, &collection_exclude_patterns)
180 } else {
181 collect_selected_paths(
182 Path::new(&scan_path),
183 &collection_frontier,
184 cli.max_depth,
185 &collection_exclude_patterns,
186 )
187 };
188 let user_excluded_count = apply_user_path_filters_to_collected(
189 &mut collected,
190 Path::new(&scan_path),
191 &selected_paths,
192 &cli.include,
193 &cli.exclude,
194 );
195 let total_files = collected.file_count();
196 let total_dirs = collected.directory_count();
197 let total_size = collected.total_file_bytes;
198 let excluded_count = collected.excluded_count + user_excluded_count;
199 let all_collected_files = collected.files.clone();
200 let ordered_file_paths: Vec<PathBuf> = collected
201 .files
202 .iter()
203 .map(|(path, _)| path.clone())
204 .collect();
205 let runtime_errors = collected
206 .collection_errors
207 .iter()
208 .map(|(path, err)| format_default_scan_error(path, err))
209 .collect();
210 for (path, err) in &collected.collection_errors {
211 progress.record_runtime_error(path, err);
212 }
213 progress.finish_discovery(total_files, total_dirs, total_size, excluded_count);
214 if !cli.quiet {
215 progress.output_written(&format!(
216 "Found {} files in {} directories ({} items excluded)",
217 total_files, total_dirs, excluded_count
218 ));
219 }
220
221 let license_engine = if cli.license {
222 progress.start_setup();
223 progress.start_license_detection_engine_creation();
224 let engine = init_license_engine(
225 shared_cache_config
226 .as_ref()
227 .expect("cache config should be prepared before license engine init"),
228 cli,
229 )?;
230 progress.finish_license_detection_engine_creation("setup_scan:licenses");
231 progress.finish_setup();
232 progress.output_written(&describe_license_engine_source(
233 &engine,
234 cli.license_dataset_path.as_deref(),
235 ));
236 Some(engine)
237 } else {
238 None
239 };
240
241 let enable_application_packages = cli.package || cli.package_only;
242 let enable_system_packages = cli.system_package || cli.package_only;
243 let enable_packages =
244 enable_application_packages || enable_system_packages || cli.package_in_compiled;
245 let (detect_copyrights, detect_emails, detect_urls, detect_generated) = if cli.package_only
246 {
247 (false, cli.email, cli.url, cli.generated)
248 } else {
249 (cli.copyright, cli.email, cli.url, cli.generated)
250 };
251 let process_mode = cli.processes;
252
253 let text_options = TextDetectionOptions {
254 collect_info: cli.info,
255 detect_packages: enable_packages,
256 detect_application_packages: enable_application_packages,
257 detect_system_packages: enable_system_packages,
258 detect_packages_in_compiled: cli.package_in_compiled,
259 detect_copyrights,
260 detect_generated,
261 detect_emails,
262 detect_urls,
263 max_emails: cli.max_email,
264 max_urls: cli.max_url,
265 timeout_seconds: effective_timeout_seconds(process_mode, cli.timeout),
266 };
267
268 let license_options = LicenseScanOptions {
269 include_text: cli.license_text,
270 include_text_diagnostics: cli.license_text_diagnostics,
271 include_diagnostics: cli.license_diagnostics,
272 unknown_licenses: cli.unknown_licenses,
273 min_score: cli.license_score,
274 };
275 let options_fingerprint =
276 scan_options_fingerprint(&text_options, license_options, license_engine.as_deref());
277
278 if cli.incremental {
279 let manifest_path = incremental_manifest_path(
280 cache_config.root_dir(),
281 &incremental_manifest_key(Path::new(&scan_path), &options_fingerprint),
282 );
283 let previous_manifest =
284 load_incremental_manifest(&manifest_path, &options_fingerprint)?;
285 let reused_files = partition_incremental_files(
286 &mut collected.files,
287 Path::new(&scan_path),
288 previous_manifest.as_ref(),
289 );
290 progress.record_incremental_reused(reused_files.len());
291 }
292
293 if let Some(message) = process_mode_message(process_mode) {
294 progress.output_written(message);
295 }
296 progress.start_scan(collected.file_count());
297 let mut result = match process_mode {
298 ProcessMode::Parallel(thread_count) => run_with_thread_pool(thread_count, || {
299 Ok(process_collected_with_memory_limit(
300 &collected,
301 Arc::clone(&progress),
302 license_engine.clone(),
303 license_options,
304 &text_options,
305 cli.max_in_memory,
306 ))
307 })?,
308 ProcessMode::SequentialWithTimeouts | ProcessMode::SequentialWithoutTimeouts => {
309 process_collected_with_memory_limit_sequential(
310 &collected,
311 Arc::clone(&progress),
312 license_engine.clone(),
313 license_options,
314 &text_options,
315 cli.max_in_memory,
316 )
317 }
318 };
319
320 if cli.incremental {
321 let manifest_path = incremental_manifest_path(
322 cache_config.root_dir(),
323 &incremental_manifest_key(Path::new(&scan_path), &options_fingerprint),
324 );
325 let reused_files = partition_incremental_files(
326 &mut all_collected_files.clone(),
327 Path::new(&scan_path),
328 load_incremental_manifest(&manifest_path, &options_fingerprint)?.as_ref(),
329 );
330 result.files =
331 merge_incremental_file_results(result.files, reused_files, &ordered_file_paths);
332
333 let manifest = build_incremental_manifest(
334 Path::new(&scan_path),
335 &all_collected_files,
336 &result.files,
337 &options_fingerprint,
338 );
339 write_incremental_manifest(cache_config.root_dir(), &manifest_path, &manifest)?;
340 }
341
342 result.excluded_count = excluded_count;
343 progress.finish_scan();
344
345 (
346 result,
347 total_dirs,
348 assembly::AssemblyResult {
349 packages: Vec::new(),
350 dependencies: Vec::new(),
351 },
352 Vec::new(),
353 Vec::new(),
354 Vec::new(),
355 runtime_errors,
356 paths_file_warnings,
357 None,
358 None,
359 license_engine,
360 )
361 };
362
363 progress.start_post_scan();
364
365 if cli.filter_clues {
366 progress.post_scan_step("Filtering redundant clues...");
367 let clue_rule_lookup = record_detail_timing(&progress, "post-scan:filter-clues", || {
368 prepare_filter_clue_rule_lookup(
369 &scan_result.files,
370 active_license_engine.as_deref(),
371 cli.license_dataset_path.as_deref(),
372 shared_license_cache_config.as_ref(),
373 )
374 })?;
375 if let Some(clue_rule_lookup) = clue_rule_lookup.as_ref() {
376 filter_redundant_clues_with_rules(&mut scan_result.files, Some(clue_rule_lookup));
377 } else {
378 filter_redundant_clues(&mut scan_result.files);
379 }
380 }
381
382 if !ignore_author_patterns.is_empty() || !ignore_copyright_holder_patterns.is_empty() {
383 progress.post_scan_step("Applying ignore-resource filters...");
384 record_detail_timing(&progress, "post-scan:ignore-resource", || {
385 apply_ignore_resource_filter(
386 &mut scan_result.files,
387 &ignore_copyright_holder_patterns,
388 &ignore_author_patterns,
389 );
390 });
391 }
392
393 if cli.from_json && (!cli.include.is_empty() || !cli.exclude.is_empty()) {
394 progress.post_scan_step("Applying path selection filters...");
395 record_detail_timing(&progress, "output-filter:path-selection", || {
396 apply_cli_path_selection_filter(&mut scan_result.files, &cli.include, &cli.exclude);
397 });
398 }
399
400 if cli.only_findings {
401 progress.post_scan_step("Filtering to resources with findings...");
402 record_detail_timing(&progress, "output-filter:only-findings", || {
403 apply_only_findings_filter(&mut scan_result.files);
404 });
405 }
406
407 if cli.info && cli.mark_source {
408 progress.post_scan_step("Marking source files...");
409 record_detail_timing(&progress, "post-scan:mark-source", || {
410 apply_mark_source(&mut scan_result.files);
411 });
412 }
413
414 if should_include_info_surface(&scan_result.files, cli) {
415 progress.post_scan_step("Populating info resource counts...");
416 record_detail_timing(&progress, "post-scan:info-resource-counts", || {
417 populate_info_resource_counts(&mut scan_result.files);
418 });
419 }
420
421 progress.post_scan_step("Backfilling license provenance...");
422 record_detail_timing(&progress, "post-scan:license-provenance", || {
423 for file in &mut scan_result.files {
424 file.backfill_license_provenance();
425 }
426 });
427
428 if cli.from_json {
429 for err in &preloaded_extra_errors {
430 progress.record_additional_error(err);
431 }
432 }
433
434 let mut extra_errors = preloaded_extra_errors;
435 if let Some(policy_path) = cli.license_policy.as_deref() {
436 progress.post_scan_step("Applying license policy...");
437 let license_policy_errors =
438 record_detail_timing(&progress, "post-scan:license-policy", || {
439 apply_license_policy_from_file(&mut scan_result.files, Path::new(policy_path))
440 })?;
441 for err in &license_policy_errors {
442 progress.record_additional_error(err);
443 }
444 extra_errors.extend(license_policy_errors);
445 }
446
447 if cli.from_json {
448 progress.post_scan_step("Trimming preloaded assembly to filtered files...");
449 record_detail_timing(&progress, "post-scan:trim-preloaded-assembly", || {
450 trim_preloaded_assembly_to_files(
451 &scan_result.files,
452 &mut preloaded_assembly.packages,
453 &mut preloaded_assembly.dependencies,
454 );
455 });
456 }
457
458 progress.finish_post_scan();
459
460 let manifests_seen = scan_result
461 .files
462 .iter()
463 .map(|file| file.package_data.len())
464 .sum();
465 let skip_assembly = cli.no_assemble || cli.package_only;
466
467 let mut assembly_result = if skip_assembly {
468 assembly::AssemblyResult {
469 packages: Vec::new(),
470 dependencies: Vec::new(),
471 }
472 } else {
473 progress.start_assembly();
474
475 let mut result = if cli.from_json
476 && (!preloaded_assembly.packages.is_empty()
477 || !preloaded_assembly.dependencies.is_empty())
478 {
479 progress.assembly_step("Using preloaded assembly...");
480 preloaded_assembly
481 } else {
482 assembly::assemble(&mut scan_result.files)
483 };
484
485 progress.assembly_step("Backfilling package license provenance...");
486 record_detail_timing(&progress, "assembly:package-license-provenance", || {
487 for package in &mut result.packages {
488 package.backfill_license_provenance();
489 }
490 });
491
492 progress.assembly_step("Applying package reference following...");
493 record_detail_timing(&progress, "assembly:package-reference-following", || {
494 apply_package_reference_following(&mut scan_result.files, &mut result.packages);
495 });
496
497 progress.finish_assembly(result.packages.len(), manifests_seen);
498 result
499 };
500
501 progress.start_finalize();
502
503 if !cli.from_json && (cli.strip_root || cli.full_root) {
504 let root_path = cli
505 .dir_path
506 .first()
507 .ok_or_else(|| anyhow!("No input path available for path normalization"))?;
508 progress.finalize_step("Normalizing paths...");
509 record_detail_timing(&progress, "finalize:path-normalization", || {
510 normalize_paths(
511 &mut scan_result.files,
512 root_path,
513 cli.strip_root,
514 cli.full_root,
515 );
516 normalize_top_level_output_paths(
517 &mut assembly_result.packages,
518 &mut assembly_result.dependencies,
519 root_path,
520 cli.strip_root,
521 );
522 });
523 }
524
525 progress.finalize_step("Collecting license detections...");
526 let license_detections = record_detail_timing(&progress, "finalize:license-detections", || {
527 let preserve_preloaded_top_level_detections = cli.from_json
528 && (cli.only_findings || !cli.include.is_empty() || !cli.exclude.is_empty());
529 collect_top_level_license_detections_for_mode(
530 &scan_result.files,
531 preloaded_license_detections,
532 preserve_preloaded_top_level_detections,
533 cli.from_json && cli.dir_path.len() > 1,
534 )
535 });
536
537 let should_recompute_license_references = cli.from_json
538 && (!preloaded_license_references.is_empty()
539 || !preloaded_license_rule_references.is_empty()
540 || cli.license_references
541 || (cli.license_url_template != DEFAULT_LICENSEDB_URL_TEMPLATE
542 && !preloaded_license_references.is_empty()));
543
544 if should_recompute_license_references && active_license_engine.is_none() {
545 progress.start_license_detection_engine_creation();
546 active_license_engine = Some(init_license_engine(
547 shared_cache_config
548 .as_ref()
549 .expect("cache config should be prepared before license engine init"),
550 cli,
551 )?);
552 progress.finish_license_detection_engine_creation("finalize:license-engine-creation");
553 }
554
555 progress.finalize_step("Collecting license references...");
556 let (license_references, license_rule_references) =
557 record_detail_timing(&progress, "finalize:license-references", || {
558 if cli.from_json && !should_recompute_license_references {
559 (
560 preloaded_license_references,
561 preloaded_license_rule_references,
562 )
563 } else if cli.license_references || should_recompute_license_references {
564 if let Some(engine) = active_license_engine.as_deref() {
565 collect_top_level_license_references(
566 &scan_result.files,
567 &assembly_result.packages,
568 engine.index(),
569 &cli.license_url_template,
570 )
571 } else {
572 (Vec::new(), Vec::new())
573 }
574 } else {
575 (Vec::new(), Vec::new())
576 }
577 });
578
579 let end_time = Utc::now();
580 let spdx_license_list_version = active_license_engine
581 .as_ref()
582 .and_then(|engine| engine.spdx_license_list_version().map(ToOwned::to_owned))
583 .or(imported_spdx_license_list_version)
584 .unwrap_or(LicenseDetectionEngine::embedded_spdx_license_list_version()?);
585 let license_index_provenance = active_license_engine
586 .as_ref()
587 .and_then(|engine| engine.license_index_provenance().cloned())
588 .or(imported_license_index_provenance);
589
590 progress.finalize_step("Preparing output...");
591 let output = record_detail_timing(&progress, "finalize:output-prepare", || {
592 create_output(
593 start_time,
594 end_time,
595 scan_result,
596 CreateOutputContext {
597 total_dirs,
598 assembly_result,
599 license_detections,
600 license_references,
601 license_rule_references,
602 spdx_license_list_version,
603 license_index_provenance,
604 extra_errors,
605 extra_warnings,
606 header_options: cli.output_header_options(),
607 options: CreateOutputOptions {
608 facet_rules: &facet_rules,
609 include_classify: cli.classify,
610 include_summary: cli.summary,
611 include_license_clarity_score: cli.license_clarity_score,
612 include_tallies: cli.tallies,
613 include_tallies_of_key_files: cli.tallies_key_files,
614 include_tallies_with_details: cli.tallies_with_details,
615 include_tallies_by_facet: cli.tallies_by_facet,
616 include_generated: cli.generated,
617 verbose: cli.verbose,
618 },
619 },
620 )
621 });
622 progress.finish_finalize();
623
624 let output_schema_output = crate::output_schema::Output::from(&output);
625 progress.start_output();
626 for target in cli.output_targets() {
627 let output_config = OutputWriteConfig {
628 format: target.format,
629 custom_template: target.custom_template.clone(),
630 scanned_path: if cli.dir_path.len() == 1 {
631 cli.dir_path.first().cloned()
632 } else {
633 None
634 },
635 };
636
637 let timing_name = format!("output:{:?}", target.format).to_lowercase();
638 record_detail_timing(&progress, timing_name, || {
639 write_output_file(&target.file, &output_schema_output, &output_config)
640 })?;
641 progress.output_written(&format!(
642 "{:?} output written to {}",
643 target.format, target.file
644 ));
645 }
646 progress.record_final_counts(&output.files);
647 progress.record_final_header_counts(&output.headers);
648 progress.finish_output();
649
650 let summary_end = Utc::now();
651 progress.display_summary(
652 &format_scancode_timestamp(&start_time),
653 &format_scancode_timestamp(&summary_end),
654 );
655
656 Ok(())
657}
658
659fn collect_top_level_license_detections_for_mode(
660 files: &[FileInfo],
661 preloaded: Vec<crate::models::TopLevelLicenseDetection>,
662 preserve_preloaded: bool,
663 clear_for_multi_input_replay: bool,
664) -> Vec<crate::models::TopLevelLicenseDetection> {
665 if clear_for_multi_input_replay {
666 Vec::new()
667 } else if preserve_preloaded {
668 preloaded
669 } else {
670 collect_top_level_license_detections(files)
671 }
672}
673
/// References the golden-test helper functions so that they count as used
/// when the `golden-tests` feature is enabled. Nothing is called.
#[cfg(feature = "golden-tests")]
fn touch_license_golden_symbols() {
    let _ = (
        crate::license_detection::golden_utils::read_golden_input_content,
        crate::license_detection::golden_utils::detect_matches_for_golden,
        crate::license_detection::golden_utils::detect_license_expressions_for_golden,
        crate::license_detection::LicenseDetectionEngine::detect_matches_with_kind,
    );
}
681
682#[derive(Debug)]
683struct NativeScanSelection {
684 scan_path: String,
685 selected_paths: Vec<SelectedPath>,
686 collection_frontier: Vec<CollectionFrontier>,
687 missing_entries: Vec<String>,
688}
689
690fn resolve_native_scan_selection(cli: &ScanArgs) -> Result<NativeScanSelection> {
691 if cli.paths_file.is_empty() {
692 let (scan_path, selected_paths) = resolve_native_scan_inputs(&cli.dir_path)?;
693 return Ok(NativeScanSelection {
694 scan_path,
695 selected_paths,
696 collection_frontier: Vec::new(),
697 missing_entries: Vec::new(),
698 });
699 }
700
701 let scan_path = cli
702 .dir_path
703 .first()
704 .cloned()
705 .ok_or_else(|| anyhow!("--paths-file requires one positional scan root"))?;
706 let path_file_entries = load_paths_file_entries(&cli.paths_file)?;
707 let resolved = resolve_paths_file_entries(Path::new(&scan_path), &path_file_entries)?;
708 if resolved.selections.is_empty() {
709 return Err(anyhow!(
710 "--paths-file did not resolve to any existing files or directories under {:?}",
711 Path::new(&scan_path)
712 ));
713 }
714
715 Ok(NativeScanSelection {
716 scan_path,
717 selected_paths: resolved.selections,
718 collection_frontier: resolved.frontier,
719 missing_entries: resolved.missing_entries,
720 })
721}
722
723fn load_paths_file_entries(paths_files: &[String]) -> Result<Vec<String>> {
724 let mut entries = Vec::new();
725 for paths_file in paths_files {
726 let content = read_paths_file_content(paths_file)?;
727 entries.extend(content.lines().map(ToOwned::to_owned));
728 }
729 Ok(entries)
730}
731
732fn read_paths_file_content(paths_file: &str) -> Result<String> {
733 if paths_file == "-" {
734 let mut content = String::new();
735 std::io::stdin()
736 .read_to_string(&mut content)
737 .map_err(|err| anyhow!("Failed to read --paths-file from stdin: {err}"))?;
738 return Ok(content);
739 }
740
741 fs::read_to_string(paths_file)
742 .map_err(|err| anyhow!("Failed to read --paths-file {:?}: {err}", paths_file))
743}
744
/// Produces one warning line per missing `--paths-file` entry, in input
/// order.
fn build_paths_file_warning_messages(missing_entries: &[String]) -> Vec<String> {
    let mut warnings = Vec::with_capacity(missing_entries.len());
    for entry in missing_entries {
        warnings.push(format!("Skipping missing --paths-file entry: {entry}"));
    }
    warnings
}
751
752fn validate_scan_option_compatibility(cli: &ScanArgs) -> Result<()> {
753 if cli.from_json
754 && (cli.package
755 || cli.system_package
756 || cli.package_in_compiled
757 || cli.package_only
758 || cli.copyright
759 || cli.email
760 || cli.url
761 || cli.generated)
762 {
763 return Err(anyhow!(
764 "When using --from-json, file scan options like --package/--copyright/--email/--url/--generated are not allowed"
765 ));
766 }
767
768 if cli.from_json && !cli.paths_file.is_empty() {
769 return Err(anyhow!(
770 "--paths-file is only supported for native scan mode, not --from-json"
771 ));
772 }
773
774 if cli.from_json && cli.incremental {
775 return Err(anyhow!(
776 "--incremental is only supported for directory scan mode, not --from-json"
777 ));
778 }
779
780 if !cli.paths_file.is_empty() && cli.dir_path.len() != 1 {
781 return Err(anyhow!(
782 "--paths-file requires exactly one positional scan root"
783 ));
784 }
785
786 if !cli.from_json && cli.dir_path.is_empty() {
787 return Err(anyhow!("Directory path is required for scan operations"));
788 }
789
790 if cli.tallies_by_facet && cli.facet.is_empty() {
791 return Err(anyhow!(
792 "--tallies-by-facet requires at least one --facet <facet>=<pattern> definition"
793 ));
794 }
795
796 if cli.mark_source && !cli.info {
797 return Err(anyhow!("--mark-source requires --info"));
798 }
799
800 Ok(())
801}
802
803fn prepare_cache_config(scan_root: Option<&Path>, cli: &ScanArgs) -> Result<CacheConfig> {
804 let env_cache_dir = env::var_os(CACHE_DIR_ENV_VAR).map(PathBuf::from);
805 let config = CacheConfig::from_overrides(
806 scan_root,
807 cli.cache_dir.as_deref().map(Path::new),
808 env_cache_dir.as_deref(),
809 cli.incremental,
810 );
811
812 if cli.cache_clear {
813 crate::cache::locking::with_exclusive_cache_lock(config.root_dir(), || {
814 config.clear_contents()
815 })?;
816 }
817
818 if config.incremental_enabled() {
819 config.ensure_dirs()?;
820 }
821
822 Ok(config)
823}
824
825fn build_license_cache_config(cache_root: &CacheConfig, cli: &ScanArgs) -> LicenseCacheConfig {
826 LicenseCacheConfig::new(
827 cache_root.root_dir().to_path_buf(),
828 cli.reindex,
829 !cli.no_license_index_cache,
830 )
831}
832
833fn partition_incremental_files(
834 collected_files: &mut Vec<(PathBuf, fs::Metadata)>,
835 scan_root: &Path,
836 manifest: Option<&IncrementalManifest>,
837) -> Vec<FileInfo> {
838 let Some(manifest) = manifest else {
839 return Vec::new();
840 };
841
842 let mut files_to_scan = Vec::new();
843 let mut reused_files = Vec::new();
844
845 for (path, metadata) in collected_files.drain(..) {
846 let relative_path = normalize_relative_scan_path(&path, scan_root);
847 let Some(entry) = manifest.entry(&relative_path) else {
848 files_to_scan.push((path, metadata));
849 continue;
850 };
851
852 match manifest_entry_matches_path(entry, &path, &metadata) {
853 Ok(true) => reused_files.push(entry.file_info.clone()),
854 Ok(false) | Err(_) => files_to_scan.push((path, metadata)),
855 }
856 }
857
858 *collected_files = files_to_scan;
859 reused_files
860}
861
862fn merge_incremental_file_results(
863 processed_files: Vec<FileInfo>,
864 reused_files: Vec<FileInfo>,
865 ordered_file_paths: &[PathBuf],
866) -> Vec<FileInfo> {
867 let mut processed_file_entries = HashMap::new();
868 let mut directory_entries = Vec::new();
869 for file in processed_files {
870 if file.file_type == FileType::File {
871 processed_file_entries.insert(file.path.clone(), file);
872 } else {
873 directory_entries.push(file);
874 }
875 }
876
877 let mut reused_file_entries: HashMap<_, _> = reused_files
878 .into_iter()
879 .map(|file| (file.path.clone(), file))
880 .collect();
881
882 let mut merged_files = Vec::new();
883 for path in ordered_file_paths {
884 let path_string = path.to_string_lossy().to_string();
885 if let Some(file) = processed_file_entries.remove(&path_string) {
886 merged_files.push(file);
887 continue;
888 }
889
890 if let Some(file) = reused_file_entries.remove(&path_string) {
891 merged_files.push(file);
892 }
893 }
894
895 merged_files.extend(processed_file_entries.into_values());
896 merged_files.extend(reused_file_entries.into_values());
897 merged_files.extend(directory_entries);
898 merged_files
899}
900
901fn build_incremental_manifest(
902 scan_root: &Path,
903 collected_files: &[(PathBuf, fs::Metadata)],
904 files: &[FileInfo],
905 options_fingerprint: &str,
906) -> IncrementalManifest {
907 let files_by_relative_path: HashMap<_, _> = files
908 .iter()
909 .filter(|file| file.file_type == FileType::File)
910 .map(|file| {
911 (
912 normalize_relative_scan_path(Path::new(&file.path), scan_root),
913 file.clone(),
914 )
915 })
916 .collect();
917
918 let entries = collected_files
919 .iter()
920 .filter_map(|(path, metadata)| {
921 let relative_path = normalize_relative_scan_path(path, scan_root);
922 let state = metadata_fingerprint(metadata)?;
923 let file_info = files_by_relative_path.get(&relative_path)?.clone();
924 let content_sha256 = file_info.sha256.unwrap_or_else(|| {
925 fs::read(path)
926 .map(|bytes| calculate_sha256(&bytes))
927 .unwrap_or_else(|_| {
928 Sha256Digest::from_hex(
929 "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
930 )
931 .unwrap()
932 })
933 });
934 Some((
935 relative_path,
936 IncrementalManifestEntry {
937 state,
938 content_sha256,
939 file_info,
940 },
941 ))
942 })
943 .collect::<BTreeMap<_, _>>();
944
945 IncrementalManifest::new(options_fingerprint.to_string(), entries)
946}
947
948fn incremental_manifest_key(scan_root: &Path, options_fingerprint: &str) -> String {
949 let canonical_root = fs::canonicalize(scan_root).unwrap_or_else(|_| scan_root.to_path_buf());
950 calculate_sha256(
951 format!(
952 "{}\n{options_fingerprint}",
953 canonical_root.to_string_lossy()
954 )
955 .as_bytes(),
956 )
957 .as_hex()
958}
959
/// Expresses `path` relative to `scan_root` as a string with forward
/// slashes. A path that does not start with `scan_root` is kept whole;
/// backslashes are replaced in either case.
fn normalize_relative_scan_path(path: &Path, scan_root: &Path) -> String {
    let relative = match path.strip_prefix(scan_root) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };
    relative.to_string_lossy().replace('\\', "/")
}
966
967fn compile_regex_patterns(option_name: &str, patterns: &[String]) -> Result<Vec<Regex>> {
968 patterns
969 .iter()
970 .map(|pattern| {
971 Regex::new(pattern).map_err(|err| {
972 anyhow!("Invalid regex for {option_name} pattern \"{pattern}\": {err}")
973 })
974 })
975 .collect()
976}
977
978fn effective_timeout_seconds(process_mode: ProcessMode, timeout_seconds: f64) -> f64 {
979 match process_mode {
980 ProcessMode::SequentialWithoutTimeouts => 0.0,
981 ProcessMode::Parallel(_) | ProcessMode::SequentialWithTimeouts => timeout_seconds,
982 }
983}
984
985fn process_mode_message(process_mode: ProcessMode) -> Option<&'static str> {
986 match process_mode {
987 ProcessMode::SequentialWithTimeouts => Some("Disabling multi-processing for debugging."),
988 ProcessMode::SequentialWithoutTimeouts => {
989 Some("Disabling multi-processing and multi-threading for debugging.")
990 }
991 ProcessMode::Parallel(_) => None,
992 }
993}
994
995fn progress_mode_from_cli(cli: &ScanArgs) -> ProgressMode {
996 if cli.quiet {
997 ProgressMode::Quiet
998 } else if cli.verbose {
999 ProgressMode::Verbose
1000 } else {
1001 ProgressMode::Default
1002 }
1003}
1004
1005fn configured_scan_names(cli: &ScanArgs) -> String {
1006 let mut names = Vec::new();
1007 if cli.license {
1008 names.push("licenses");
1009 }
1010 if cli.info {
1011 names.push("info");
1012 }
1013 if cli.package {
1014 names.push("packages");
1015 }
1016 if (cli.system_package || cli.package_in_compiled || cli.package_only)
1017 && !names.contains(&"packages")
1018 {
1019 names.push("packages");
1020 }
1021 if cli.copyright {
1022 names.push("copyrights");
1023 }
1024 if cli.email {
1025 names.push("emails");
1026 }
1027 if cli.url {
1028 names.push("urls");
1029 }
1030 names.join(", ")
1031}
1032
1033fn should_include_info_surface(files: &[crate::models::FileInfo], cli: &ScanArgs) -> bool {
1034 cli.info
1035 || files.iter().any(|file| {
1036 file.date.is_some()
1037 || file.sha1.is_some()
1038 || file.md5.is_some()
1039 || file.sha256.is_some()
1040 || file.sha1_git.is_some()
1041 || file.mime_type.is_some()
1042 || file.file_type_label.is_some()
1043 || file.programming_language.is_some()
1044 || file.is_binary.is_some()
1045 || file.is_text.is_some()
1046 || file.is_archive.is_some()
1047 || file.is_media.is_some()
1048 || file.is_source.is_some()
1049 || file.is_script.is_some()
1050 || file.files_count.is_some()
1051 || file.dirs_count.is_some()
1052 || file.size_count.is_some()
1053 })
1054}
1055
1056fn record_detail_timing<T, F>(progress: &Arc<ScanProgress>, name: impl Into<String>, f: F) -> T
1057where
1058 F: FnOnce() -> T,
1059{
1060 let started = Instant::now();
1061 let result = f();
1062 progress.record_detail_timing(name.into(), started.elapsed().as_secs_f64());
1063 result
1064}
1065
1066fn run_with_thread_pool<T, F>(threads: usize, f: F) -> Result<T>
1067where
1068 F: FnOnce() -> Result<T> + Send,
1069 T: Send,
1070{
1071 let pool = rayon::ThreadPoolBuilder::new()
1072 .num_threads(threads.max(1))
1073 .build()?;
1074 pool.install(f)
1075}
1076
1077fn init_license_engine(
1078 cache_root: &CacheConfig,
1079 cli: &ScanArgs,
1080) -> Result<Arc<LicenseDetectionEngine>> {
1081 let cache_config = build_license_cache_config(cache_root, cli);
1082
1083 match &cli.license_dataset_path {
1084 Some(p) => {
1085 let path = PathBuf::from(p);
1086 if !path.exists() {
1087 return Err(anyhow!("License dataset path does not exist: {:?}", path));
1088 }
1089 let engine = LicenseDetectionEngine::from_directory_with_cache(&path, &cache_config)?;
1090 Ok(Arc::new(engine))
1091 }
1092 None => {
1093 let engine = LicenseDetectionEngine::from_embedded_with_cache(&cache_config)?;
1094 Ok(Arc::new(engine))
1095 }
1096 }
1097}
1098
1099fn describe_license_engine_source(
1100 engine: &LicenseDetectionEngine,
1101 rules_path: Option<&str>,
1102) -> String {
1103 match rules_path {
1104 Some(path) => format!(
1105 "License detection engine initialized with {} rules from custom dataset {}",
1106 engine.index().rules_by_rid.len(),
1107 path
1108 ),
1109 None => format!(
1110 "License detection engine initialized with {} rules from embedded artifact",
1111 engine.index().rules_by_rid.len()
1112 ),
1113 }
1114}
1115
1116#[cfg(test)]
1117mod tests;