1use crate::assembly;
5use crate::cache::{
6 CACHE_DIR_ENV_VAR, CacheConfig, IncrementalManifest, IncrementalManifestEntry,
7 build_collection_exclude_patterns, incremental_manifest_path, load_incremental_manifest,
8 manifest_entry_matches_path, metadata_fingerprint, write_incremental_manifest,
9};
10use crate::cli::{Cli, ProcessMode};
11use crate::license_detection::LicenseDetectionEngine;
12use crate::license_detection::dataset::export_embedded_license_dataset;
13use crate::license_detection::license_cache::LicenseCacheConfig;
14use crate::models::{FileInfo, FileType, Sha256Digest};
15use crate::output::{OutputWriteConfig, write_output_file};
16use crate::post_processing::{
17 CreateOutputContext, CreateOutputOptions, DEFAULT_LICENSEDB_URL_TEMPLATE,
18 apply_license_policy_from_file, apply_package_reference_following, build_facet_rules,
19 collect_top_level_license_detections, collect_top_level_license_references, create_output,
20};
21use crate::progress::{ProgressMode, ScanProgress, format_default_scan_error};
22use crate::scan_result_shaping::{
23 SelectedPath, apply_cli_path_selection_filter, apply_ignore_resource_filter, apply_mark_source,
24 apply_only_findings_filter, apply_user_path_filters_to_collected, filter_redundant_clues,
25 filter_redundant_clues_with_rules, load_and_merge_json_inputs, normalize_paths,
26 normalize_top_level_output_paths, populate_info_resource_counts,
27 prepare_filter_clue_rule_lookup, resolve_native_scan_inputs, resolve_paths_file_entries,
28 trim_preloaded_assembly_to_files,
29};
30use crate::scanner::{
31 CollectionFrontier, LicenseScanOptions, TextDetectionOptions, collect_paths,
32 collect_selected_paths, process_collected_with_memory_limit,
33 process_collected_with_memory_limit_sequential, scan_options_fingerprint,
34};
35use crate::time::format_scancode_timestamp;
36use crate::utils::hash::calculate_sha256;
37use anyhow::{Result, anyhow};
38use chrono::Utc;
39use clap::Parser;
40use regex::Regex;
41use std::collections::{BTreeMap, HashMap};
42use std::env;
43use std::fs;
44use std::io::Read;
45use std::path::{Path, PathBuf};
46use std::sync::Arc;
47use std::time::Instant;
48
49pub fn run() -> Result<()> {
50 #[cfg(feature = "golden-tests")]
51 touch_license_golden_symbols();
52
53 let cli = Cli::parse();
54
55 validate_scan_option_compatibility(&cli)?;
56
57 if cli.show_attribution {
58 print!("{}", include_str!("../../../NOTICE"));
59 return Ok(());
60 }
61
62 if let Some(export_dir) = cli.export_license_dataset.as_deref() {
63 export_embedded_license_dataset(Path::new(export_dir))?;
64 return Ok(());
65 }
66
67 let start_time = Utc::now();
68 let progress = Arc::new(ScanProgress::new(progress_mode_from_cli(&cli)));
69 progress.set_processes(cli.processes);
70 progress.set_scan_names(configured_scan_names(&cli));
71 progress.init_logging_bridge();
72 let mut shared_license_cache_config: Option<LicenseCacheConfig> = None;
73
74 progress.start_setup();
75 let facet_rules = build_facet_rules(&cli.facet)?;
76
77 let ignore_author_patterns = compile_regex_patterns("--ignore-author", &cli.ignore_author)?;
78 let ignore_copyright_holder_patterns =
79 compile_regex_patterns("--ignore-copyright-holder", &cli.ignore_copyright_holder)?;
80 progress.finish_setup();
81
82 progress.start_discovery();
83
84 let mut shared_cache_config = if cli.from_json {
85 let cache_config = prepare_cache_config(None, &cli)?;
86 shared_license_cache_config = Some(build_license_cache_config(&cache_config, &cli));
87 Some(cache_config)
88 } else {
89 None
90 };
91
92 let (
93 mut scan_result,
94 total_dirs,
95 mut preloaded_assembly,
96 preloaded_license_detections,
97 preloaded_license_references,
98 preloaded_license_rule_references,
99 preloaded_extra_errors,
100 extra_warnings,
101 imported_spdx_license_list_version,
102 imported_license_index_provenance,
103 mut active_license_engine,
104 ) = if cli.from_json {
105 let loaded = load_and_merge_json_inputs(&cli.dir_path, cli.strip_root, cli.full_root)?;
106 let directories_count = loaded.directory_count();
107 let files_count = loaded.file_count();
108 let size_count = loaded.file_size_count();
109 progress.finish_discovery(
110 files_count,
111 directories_count,
112 size_count,
113 loaded.excluded_count,
114 );
115 let (
116 process_result,
117 assembly_result,
118 license_detections,
119 license_references,
120 license_rule_references,
121 extra_errors,
122 imported_spdx_license_list_version,
123 imported_license_index_provenance,
124 ) = loaded.into_parts()?;
125 (
126 process_result,
127 directories_count,
128 assembly_result,
129 license_detections,
130 license_references,
131 license_rule_references,
132 extra_errors,
133 Vec::new(),
134 imported_spdx_license_list_version,
135 imported_license_index_provenance,
136 None,
137 )
138 } else {
139 let NativeScanSelection {
140 scan_path,
141 selected_paths,
142 collection_frontier,
143 missing_entries: missing_paths_file_entries,
144 } = resolve_native_scan_selection(&cli)?;
145 let paths_file_warnings = build_paths_file_warning_messages(&missing_paths_file_entries);
146 for warning in &paths_file_warnings {
147 progress.output_written(warning);
148 }
149
150 let cache_config = prepare_cache_config(Some(Path::new(&scan_path)), &cli)?;
151 shared_license_cache_config = Some(build_license_cache_config(&cache_config, &cli));
152 shared_cache_config = Some(cache_config.clone());
153 let collection_exclude_patterns =
154 build_collection_exclude_patterns(Path::new(&scan_path), cache_config.root_dir());
155
156 let mut collected = if cli.paths_file.is_empty() {
157 collect_paths(&scan_path, cli.max_depth, &collection_exclude_patterns)
158 } else {
159 collect_selected_paths(
160 Path::new(&scan_path),
161 &collection_frontier,
162 cli.max_depth,
163 &collection_exclude_patterns,
164 )
165 };
166 let user_excluded_count = apply_user_path_filters_to_collected(
167 &mut collected,
168 Path::new(&scan_path),
169 &selected_paths,
170 &cli.include,
171 &cli.exclude,
172 );
173 let total_files = collected.file_count();
174 let total_dirs = collected.directory_count();
175 let total_size = collected.total_file_bytes;
176 let excluded_count = collected.excluded_count + user_excluded_count;
177 let all_collected_files = collected.files.clone();
178 let ordered_file_paths: Vec<PathBuf> = collected
179 .files
180 .iter()
181 .map(|(path, _)| path.clone())
182 .collect();
183 let runtime_errors = collected
184 .collection_errors
185 .iter()
186 .map(|(path, err)| format_default_scan_error(path, err))
187 .collect();
188 for (path, err) in &collected.collection_errors {
189 progress.record_runtime_error(path, err);
190 }
191 progress.finish_discovery(total_files, total_dirs, total_size, excluded_count);
192 if !cli.quiet {
193 progress.output_written(&format!(
194 "Found {} files in {} directories ({} items excluded)",
195 total_files, total_dirs, excluded_count
196 ));
197 }
198
199 let license_engine = if cli.license {
200 progress.start_setup();
201 progress.start_license_detection_engine_creation();
202 let engine = init_license_engine(
203 shared_cache_config
204 .as_ref()
205 .expect("cache config should be prepared before license engine init"),
206 &cli,
207 )?;
208 progress.finish_license_detection_engine_creation("setup_scan:licenses");
209 progress.finish_setup();
210 progress.output_written(&describe_license_engine_source(
211 &engine,
212 cli.license_dataset_path.as_deref(),
213 ));
214 Some(engine)
215 } else {
216 None
217 };
218
219 let enable_application_packages = cli.package || cli.package_only;
220 let enable_system_packages = cli.system_package || cli.package_only;
221 let enable_packages =
222 enable_application_packages || enable_system_packages || cli.package_in_compiled;
223 let (detect_copyrights, detect_emails, detect_urls, detect_generated) = if cli.package_only
224 {
225 (false, cli.email, cli.url, cli.generated)
226 } else {
227 (cli.copyright, cli.email, cli.url, cli.generated)
228 };
229 let process_mode = cli.processes;
230
231 let text_options = TextDetectionOptions {
232 collect_info: cli.info,
233 detect_packages: enable_packages,
234 detect_application_packages: enable_application_packages,
235 detect_system_packages: enable_system_packages,
236 detect_packages_in_compiled: cli.package_in_compiled,
237 detect_copyrights,
238 detect_generated,
239 detect_emails,
240 detect_urls,
241 max_emails: cli.max_email,
242 max_urls: cli.max_url,
243 timeout_seconds: effective_timeout_seconds(process_mode, cli.timeout),
244 };
245
246 let license_options = LicenseScanOptions {
247 include_text: cli.license_text,
248 include_text_diagnostics: cli.license_text_diagnostics,
249 include_diagnostics: cli.license_diagnostics,
250 unknown_licenses: cli.unknown_licenses,
251 min_score: cli.license_score,
252 };
253 let options_fingerprint =
254 scan_options_fingerprint(&text_options, license_options, license_engine.as_deref());
255
256 if cli.incremental {
257 let manifest_path = incremental_manifest_path(
258 cache_config.root_dir(),
259 &incremental_manifest_key(Path::new(&scan_path), &options_fingerprint),
260 );
261 let previous_manifest =
262 load_incremental_manifest(&manifest_path, &options_fingerprint)?;
263 let reused_files = partition_incremental_files(
264 &mut collected.files,
265 Path::new(&scan_path),
266 previous_manifest.as_ref(),
267 );
268 progress.record_incremental_reused(reused_files.len());
269 }
270
271 if let Some(message) = process_mode_message(process_mode) {
272 progress.output_written(message);
273 }
274 progress.start_scan(collected.file_count());
275 let mut result = match process_mode {
276 ProcessMode::Parallel(thread_count) => run_with_thread_pool(thread_count, || {
277 Ok(process_collected_with_memory_limit(
278 &collected,
279 Arc::clone(&progress),
280 license_engine.clone(),
281 license_options,
282 &text_options,
283 cli.max_in_memory,
284 ))
285 })?,
286 ProcessMode::SequentialWithTimeouts | ProcessMode::SequentialWithoutTimeouts => {
287 process_collected_with_memory_limit_sequential(
288 &collected,
289 Arc::clone(&progress),
290 license_engine.clone(),
291 license_options,
292 &text_options,
293 cli.max_in_memory,
294 )
295 }
296 };
297
298 if cli.incremental {
299 let manifest_path = incremental_manifest_path(
300 cache_config.root_dir(),
301 &incremental_manifest_key(Path::new(&scan_path), &options_fingerprint),
302 );
303 let reused_files = partition_incremental_files(
304 &mut all_collected_files.clone(),
305 Path::new(&scan_path),
306 load_incremental_manifest(&manifest_path, &options_fingerprint)?.as_ref(),
307 );
308 result.files =
309 merge_incremental_file_results(result.files, reused_files, &ordered_file_paths);
310
311 let manifest = build_incremental_manifest(
312 Path::new(&scan_path),
313 &all_collected_files,
314 &result.files,
315 &options_fingerprint,
316 );
317 write_incremental_manifest(cache_config.root_dir(), &manifest_path, &manifest)?;
318 }
319
320 result.excluded_count = excluded_count;
321 progress.finish_scan();
322
323 (
324 result,
325 total_dirs,
326 assembly::AssemblyResult {
327 packages: Vec::new(),
328 dependencies: Vec::new(),
329 },
330 Vec::new(),
331 Vec::new(),
332 Vec::new(),
333 runtime_errors,
334 paths_file_warnings,
335 None,
336 None,
337 license_engine,
338 )
339 };
340
341 progress.start_post_scan();
342
343 if cli.filter_clues {
344 progress.post_scan_step("Filtering redundant clues...");
345 let clue_rule_lookup = record_detail_timing(&progress, "post-scan:filter-clues", || {
346 prepare_filter_clue_rule_lookup(
347 &scan_result.files,
348 active_license_engine.as_deref(),
349 cli.license_dataset_path.as_deref(),
350 shared_license_cache_config.as_ref(),
351 )
352 })?;
353 if let Some(clue_rule_lookup) = clue_rule_lookup.as_ref() {
354 filter_redundant_clues_with_rules(&mut scan_result.files, Some(clue_rule_lookup));
355 } else {
356 filter_redundant_clues(&mut scan_result.files);
357 }
358 }
359
360 if !ignore_author_patterns.is_empty() || !ignore_copyright_holder_patterns.is_empty() {
361 progress.post_scan_step("Applying ignore-resource filters...");
362 record_detail_timing(&progress, "post-scan:ignore-resource", || {
363 apply_ignore_resource_filter(
364 &mut scan_result.files,
365 &ignore_copyright_holder_patterns,
366 &ignore_author_patterns,
367 );
368 });
369 }
370
371 if cli.from_json && (!cli.include.is_empty() || !cli.exclude.is_empty()) {
372 progress.post_scan_step("Applying path selection filters...");
373 record_detail_timing(&progress, "output-filter:path-selection", || {
374 apply_cli_path_selection_filter(&mut scan_result.files, &cli.include, &cli.exclude);
375 });
376 }
377
378 if cli.only_findings {
379 progress.post_scan_step("Filtering to resources with findings...");
380 record_detail_timing(&progress, "output-filter:only-findings", || {
381 apply_only_findings_filter(&mut scan_result.files);
382 });
383 }
384
385 if cli.info && cli.mark_source {
386 progress.post_scan_step("Marking source files...");
387 record_detail_timing(&progress, "post-scan:mark-source", || {
388 apply_mark_source(&mut scan_result.files);
389 });
390 }
391
392 if should_include_info_surface(&scan_result.files, &cli) {
393 progress.post_scan_step("Populating info resource counts...");
394 record_detail_timing(&progress, "post-scan:info-resource-counts", || {
395 populate_info_resource_counts(&mut scan_result.files);
396 });
397 }
398
399 progress.post_scan_step("Backfilling license provenance...");
400 record_detail_timing(&progress, "post-scan:license-provenance", || {
401 for file in &mut scan_result.files {
402 file.backfill_license_provenance();
403 }
404 });
405
406 if cli.from_json {
407 for err in &preloaded_extra_errors {
408 progress.record_additional_error(err);
409 }
410 }
411
412 let mut extra_errors = preloaded_extra_errors;
413 if let Some(policy_path) = cli.license_policy.as_deref() {
414 progress.post_scan_step("Applying license policy...");
415 let license_policy_errors =
416 record_detail_timing(&progress, "post-scan:license-policy", || {
417 apply_license_policy_from_file(&mut scan_result.files, Path::new(policy_path))
418 })?;
419 for err in &license_policy_errors {
420 progress.record_additional_error(err);
421 }
422 extra_errors.extend(license_policy_errors);
423 }
424
425 if cli.from_json {
426 progress.post_scan_step("Trimming preloaded assembly to filtered files...");
427 record_detail_timing(&progress, "post-scan:trim-preloaded-assembly", || {
428 trim_preloaded_assembly_to_files(
429 &scan_result.files,
430 &mut preloaded_assembly.packages,
431 &mut preloaded_assembly.dependencies,
432 );
433 });
434 }
435
436 progress.finish_post_scan();
437
438 let manifests_seen = scan_result
439 .files
440 .iter()
441 .map(|file| file.package_data.len())
442 .sum();
443 let skip_assembly = cli.no_assemble || cli.package_only;
444
445 let mut assembly_result = if skip_assembly {
446 assembly::AssemblyResult {
447 packages: Vec::new(),
448 dependencies: Vec::new(),
449 }
450 } else {
451 progress.start_assembly();
452
453 let mut result = if cli.from_json
454 && (!preloaded_assembly.packages.is_empty()
455 || !preloaded_assembly.dependencies.is_empty())
456 {
457 progress.assembly_step("Using preloaded assembly...");
458 preloaded_assembly
459 } else {
460 assembly::assemble(&mut scan_result.files)
461 };
462
463 progress.assembly_step("Backfilling package license provenance...");
464 record_detail_timing(&progress, "assembly:package-license-provenance", || {
465 for package in &mut result.packages {
466 package.backfill_license_provenance();
467 }
468 });
469
470 progress.assembly_step("Applying package reference following...");
471 record_detail_timing(&progress, "assembly:package-reference-following", || {
472 apply_package_reference_following(&mut scan_result.files, &mut result.packages);
473 });
474
475 progress.finish_assembly(result.packages.len(), manifests_seen);
476 result
477 };
478
479 progress.start_finalize();
480
481 if !cli.from_json && (cli.strip_root || cli.full_root) {
482 let root_path = cli
483 .dir_path
484 .first()
485 .ok_or_else(|| anyhow!("No input path available for path normalization"))?;
486 progress.finalize_step("Normalizing paths...");
487 record_detail_timing(&progress, "finalize:path-normalization", || {
488 normalize_paths(
489 &mut scan_result.files,
490 root_path,
491 cli.strip_root,
492 cli.full_root,
493 );
494 normalize_top_level_output_paths(
495 &mut assembly_result.packages,
496 &mut assembly_result.dependencies,
497 root_path,
498 cli.strip_root,
499 );
500 });
501 }
502
503 progress.finalize_step("Collecting license detections...");
504 let license_detections = record_detail_timing(&progress, "finalize:license-detections", || {
505 let preserve_preloaded_top_level_detections = cli.from_json
506 && (cli.only_findings || !cli.include.is_empty() || !cli.exclude.is_empty());
507 collect_top_level_license_detections_for_mode(
508 &scan_result.files,
509 preloaded_license_detections,
510 preserve_preloaded_top_level_detections,
511 cli.from_json && cli.dir_path.len() > 1,
512 )
513 });
514
515 let should_recompute_license_references = cli.from_json
516 && (!preloaded_license_references.is_empty()
517 || !preloaded_license_rule_references.is_empty()
518 || cli.license_references
519 || (cli.license_url_template != DEFAULT_LICENSEDB_URL_TEMPLATE
520 && !preloaded_license_references.is_empty()));
521
522 if should_recompute_license_references && active_license_engine.is_none() {
523 progress.start_license_detection_engine_creation();
524 active_license_engine = Some(init_license_engine(
525 shared_cache_config
526 .as_ref()
527 .expect("cache config should be prepared before license engine init"),
528 &cli,
529 )?);
530 progress.finish_license_detection_engine_creation("finalize:license-engine-creation");
531 }
532
533 progress.finalize_step("Collecting license references...");
534 let (license_references, license_rule_references) =
535 record_detail_timing(&progress, "finalize:license-references", || {
536 if cli.from_json && !should_recompute_license_references {
537 (
538 preloaded_license_references,
539 preloaded_license_rule_references,
540 )
541 } else if cli.license_references || should_recompute_license_references {
542 if let Some(engine) = active_license_engine.as_deref() {
543 collect_top_level_license_references(
544 &scan_result.files,
545 &assembly_result.packages,
546 engine.index(),
547 &cli.license_url_template,
548 )
549 } else {
550 (Vec::new(), Vec::new())
551 }
552 } else {
553 (Vec::new(), Vec::new())
554 }
555 });
556
557 let end_time = Utc::now();
558 let spdx_license_list_version = active_license_engine
559 .as_ref()
560 .and_then(|engine| engine.spdx_license_list_version().map(ToOwned::to_owned))
561 .or(imported_spdx_license_list_version)
562 .unwrap_or(LicenseDetectionEngine::embedded_spdx_license_list_version()?);
563 let license_index_provenance = active_license_engine
564 .as_ref()
565 .and_then(|engine| engine.license_index_provenance().cloned())
566 .or(imported_license_index_provenance);
567
568 progress.finalize_step("Preparing output...");
569 let output = record_detail_timing(&progress, "finalize:output-prepare", || {
570 create_output(
571 start_time,
572 end_time,
573 scan_result,
574 CreateOutputContext {
575 total_dirs,
576 assembly_result,
577 license_detections,
578 license_references,
579 license_rule_references,
580 spdx_license_list_version,
581 license_index_provenance,
582 extra_errors,
583 extra_warnings,
584 header_options: cli.output_header_options(),
585 options: CreateOutputOptions {
586 facet_rules: &facet_rules,
587 include_classify: cli.classify,
588 include_summary: cli.summary,
589 include_license_clarity_score: cli.license_clarity_score,
590 include_tallies: cli.tallies,
591 include_tallies_of_key_files: cli.tallies_key_files,
592 include_tallies_with_details: cli.tallies_with_details,
593 include_tallies_by_facet: cli.tallies_by_facet,
594 include_generated: cli.generated,
595 verbose: cli.verbose,
596 },
597 },
598 )
599 });
600 progress.finish_finalize();
601
602 let output_schema_output = crate::output_schema::Output::from(&output);
603 progress.start_output();
604 for target in cli.output_targets() {
605 let output_config = OutputWriteConfig {
606 format: target.format,
607 custom_template: target.custom_template.clone(),
608 scanned_path: if cli.dir_path.len() == 1 {
609 cli.dir_path.first().cloned()
610 } else {
611 None
612 },
613 };
614
615 let timing_name = format!("output:{:?}", target.format).to_lowercase();
616 record_detail_timing(&progress, timing_name, || {
617 write_output_file(&target.file, &output_schema_output, &output_config)
618 })?;
619 progress.output_written(&format!(
620 "{:?} output written to {}",
621 target.format, target.file
622 ));
623 }
624 progress.record_final_counts(&output.files);
625 progress.record_final_header_counts(&output.headers);
626 progress.finish_output();
627
628 let summary_end = Utc::now();
629 progress.display_summary(
630 &format_scancode_timestamp(&start_time),
631 &format_scancode_timestamp(&summary_end),
632 );
633
634 Ok(())
635}
636
637fn collect_top_level_license_detections_for_mode(
638 files: &[FileInfo],
639 preloaded: Vec<crate::models::TopLevelLicenseDetection>,
640 preserve_preloaded: bool,
641 clear_for_multi_input_replay: bool,
642) -> Vec<crate::models::TopLevelLicenseDetection> {
643 if clear_for_multi_input_replay {
644 Vec::new()
645 } else if preserve_preloaded {
646 preloaded
647 } else {
648 collect_top_level_license_detections(files)
649 }
650}
651
/// References the golden-test helper functions without calling them.
///
/// Only compiled with the `golden-tests` feature; each `let _ = ...` merely
/// names the function item.
#[cfg(feature = "golden-tests")]
fn touch_license_golden_symbols() {
    use crate::license_detection::golden_utils::{
        detect_license_expressions_for_golden, detect_matches_for_golden,
        read_golden_input_content,
    };

    let _ = read_golden_input_content;
    let _ = detect_matches_for_golden;
    let _ = detect_license_expressions_for_golden;
    let _ = crate::license_detection::LicenseDetectionEngine::detect_matches_with_kind;
}
659
/// Resolved inputs for a native (non `--from-json`) scan, as produced by
/// `resolve_native_scan_selection`.
#[derive(Debug)]
struct NativeScanSelection {
    /// Root path handed to path collection and used as the scan root.
    scan_path: String,
    /// Selected paths forwarded to the user path filters after collection.
    selected_paths: Vec<SelectedPath>,
    /// Frontier passed to `collect_selected_paths`; empty when no
    /// `--paths-file` was given.
    collection_frontier: Vec<CollectionFrontier>,
    /// `--paths-file` entries reported as missing; empty when no
    /// `--paths-file` was given.
    missing_entries: Vec<String>,
}
667
668fn resolve_native_scan_selection(cli: &Cli) -> Result<NativeScanSelection> {
669 if cli.paths_file.is_empty() {
670 let (scan_path, selected_paths) = resolve_native_scan_inputs(&cli.dir_path)?;
671 return Ok(NativeScanSelection {
672 scan_path,
673 selected_paths,
674 collection_frontier: Vec::new(),
675 missing_entries: Vec::new(),
676 });
677 }
678
679 let scan_path = cli
680 .dir_path
681 .first()
682 .cloned()
683 .ok_or_else(|| anyhow!("--paths-file requires one positional scan root"))?;
684 let path_file_entries = load_paths_file_entries(&cli.paths_file)?;
685 let resolved = resolve_paths_file_entries(Path::new(&scan_path), &path_file_entries)?;
686 if resolved.selections.is_empty() {
687 return Err(anyhow!(
688 "--paths-file did not resolve to any existing files or directories under {:?}",
689 Path::new(&scan_path)
690 ));
691 }
692
693 Ok(NativeScanSelection {
694 scan_path,
695 selected_paths: resolved.selections,
696 collection_frontier: resolved.frontier,
697 missing_entries: resolved.missing_entries,
698 })
699}
700
701fn load_paths_file_entries(paths_files: &[String]) -> Result<Vec<String>> {
702 let mut entries = Vec::new();
703 for paths_file in paths_files {
704 let content = read_paths_file_content(paths_file)?;
705 entries.extend(content.lines().map(ToOwned::to_owned));
706 }
707 Ok(entries)
708}
709
710fn read_paths_file_content(paths_file: &str) -> Result<String> {
711 if paths_file == "-" {
712 let mut content = String::new();
713 std::io::stdin()
714 .read_to_string(&mut content)
715 .map_err(|err| anyhow!("Failed to read --paths-file from stdin: {err}"))?;
716 return Ok(content);
717 }
718
719 fs::read_to_string(paths_file)
720 .map_err(|err| anyhow!("Failed to read --paths-file {:?}: {err}", paths_file))
721}
722
/// Produces one warning line per `--paths-file` entry that could not be found,
/// in the same order as `missing_entries`.
fn build_paths_file_warning_messages(missing_entries: &[String]) -> Vec<String> {
    let mut warnings = Vec::with_capacity(missing_entries.len());
    for missing in missing_entries {
        warnings.push(format!("Skipping missing --paths-file entry: {missing}"));
    }
    warnings
}
729
730fn validate_scan_option_compatibility(cli: &Cli) -> Result<()> {
731 if cli.show_attribution {
732 return Ok(());
733 }
734
735 if cli.export_license_dataset.is_some() {
736 if !cli.dir_path.is_empty() || !cli.paths_file.is_empty() {
737 return Err(anyhow!(
738 "--export-license-dataset does not accept scan input paths or --paths-file"
739 ));
740 }
741
742 if cli.from_json
743 || cli.license
744 || cli.package
745 || cli.system_package
746 || cli.package_in_compiled
747 || cli.package_only
748 || cli.copyright
749 || cli.email
750 || cli.url
751 || cli.generated
752 || cli.info
753 || cli.incremental
754 || cli.reindex
755 || cli.no_license_index_cache
756 || cli.license_dataset_path.is_some()
757 {
758 return Err(anyhow!(
759 "--export-license-dataset is a standalone mode and cannot be combined with scan or license-index flags"
760 ));
761 }
762
763 return Ok(());
764 }
765
766 if cli.from_json
767 && (cli.package
768 || cli.system_package
769 || cli.package_in_compiled
770 || cli.package_only
771 || cli.copyright
772 || cli.email
773 || cli.url
774 || cli.generated)
775 {
776 return Err(anyhow!(
777 "When using --from-json, file scan options like --package/--copyright/--email/--url/--generated are not allowed"
778 ));
779 }
780
781 if cli.from_json && !cli.paths_file.is_empty() {
782 return Err(anyhow!(
783 "--paths-file is only supported for native scan mode, not --from-json"
784 ));
785 }
786
787 if cli.from_json && cli.incremental {
788 return Err(anyhow!(
789 "--incremental is only supported for directory scan mode, not --from-json"
790 ));
791 }
792
793 if !cli.paths_file.is_empty() && cli.dir_path.len() != 1 {
794 return Err(anyhow!(
795 "--paths-file requires exactly one positional scan root"
796 ));
797 }
798
799 if !cli.from_json && cli.dir_path.is_empty() {
800 return Err(anyhow!("Directory path is required for scan operations"));
801 }
802
803 if cli.tallies_by_facet && cli.facet.is_empty() {
804 return Err(anyhow!(
805 "--tallies-by-facet requires at least one --facet <facet>=<pattern> definition"
806 ));
807 }
808
809 if cli.mark_source && !cli.info {
810 return Err(anyhow!("--mark-source requires --info"));
811 }
812
813 Ok(())
814}
815
816fn prepare_cache_config(scan_root: Option<&Path>, cli: &Cli) -> Result<CacheConfig> {
817 let env_cache_dir = env::var_os(CACHE_DIR_ENV_VAR).map(PathBuf::from);
818 let config = CacheConfig::from_overrides(
819 scan_root,
820 cli.cache_dir.as_deref().map(Path::new),
821 env_cache_dir.as_deref(),
822 cli.incremental,
823 );
824
825 if cli.cache_clear {
826 crate::cache::locking::with_exclusive_cache_lock(config.root_dir(), || {
827 config.clear_contents()
828 })?;
829 }
830
831 if config.incremental_enabled() {
832 config.ensure_dirs()?;
833 }
834
835 Ok(config)
836}
837
838fn build_license_cache_config(cache_root: &CacheConfig, cli: &Cli) -> LicenseCacheConfig {
839 LicenseCacheConfig::new(
840 cache_root.root_dir().to_path_buf(),
841 cli.reindex,
842 !cli.no_license_index_cache,
843 )
844}
845
846fn partition_incremental_files(
847 collected_files: &mut Vec<(PathBuf, fs::Metadata)>,
848 scan_root: &Path,
849 manifest: Option<&IncrementalManifest>,
850) -> Vec<FileInfo> {
851 let Some(manifest) = manifest else {
852 return Vec::new();
853 };
854
855 let mut files_to_scan = Vec::new();
856 let mut reused_files = Vec::new();
857
858 for (path, metadata) in collected_files.drain(..) {
859 let relative_path = normalize_relative_scan_path(&path, scan_root);
860 let Some(entry) = manifest.entry(&relative_path) else {
861 files_to_scan.push((path, metadata));
862 continue;
863 };
864
865 match manifest_entry_matches_path(entry, &path, &metadata) {
866 Ok(true) => reused_files.push(entry.file_info.clone()),
867 Ok(false) | Err(_) => files_to_scan.push((path, metadata)),
868 }
869 }
870
871 *collected_files = files_to_scan;
872 reused_files
873}
874
875fn merge_incremental_file_results(
876 processed_files: Vec<FileInfo>,
877 reused_files: Vec<FileInfo>,
878 ordered_file_paths: &[PathBuf],
879) -> Vec<FileInfo> {
880 let mut processed_file_entries = HashMap::new();
881 let mut directory_entries = Vec::new();
882 for file in processed_files {
883 if file.file_type == FileType::File {
884 processed_file_entries.insert(file.path.clone(), file);
885 } else {
886 directory_entries.push(file);
887 }
888 }
889
890 let mut reused_file_entries: HashMap<_, _> = reused_files
891 .into_iter()
892 .map(|file| (file.path.clone(), file))
893 .collect();
894
895 let mut merged_files = Vec::new();
896 for path in ordered_file_paths {
897 let path_string = path.to_string_lossy().to_string();
898 if let Some(file) = processed_file_entries.remove(&path_string) {
899 merged_files.push(file);
900 continue;
901 }
902
903 if let Some(file) = reused_file_entries.remove(&path_string) {
904 merged_files.push(file);
905 }
906 }
907
908 merged_files.extend(processed_file_entries.into_values());
909 merged_files.extend(reused_file_entries.into_values());
910 merged_files.extend(directory_entries);
911 merged_files
912}
913
914fn build_incremental_manifest(
915 scan_root: &Path,
916 collected_files: &[(PathBuf, fs::Metadata)],
917 files: &[FileInfo],
918 options_fingerprint: &str,
919) -> IncrementalManifest {
920 let files_by_relative_path: HashMap<_, _> = files
921 .iter()
922 .filter(|file| file.file_type == FileType::File)
923 .map(|file| {
924 (
925 normalize_relative_scan_path(Path::new(&file.path), scan_root),
926 file.clone(),
927 )
928 })
929 .collect();
930
931 let entries = collected_files
932 .iter()
933 .filter_map(|(path, metadata)| {
934 let relative_path = normalize_relative_scan_path(path, scan_root);
935 let state = metadata_fingerprint(metadata)?;
936 let file_info = files_by_relative_path.get(&relative_path)?.clone();
937 let content_sha256 = file_info.sha256.unwrap_or_else(|| {
938 fs::read(path)
939 .map(|bytes| calculate_sha256(&bytes))
940 .unwrap_or_else(|_| {
941 Sha256Digest::from_hex(
942 "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
943 )
944 .unwrap()
945 })
946 });
947 Some((
948 relative_path,
949 IncrementalManifestEntry {
950 state,
951 content_sha256,
952 file_info,
953 },
954 ))
955 })
956 .collect::<BTreeMap<_, _>>();
957
958 IncrementalManifest::new(options_fingerprint.to_string(), entries)
959}
960
961fn incremental_manifest_key(scan_root: &Path, options_fingerprint: &str) -> String {
962 let canonical_root = fs::canonicalize(scan_root).unwrap_or_else(|_| scan_root.to_path_buf());
963 calculate_sha256(
964 format!(
965 "{}\n{options_fingerprint}",
966 canonical_root.to_string_lossy()
967 )
968 .as_bytes(),
969 )
970 .as_hex()
971}
972
/// Expresses `path` relative to `scan_root` using forward slashes.
///
/// When `path` does not start with `scan_root` it is used unchanged. The
/// conversion to text is lossy, and every backslash becomes `/`.
fn normalize_relative_scan_path(path: &Path, scan_root: &Path) -> String {
    let relative = match path.strip_prefix(scan_root) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };
    let as_text = relative.to_string_lossy();
    as_text.replace('\\', "/")
}
979
980fn compile_regex_patterns(option_name: &str, patterns: &[String]) -> Result<Vec<Regex>> {
981 patterns
982 .iter()
983 .map(|pattern| {
984 Regex::new(pattern).map_err(|err| {
985 anyhow!("Invalid regex for {option_name} pattern \"{pattern}\": {err}")
986 })
987 })
988 .collect()
989}
990
991fn effective_timeout_seconds(process_mode: ProcessMode, timeout_seconds: f64) -> f64 {
992 match process_mode {
993 ProcessMode::SequentialWithoutTimeouts => 0.0,
994 ProcessMode::Parallel(_) | ProcessMode::SequentialWithTimeouts => timeout_seconds,
995 }
996}
997
998fn process_mode_message(process_mode: ProcessMode) -> Option<&'static str> {
999 match process_mode {
1000 ProcessMode::SequentialWithTimeouts => Some("Disabling multi-processing for debugging."),
1001 ProcessMode::SequentialWithoutTimeouts => {
1002 Some("Disabling multi-processing and multi-threading for debugging.")
1003 }
1004 ProcessMode::Parallel(_) => None,
1005 }
1006}
1007
1008fn progress_mode_from_cli(cli: &Cli) -> ProgressMode {
1009 if cli.quiet {
1010 ProgressMode::Quiet
1011 } else if cli.verbose {
1012 ProgressMode::Verbose
1013 } else {
1014 ProgressMode::Default
1015 }
1016}
1017
1018fn configured_scan_names(cli: &Cli) -> String {
1019 let mut names = Vec::new();
1020 if cli.license {
1021 names.push("licenses");
1022 }
1023 if cli.info {
1024 names.push("info");
1025 }
1026 if cli.package {
1027 names.push("packages");
1028 }
1029 if (cli.system_package || cli.package_in_compiled || cli.package_only)
1030 && !names.contains(&"packages")
1031 {
1032 names.push("packages");
1033 }
1034 if cli.copyright {
1035 names.push("copyrights");
1036 }
1037 if cli.email {
1038 names.push("emails");
1039 }
1040 if cli.url {
1041 names.push("urls");
1042 }
1043 names.join(", ")
1044}
1045
1046fn should_include_info_surface(files: &[crate::models::FileInfo], cli: &Cli) -> bool {
1047 cli.info
1048 || files.iter().any(|file| {
1049 file.date.is_some()
1050 || file.sha1.is_some()
1051 || file.md5.is_some()
1052 || file.sha256.is_some()
1053 || file.sha1_git.is_some()
1054 || file.mime_type.is_some()
1055 || file.file_type_label.is_some()
1056 || file.programming_language.is_some()
1057 || file.is_binary.is_some()
1058 || file.is_text.is_some()
1059 || file.is_archive.is_some()
1060 || file.is_media.is_some()
1061 || file.is_source.is_some()
1062 || file.is_script.is_some()
1063 || file.files_count.is_some()
1064 || file.dirs_count.is_some()
1065 || file.size_count.is_some()
1066 })
1067}
1068
1069fn record_detail_timing<T, F>(progress: &Arc<ScanProgress>, name: impl Into<String>, f: F) -> T
1070where
1071 F: FnOnce() -> T,
1072{
1073 let started = Instant::now();
1074 let result = f();
1075 progress.record_detail_timing(name.into(), started.elapsed().as_secs_f64());
1076 result
1077}
1078
1079fn run_with_thread_pool<T, F>(threads: usize, f: F) -> Result<T>
1080where
1081 F: FnOnce() -> Result<T> + Send,
1082 T: Send,
1083{
1084 let pool = rayon::ThreadPoolBuilder::new()
1085 .num_threads(threads.max(1))
1086 .build()?;
1087 pool.install(f)
1088}
1089
1090fn init_license_engine(cache_root: &CacheConfig, cli: &Cli) -> Result<Arc<LicenseDetectionEngine>> {
1091 let cache_config = build_license_cache_config(cache_root, cli);
1092
1093 match &cli.license_dataset_path {
1094 Some(p) => {
1095 let path = PathBuf::from(p);
1096 if !path.exists() {
1097 return Err(anyhow!("License dataset path does not exist: {:?}", path));
1098 }
1099 let engine = LicenseDetectionEngine::from_directory_with_cache(&path, &cache_config)?;
1100 Ok(Arc::new(engine))
1101 }
1102 None => {
1103 let engine = LicenseDetectionEngine::from_embedded_with_cache(&cache_config)?;
1104 Ok(Arc::new(engine))
1105 }
1106 }
1107}
1108
1109fn describe_license_engine_source(
1110 engine: &LicenseDetectionEngine,
1111 rules_path: Option<&str>,
1112) -> String {
1113 match rules_path {
1114 Some(path) => format!(
1115 "License detection engine initialized with {} rules from custom dataset {}",
1116 engine.index().rules_by_rid.len(),
1117 path
1118 ),
1119 None => format!(
1120 "License detection engine initialized with {} rules from embedded artifact",
1121 engine.index().rules_by_rid.len()
1122 ),
1123 }
1124}
1125
1126#[cfg(test)]
1127mod tests;