1use crate::assembly;
5use crate::cache::{
6 CACHE_DIR_ENV_VAR, CacheConfig, IncrementalManifest, IncrementalManifestEntry,
7 build_collection_exclude_patterns, incremental_manifest_path, load_incremental_manifest,
8 manifest_entry_matches_path, metadata_fingerprint, write_incremental_manifest,
9};
10use crate::cli::{Cli, ProcessMode};
11use crate::license_detection::LicenseDetectionEngine;
12use crate::license_detection::dataset::export_embedded_license_dataset;
13use crate::license_detection::license_cache::LicenseCacheConfig;
14use crate::models::{FileInfo, FileType, Sha256Digest};
15use crate::output::{OutputWriteConfig, write_output_file};
16use crate::post_processing::{
17 CreateOutputContext, CreateOutputOptions, DEFAULT_LICENSEDB_URL_TEMPLATE,
18 apply_license_policy_from_file, apply_package_reference_following, build_facet_rules,
19 collect_top_level_license_detections, collect_top_level_license_references, create_output,
20};
21use crate::progress::{ProgressMode, ScanProgress, format_default_scan_error};
22use crate::scan_result_shaping::{
23 SelectedPath, apply_cli_path_selection_filter, apply_ignore_resource_filter, apply_mark_source,
24 apply_only_findings_filter, apply_user_path_filters_to_collected, filter_redundant_clues,
25 filter_redundant_clues_with_rules, load_and_merge_json_inputs, normalize_paths,
26 normalize_top_level_output_paths, populate_info_resource_counts,
27 prepare_filter_clue_rule_lookup, resolve_native_scan_inputs, resolve_paths_file_entries,
28 trim_preloaded_assembly_to_files,
29};
30use crate::scanner::{
31 LicenseScanOptions, TextDetectionOptions, collect_paths, process_collected_with_memory_limit,
32 process_collected_with_memory_limit_sequential, scan_options_fingerprint,
33};
34use crate::time::format_scancode_timestamp;
35use crate::utils::hash::calculate_sha256;
36use anyhow::{Result, anyhow};
37use chrono::Utc;
38use clap::Parser;
39use regex::Regex;
40use std::collections::{BTreeMap, HashMap};
41use std::env;
42use std::fs;
43use std::io::Read;
44use std::path::{Path, PathBuf};
45use std::sync::Arc;
46use std::time::Instant;
47
48pub fn run() -> Result<()> {
49 #[cfg(feature = "golden-tests")]
50 touch_license_golden_symbols();
51
52 let cli = Cli::parse();
53
54 validate_scan_option_compatibility(&cli)?;
55
56 if cli.show_attribution {
57 print!("{}", include_str!("../../../NOTICE"));
58 return Ok(());
59 }
60
61 if let Some(export_dir) = cli.export_license_dataset.as_deref() {
62 export_embedded_license_dataset(Path::new(export_dir))?;
63 return Ok(());
64 }
65
66 let start_time = Utc::now();
67 let progress = Arc::new(ScanProgress::new(progress_mode_from_cli(&cli)));
68 progress.set_processes(cli.processes);
69 progress.set_scan_names(configured_scan_names(&cli));
70 progress.init_logging_bridge();
71 let mut shared_license_cache_config: Option<LicenseCacheConfig> = None;
72
73 progress.start_setup();
74 let facet_rules = build_facet_rules(&cli.facet)?;
75
76 let ignore_author_patterns = compile_regex_patterns("--ignore-author", &cli.ignore_author)?;
77 let ignore_copyright_holder_patterns =
78 compile_regex_patterns("--ignore-copyright-holder", &cli.ignore_copyright_holder)?;
79 progress.finish_setup();
80
81 progress.start_discovery();
82
83 let mut shared_cache_config = if cli.from_json {
84 let cache_config = prepare_cache_config(None, &cli)?;
85 shared_license_cache_config = Some(build_license_cache_config(&cache_config, &cli));
86 Some(cache_config)
87 } else {
88 None
89 };
90
91 let (
92 mut scan_result,
93 total_dirs,
94 mut preloaded_assembly,
95 preloaded_license_detections,
96 preloaded_license_references,
97 preloaded_license_rule_references,
98 preloaded_extra_errors,
99 extra_warnings,
100 imported_spdx_license_list_version,
101 imported_license_index_provenance,
102 mut active_license_engine,
103 ) = if cli.from_json {
104 let loaded = load_and_merge_json_inputs(&cli.dir_path, cli.strip_root, cli.full_root)?;
105 let directories_count = loaded.directory_count();
106 let files_count = loaded.file_count();
107 let size_count = loaded.file_size_count();
108 progress.finish_discovery(
109 files_count,
110 directories_count,
111 size_count,
112 loaded.excluded_count,
113 );
114 let (
115 process_result,
116 assembly_result,
117 license_detections,
118 license_references,
119 license_rule_references,
120 extra_errors,
121 imported_spdx_license_list_version,
122 imported_license_index_provenance,
123 ) = loaded.into_parts()?;
124 (
125 process_result,
126 directories_count,
127 assembly_result,
128 license_detections,
129 license_references,
130 license_rule_references,
131 extra_errors,
132 Vec::new(),
133 imported_spdx_license_list_version,
134 imported_license_index_provenance,
135 None,
136 )
137 } else {
138 let (scan_path, selected_paths, missing_paths_file_entries) =
139 resolve_native_scan_selection(&cli)?;
140 let paths_file_warnings = build_paths_file_warning_messages(&missing_paths_file_entries);
141 for warning in &paths_file_warnings {
142 progress.output_written(warning);
143 }
144
145 let cache_config = prepare_cache_config(Some(Path::new(&scan_path)), &cli)?;
146 shared_license_cache_config = Some(build_license_cache_config(&cache_config, &cli));
147 shared_cache_config = Some(cache_config.clone());
148 let collection_exclude_patterns =
149 build_collection_exclude_patterns(Path::new(&scan_path), cache_config.root_dir());
150
151 let mut collected = collect_paths(&scan_path, cli.max_depth, &collection_exclude_patterns);
152 let user_excluded_count = apply_user_path_filters_to_collected(
153 &mut collected,
154 Path::new(&scan_path),
155 &selected_paths,
156 &cli.include,
157 &cli.exclude,
158 );
159 let total_files = collected.file_count();
160 let total_dirs = collected.directory_count();
161 let total_size = collected.total_file_bytes;
162 let excluded_count = collected.excluded_count + user_excluded_count;
163 let all_collected_files = collected.files.clone();
164 let ordered_file_paths: Vec<PathBuf> = collected
165 .files
166 .iter()
167 .map(|(path, _)| path.clone())
168 .collect();
169 let runtime_errors = collected
170 .collection_errors
171 .iter()
172 .map(|(path, err)| format_default_scan_error(path, err))
173 .collect();
174 for (path, err) in &collected.collection_errors {
175 progress.record_runtime_error(path, err);
176 }
177 progress.finish_discovery(total_files, total_dirs, total_size, excluded_count);
178 if !cli.quiet {
179 progress.output_written(&format!(
180 "Found {} files in {} directories ({} items excluded)",
181 total_files, total_dirs, excluded_count
182 ));
183 }
184
185 let license_engine = if cli.license {
186 progress.start_setup();
187 progress.start_license_detection_engine_creation();
188 let engine = init_license_engine(
189 shared_cache_config
190 .as_ref()
191 .expect("cache config should be prepared before license engine init"),
192 &cli,
193 )?;
194 progress.finish_license_detection_engine_creation("setup_scan:licenses");
195 progress.finish_setup();
196 progress.output_written(&describe_license_engine_source(
197 &engine,
198 cli.license_dataset_path.as_deref(),
199 ));
200 Some(engine)
201 } else {
202 None
203 };
204
205 let enable_application_packages = cli.package || cli.package_only;
206 let enable_system_packages = cli.system_package || cli.package_only;
207 let enable_packages =
208 enable_application_packages || enable_system_packages || cli.package_in_compiled;
209 let (detect_copyrights, detect_emails, detect_urls, detect_generated) = if cli.package_only
210 {
211 (false, cli.email, cli.url, cli.generated)
212 } else {
213 (cli.copyright, cli.email, cli.url, cli.generated)
214 };
215 let process_mode = cli.processes;
216
217 let text_options = TextDetectionOptions {
218 collect_info: cli.info,
219 detect_packages: enable_packages,
220 detect_application_packages: enable_application_packages,
221 detect_system_packages: enable_system_packages,
222 detect_packages_in_compiled: cli.package_in_compiled,
223 detect_copyrights,
224 detect_generated,
225 detect_emails,
226 detect_urls,
227 max_emails: cli.max_email,
228 max_urls: cli.max_url,
229 timeout_seconds: effective_timeout_seconds(process_mode, cli.timeout),
230 };
231
232 let license_options = LicenseScanOptions {
233 include_text: cli.license_text,
234 include_text_diagnostics: cli.license_text_diagnostics,
235 include_diagnostics: cli.license_diagnostics,
236 unknown_licenses: cli.unknown_licenses,
237 min_score: cli.license_score,
238 };
239 let options_fingerprint =
240 scan_options_fingerprint(&text_options, license_options, license_engine.as_deref());
241
242 if cli.incremental {
243 let manifest_path = incremental_manifest_path(
244 cache_config.root_dir(),
245 &incremental_manifest_key(Path::new(&scan_path), &options_fingerprint),
246 );
247 let previous_manifest =
248 load_incremental_manifest(&manifest_path, &options_fingerprint)?;
249 let reused_files = partition_incremental_files(
250 &mut collected.files,
251 Path::new(&scan_path),
252 previous_manifest.as_ref(),
253 );
254 progress.record_incremental_reused(reused_files.len());
255 }
256
257 if let Some(message) = process_mode_message(process_mode) {
258 progress.output_written(message);
259 }
260 progress.start_scan(collected.file_count());
261 let mut result = match process_mode {
262 ProcessMode::Parallel(thread_count) => run_with_thread_pool(thread_count, || {
263 Ok(process_collected_with_memory_limit(
264 &collected,
265 Arc::clone(&progress),
266 license_engine.clone(),
267 license_options,
268 &text_options,
269 cli.max_in_memory,
270 ))
271 })?,
272 ProcessMode::SequentialWithTimeouts | ProcessMode::SequentialWithoutTimeouts => {
273 process_collected_with_memory_limit_sequential(
274 &collected,
275 Arc::clone(&progress),
276 license_engine.clone(),
277 license_options,
278 &text_options,
279 cli.max_in_memory,
280 )
281 }
282 };
283
284 if cli.incremental {
285 let manifest_path = incremental_manifest_path(
286 cache_config.root_dir(),
287 &incremental_manifest_key(Path::new(&scan_path), &options_fingerprint),
288 );
289 let reused_files = partition_incremental_files(
290 &mut all_collected_files.clone(),
291 Path::new(&scan_path),
292 load_incremental_manifest(&manifest_path, &options_fingerprint)?.as_ref(),
293 );
294 result.files =
295 merge_incremental_file_results(result.files, reused_files, &ordered_file_paths);
296
297 let manifest = build_incremental_manifest(
298 Path::new(&scan_path),
299 &all_collected_files,
300 &result.files,
301 &options_fingerprint,
302 );
303 write_incremental_manifest(cache_config.root_dir(), &manifest_path, &manifest)?;
304 }
305
306 result.excluded_count = excluded_count;
307 progress.finish_scan();
308
309 (
310 result,
311 total_dirs,
312 assembly::AssemblyResult {
313 packages: Vec::new(),
314 dependencies: Vec::new(),
315 },
316 Vec::new(),
317 Vec::new(),
318 Vec::new(),
319 runtime_errors,
320 paths_file_warnings,
321 None,
322 None,
323 license_engine,
324 )
325 };
326
327 progress.start_post_scan();
328
329 if cli.filter_clues {
330 progress.post_scan_step("Filtering redundant clues...");
331 let clue_rule_lookup = record_detail_timing(&progress, "post-scan:filter-clues", || {
332 prepare_filter_clue_rule_lookup(
333 &scan_result.files,
334 active_license_engine.as_deref(),
335 cli.license_dataset_path.as_deref(),
336 shared_license_cache_config.as_ref(),
337 )
338 })?;
339 if let Some(clue_rule_lookup) = clue_rule_lookup.as_ref() {
340 filter_redundant_clues_with_rules(&mut scan_result.files, Some(clue_rule_lookup));
341 } else {
342 filter_redundant_clues(&mut scan_result.files);
343 }
344 }
345
346 if !ignore_author_patterns.is_empty() || !ignore_copyright_holder_patterns.is_empty() {
347 progress.post_scan_step("Applying ignore-resource filters...");
348 record_detail_timing(&progress, "post-scan:ignore-resource", || {
349 apply_ignore_resource_filter(
350 &mut scan_result.files,
351 &ignore_copyright_holder_patterns,
352 &ignore_author_patterns,
353 );
354 });
355 }
356
357 if cli.from_json && (!cli.include.is_empty() || !cli.exclude.is_empty()) {
358 progress.post_scan_step("Applying path selection filters...");
359 record_detail_timing(&progress, "output-filter:path-selection", || {
360 apply_cli_path_selection_filter(&mut scan_result.files, &cli.include, &cli.exclude);
361 });
362 }
363
364 if cli.only_findings {
365 progress.post_scan_step("Filtering to files with findings...");
366 record_detail_timing(&progress, "output-filter:only-findings", || {
367 apply_only_findings_for_mode(&mut scan_result.files, cli.from_json);
368 });
369 }
370
371 if cli.info && cli.mark_source {
372 progress.post_scan_step("Marking source files...");
373 record_detail_timing(&progress, "post-scan:mark-source", || {
374 apply_mark_source(&mut scan_result.files);
375 });
376 }
377
378 if should_include_info_surface(&scan_result.files, &cli) {
379 progress.post_scan_step("Populating info resource counts...");
380 record_detail_timing(&progress, "post-scan:info-resource-counts", || {
381 populate_info_resource_counts(&mut scan_result.files);
382 });
383 }
384
385 progress.post_scan_step("Backfilling license provenance...");
386 record_detail_timing(&progress, "post-scan:license-provenance", || {
387 for file in &mut scan_result.files {
388 file.backfill_license_provenance();
389 }
390 });
391
392 if cli.from_json {
393 for err in &preloaded_extra_errors {
394 progress.record_additional_error(err);
395 }
396 }
397
398 let mut extra_errors = preloaded_extra_errors;
399 if let Some(policy_path) = cli.license_policy.as_deref() {
400 progress.post_scan_step("Applying license policy...");
401 let license_policy_errors =
402 record_detail_timing(&progress, "post-scan:license-policy", || {
403 apply_license_policy_from_file(&mut scan_result.files, Path::new(policy_path))
404 })?;
405 for err in &license_policy_errors {
406 progress.record_additional_error(err);
407 }
408 extra_errors.extend(license_policy_errors);
409 }
410
411 if cli.from_json {
412 progress.post_scan_step("Trimming preloaded assembly to filtered files...");
413 record_detail_timing(&progress, "post-scan:trim-preloaded-assembly", || {
414 trim_preloaded_assembly_to_files(
415 &scan_result.files,
416 &mut preloaded_assembly.packages,
417 &mut preloaded_assembly.dependencies,
418 );
419 });
420 }
421
422 progress.finish_post_scan();
423
424 let manifests_seen = scan_result
425 .files
426 .iter()
427 .map(|file| file.package_data.len())
428 .sum();
429 let skip_assembly = cli.no_assemble || cli.package_only;
430
431 let mut assembly_result = if skip_assembly {
432 assembly::AssemblyResult {
433 packages: Vec::new(),
434 dependencies: Vec::new(),
435 }
436 } else {
437 progress.start_assembly();
438
439 let mut result = if cli.from_json
440 && (!preloaded_assembly.packages.is_empty()
441 || !preloaded_assembly.dependencies.is_empty())
442 {
443 progress.assembly_step("Using preloaded assembly...");
444 preloaded_assembly
445 } else {
446 assembly::assemble(&mut scan_result.files)
447 };
448
449 progress.assembly_step("Backfilling package license provenance...");
450 record_detail_timing(&progress, "assembly:package-license-provenance", || {
451 for package in &mut result.packages {
452 package.backfill_license_provenance();
453 }
454 });
455
456 progress.assembly_step("Applying package reference following...");
457 record_detail_timing(&progress, "assembly:package-reference-following", || {
458 apply_package_reference_following(&mut scan_result.files, &mut result.packages);
459 });
460
461 progress.finish_assembly(result.packages.len(), manifests_seen);
462 result
463 };
464
465 progress.start_finalize();
466
467 if !cli.from_json && (cli.strip_root || cli.full_root) {
468 let root_path = cli
469 .dir_path
470 .first()
471 .ok_or_else(|| anyhow!("No input path available for path normalization"))?;
472 progress.finalize_step("Normalizing paths...");
473 record_detail_timing(&progress, "finalize:path-normalization", || {
474 normalize_paths(
475 &mut scan_result.files,
476 root_path,
477 cli.strip_root,
478 cli.full_root,
479 );
480 normalize_top_level_output_paths(
481 &mut assembly_result.packages,
482 &mut assembly_result.dependencies,
483 root_path,
484 cli.strip_root,
485 );
486 });
487 }
488
489 progress.finalize_step("Collecting license detections...");
490 let license_detections = record_detail_timing(&progress, "finalize:license-detections", || {
491 let preserve_preloaded_top_level_detections = cli.from_json
492 && (cli.only_findings || !cli.include.is_empty() || !cli.exclude.is_empty());
493 collect_top_level_license_detections_for_mode(
494 &scan_result.files,
495 preloaded_license_detections,
496 preserve_preloaded_top_level_detections,
497 cli.from_json && cli.dir_path.len() > 1,
498 )
499 });
500
501 let should_recompute_license_references = cli.from_json
502 && (!preloaded_license_references.is_empty()
503 || !preloaded_license_rule_references.is_empty()
504 || cli.license_references
505 || (cli.license_url_template != DEFAULT_LICENSEDB_URL_TEMPLATE
506 && !preloaded_license_references.is_empty()));
507
508 if should_recompute_license_references && active_license_engine.is_none() {
509 progress.start_license_detection_engine_creation();
510 active_license_engine = Some(init_license_engine(
511 shared_cache_config
512 .as_ref()
513 .expect("cache config should be prepared before license engine init"),
514 &cli,
515 )?);
516 progress.finish_license_detection_engine_creation("finalize:license-engine-creation");
517 }
518
519 progress.finalize_step("Collecting license references...");
520 let (license_references, license_rule_references) =
521 record_detail_timing(&progress, "finalize:license-references", || {
522 if cli.from_json && !should_recompute_license_references {
523 (
524 preloaded_license_references,
525 preloaded_license_rule_references,
526 )
527 } else if cli.license_references || should_recompute_license_references {
528 if let Some(engine) = active_license_engine.as_deref() {
529 collect_top_level_license_references(
530 &scan_result.files,
531 &assembly_result.packages,
532 engine.index(),
533 &cli.license_url_template,
534 )
535 } else {
536 (Vec::new(), Vec::new())
537 }
538 } else {
539 (Vec::new(), Vec::new())
540 }
541 });
542
543 let end_time = Utc::now();
544 let spdx_license_list_version = active_license_engine
545 .as_ref()
546 .and_then(|engine| engine.spdx_license_list_version().map(ToOwned::to_owned))
547 .or(imported_spdx_license_list_version)
548 .unwrap_or(LicenseDetectionEngine::embedded_spdx_license_list_version()?);
549 let license_index_provenance = active_license_engine
550 .as_ref()
551 .and_then(|engine| engine.license_index_provenance().cloned())
552 .or(imported_license_index_provenance);
553
554 progress.finalize_step("Preparing output...");
555 let output = record_detail_timing(&progress, "finalize:output-prepare", || {
556 create_output(
557 start_time,
558 end_time,
559 scan_result,
560 CreateOutputContext {
561 total_dirs,
562 assembly_result,
563 license_detections,
564 license_references,
565 license_rule_references,
566 spdx_license_list_version,
567 license_index_provenance,
568 extra_errors,
569 extra_warnings,
570 header_options: cli.output_header_options(),
571 options: CreateOutputOptions {
572 facet_rules: &facet_rules,
573 include_classify: cli.classify,
574 include_summary: cli.summary,
575 include_license_clarity_score: cli.license_clarity_score,
576 include_tallies: cli.tallies,
577 include_tallies_of_key_files: cli.tallies_key_files,
578 include_tallies_with_details: cli.tallies_with_details,
579 include_tallies_by_facet: cli.tallies_by_facet,
580 include_generated: cli.generated,
581 verbose: cli.verbose,
582 },
583 },
584 )
585 });
586 progress.finish_finalize();
587
588 let output_schema_output = crate::output_schema::Output::from(&output);
589 progress.start_output();
590 for target in cli.output_targets() {
591 let output_config = OutputWriteConfig {
592 format: target.format,
593 custom_template: target.custom_template.clone(),
594 scanned_path: if cli.dir_path.len() == 1 {
595 cli.dir_path.first().cloned()
596 } else {
597 None
598 },
599 };
600
601 let timing_name = format!("output:{:?}", target.format).to_lowercase();
602 record_detail_timing(&progress, timing_name, || {
603 write_output_file(&target.file, &output_schema_output, &output_config)
604 })?;
605 progress.output_written(&format!(
606 "{:?} output written to {}",
607 target.format, target.file
608 ));
609 }
610 progress.record_final_counts(&output.files);
611 progress.record_final_header_counts(&output.headers);
612 progress.finish_output();
613
614 let summary_end = Utc::now();
615 progress.display_summary(
616 &format_scancode_timestamp(&start_time),
617 &format_scancode_timestamp(&summary_end),
618 );
619
620 Ok(())
621}
622
623fn apply_only_findings_for_mode(files: &mut Vec<FileInfo>, from_json: bool) {
624 if from_json {
625 files.clear();
626 } else {
627 apply_only_findings_filter(files);
628 }
629}
630
631fn collect_top_level_license_detections_for_mode(
632 files: &[FileInfo],
633 preloaded: Vec<crate::models::TopLevelLicenseDetection>,
634 preserve_preloaded: bool,
635 clear_for_multi_input_replay: bool,
636) -> Vec<crate::models::TopLevelLicenseDetection> {
637 if clear_for_multi_input_replay {
638 Vec::new()
639 } else if preserve_preloaded {
640 preloaded
641 } else {
642 collect_top_level_license_detections(files)
643 }
644}
645
/// References the golden-test helper functions without calling them.
///
/// Only compiled with the `golden-tests` feature.
// NOTE(review): presumably this keeps the helpers from being reported as unused
// in builds that enable the feature — confirm.
#[cfg(feature = "golden-tests")]
fn touch_license_golden_symbols() {
    let _ = (
        crate::license_detection::golden_utils::read_golden_input_content,
        crate::license_detection::golden_utils::detect_matches_for_golden,
        crate::license_detection::golden_utils::detect_license_expressions_for_golden,
        crate::license_detection::LicenseDetectionEngine::detect_matches_with_kind,
    );
}
653
654fn resolve_native_scan_selection(cli: &Cli) -> Result<(String, Vec<SelectedPath>, Vec<String>)> {
655 if cli.paths_file.is_empty() {
656 let (scan_path, selected_paths) = resolve_native_scan_inputs(&cli.dir_path)?;
657 return Ok((scan_path, selected_paths, Vec::new()));
658 }
659
660 let scan_path = cli
661 .dir_path
662 .first()
663 .cloned()
664 .ok_or_else(|| anyhow!("--paths-file requires one positional scan root"))?;
665 let path_file_entries = load_paths_file_entries(&cli.paths_file)?;
666 let resolved = resolve_paths_file_entries(Path::new(&scan_path), &path_file_entries)?;
667 if resolved.selections.is_empty() {
668 return Err(anyhow!(
669 "--paths-file did not resolve to any existing files or directories under {:?}",
670 Path::new(&scan_path)
671 ));
672 }
673
674 Ok((scan_path, resolved.selections, resolved.missing_entries))
675}
676
677fn load_paths_file_entries(paths_files: &[String]) -> Result<Vec<String>> {
678 let mut entries = Vec::new();
679 for paths_file in paths_files {
680 let content = read_paths_file_content(paths_file)?;
681 entries.extend(content.lines().map(ToOwned::to_owned));
682 }
683 Ok(entries)
684}
685
686fn read_paths_file_content(paths_file: &str) -> Result<String> {
687 if paths_file == "-" {
688 let mut content = String::new();
689 std::io::stdin()
690 .read_to_string(&mut content)
691 .map_err(|err| anyhow!("Failed to read --paths-file from stdin: {err}"))?;
692 return Ok(content);
693 }
694
695 fs::read_to_string(paths_file)
696 .map_err(|err| anyhow!("Failed to read --paths-file {:?}: {err}", paths_file))
697}
698
/// Builds one warning message per `--paths-file` entry that could not be found,
/// preserving the input order.
fn build_paths_file_warning_messages(missing_entries: &[String]) -> Vec<String> {
    let mut warnings = Vec::with_capacity(missing_entries.len());
    for entry in missing_entries {
        warnings.push(format!("Skipping missing --paths-file entry: {entry}"));
    }
    warnings
}
705
706fn validate_scan_option_compatibility(cli: &Cli) -> Result<()> {
707 if cli.show_attribution {
708 return Ok(());
709 }
710
711 if cli.export_license_dataset.is_some() {
712 if !cli.dir_path.is_empty() || !cli.paths_file.is_empty() {
713 return Err(anyhow!(
714 "--export-license-dataset does not accept scan input paths or --paths-file"
715 ));
716 }
717
718 if cli.from_json
719 || cli.license
720 || cli.package
721 || cli.system_package
722 || cli.package_in_compiled
723 || cli.package_only
724 || cli.copyright
725 || cli.email
726 || cli.url
727 || cli.generated
728 || cli.info
729 || cli.incremental
730 || cli.reindex
731 || cli.no_license_index_cache
732 || cli.license_dataset_path.is_some()
733 {
734 return Err(anyhow!(
735 "--export-license-dataset is a standalone mode and cannot be combined with scan or license-index flags"
736 ));
737 }
738
739 return Ok(());
740 }
741
742 if cli.from_json
743 && (cli.package
744 || cli.system_package
745 || cli.package_in_compiled
746 || cli.package_only
747 || cli.copyright
748 || cli.email
749 || cli.url
750 || cli.generated)
751 {
752 return Err(anyhow!(
753 "When using --from-json, file scan options like --package/--copyright/--email/--url/--generated are not allowed"
754 ));
755 }
756
757 if cli.from_json && !cli.paths_file.is_empty() {
758 return Err(anyhow!(
759 "--paths-file is only supported for native scan mode, not --from-json"
760 ));
761 }
762
763 if cli.from_json && cli.incremental {
764 return Err(anyhow!(
765 "--incremental is only supported for directory scan mode, not --from-json"
766 ));
767 }
768
769 if !cli.paths_file.is_empty() && cli.dir_path.len() != 1 {
770 return Err(anyhow!(
771 "--paths-file requires exactly one positional scan root"
772 ));
773 }
774
775 if !cli.from_json && cli.dir_path.is_empty() {
776 return Err(anyhow!("Directory path is required for scan operations"));
777 }
778
779 if cli.tallies_by_facet && cli.facet.is_empty() {
780 return Err(anyhow!(
781 "--tallies-by-facet requires at least one --facet <facet>=<pattern> definition"
782 ));
783 }
784
785 if cli.mark_source && !cli.info {
786 return Err(anyhow!("--mark-source requires --info"));
787 }
788
789 Ok(())
790}
791
792fn prepare_cache_config(scan_root: Option<&Path>, cli: &Cli) -> Result<CacheConfig> {
793 let env_cache_dir = env::var_os(CACHE_DIR_ENV_VAR).map(PathBuf::from);
794 let config = CacheConfig::from_overrides(
795 scan_root,
796 cli.cache_dir.as_deref().map(Path::new),
797 env_cache_dir.as_deref(),
798 cli.incremental,
799 );
800
801 if cli.cache_clear {
802 crate::cache::locking::with_exclusive_cache_lock(config.root_dir(), || {
803 config.clear_contents()
804 })?;
805 }
806
807 if config.incremental_enabled() {
808 config.ensure_dirs()?;
809 }
810
811 Ok(config)
812}
813
814fn build_license_cache_config(cache_root: &CacheConfig, cli: &Cli) -> LicenseCacheConfig {
815 LicenseCacheConfig::new(
816 cache_root.root_dir().to_path_buf(),
817 cli.reindex,
818 !cli.no_license_index_cache,
819 )
820}
821
822fn partition_incremental_files(
823 collected_files: &mut Vec<(PathBuf, fs::Metadata)>,
824 scan_root: &Path,
825 manifest: Option<&IncrementalManifest>,
826) -> Vec<FileInfo> {
827 let Some(manifest) = manifest else {
828 return Vec::new();
829 };
830
831 let mut files_to_scan = Vec::new();
832 let mut reused_files = Vec::new();
833
834 for (path, metadata) in collected_files.drain(..) {
835 let relative_path = normalize_relative_scan_path(&path, scan_root);
836 let Some(entry) = manifest.entry(&relative_path) else {
837 files_to_scan.push((path, metadata));
838 continue;
839 };
840
841 match manifest_entry_matches_path(entry, &path, &metadata) {
842 Ok(true) => reused_files.push(entry.file_info.clone()),
843 Ok(false) | Err(_) => files_to_scan.push((path, metadata)),
844 }
845 }
846
847 *collected_files = files_to_scan;
848 reused_files
849}
850
851fn merge_incremental_file_results(
852 processed_files: Vec<FileInfo>,
853 reused_files: Vec<FileInfo>,
854 ordered_file_paths: &[PathBuf],
855) -> Vec<FileInfo> {
856 let mut processed_file_entries = HashMap::new();
857 let mut directory_entries = Vec::new();
858 for file in processed_files {
859 if file.file_type == FileType::File {
860 processed_file_entries.insert(file.path.clone(), file);
861 } else {
862 directory_entries.push(file);
863 }
864 }
865
866 let mut reused_file_entries: HashMap<_, _> = reused_files
867 .into_iter()
868 .map(|file| (file.path.clone(), file))
869 .collect();
870
871 let mut merged_files = Vec::new();
872 for path in ordered_file_paths {
873 let path_string = path.to_string_lossy().to_string();
874 if let Some(file) = processed_file_entries.remove(&path_string) {
875 merged_files.push(file);
876 continue;
877 }
878
879 if let Some(file) = reused_file_entries.remove(&path_string) {
880 merged_files.push(file);
881 }
882 }
883
884 merged_files.extend(processed_file_entries.into_values());
885 merged_files.extend(reused_file_entries.into_values());
886 merged_files.extend(directory_entries);
887 merged_files
888}
889
890fn build_incremental_manifest(
891 scan_root: &Path,
892 collected_files: &[(PathBuf, fs::Metadata)],
893 files: &[FileInfo],
894 options_fingerprint: &str,
895) -> IncrementalManifest {
896 let files_by_relative_path: HashMap<_, _> = files
897 .iter()
898 .filter(|file| file.file_type == FileType::File)
899 .map(|file| {
900 (
901 normalize_relative_scan_path(Path::new(&file.path), scan_root),
902 file.clone(),
903 )
904 })
905 .collect();
906
907 let entries = collected_files
908 .iter()
909 .filter_map(|(path, metadata)| {
910 let relative_path = normalize_relative_scan_path(path, scan_root);
911 let state = metadata_fingerprint(metadata)?;
912 let file_info = files_by_relative_path.get(&relative_path)?.clone();
913 let content_sha256 = file_info.sha256.unwrap_or_else(|| {
914 fs::read(path)
915 .map(|bytes| calculate_sha256(&bytes))
916 .unwrap_or_else(|_| {
917 Sha256Digest::from_hex(
918 "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
919 )
920 .unwrap()
921 })
922 });
923 Some((
924 relative_path,
925 IncrementalManifestEntry {
926 state,
927 content_sha256,
928 file_info,
929 },
930 ))
931 })
932 .collect::<BTreeMap<_, _>>();
933
934 IncrementalManifest::new(options_fingerprint.to_string(), entries)
935}
936
937fn incremental_manifest_key(scan_root: &Path, options_fingerprint: &str) -> String {
938 let canonical_root = fs::canonicalize(scan_root).unwrap_or_else(|_| scan_root.to_path_buf());
939 calculate_sha256(
940 format!(
941 "{}\n{options_fingerprint}",
942 canonical_root.to_string_lossy()
943 )
944 .as_bytes(),
945 )
946 .as_hex()
947}
948
/// Returns `path` relative to `scan_root`, rendered with forward slashes.
///
/// A path that is not under `scan_root` is rendered unchanged apart from the
/// separator replacement.
fn normalize_relative_scan_path(path: &Path, scan_root: &Path) -> String {
    let relative = match path.strip_prefix(scan_root) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };

    relative.to_string_lossy().replace('\\', "/")
}
955
956fn compile_regex_patterns(option_name: &str, patterns: &[String]) -> Result<Vec<Regex>> {
957 patterns
958 .iter()
959 .map(|pattern| {
960 Regex::new(pattern).map_err(|err| {
961 anyhow!("Invalid regex for {option_name} pattern \"{pattern}\": {err}")
962 })
963 })
964 .collect()
965}
966
967fn effective_timeout_seconds(process_mode: ProcessMode, timeout_seconds: f64) -> f64 {
968 match process_mode {
969 ProcessMode::SequentialWithoutTimeouts => 0.0,
970 ProcessMode::Parallel(_) | ProcessMode::SequentialWithTimeouts => timeout_seconds,
971 }
972}
973
974fn process_mode_message(process_mode: ProcessMode) -> Option<&'static str> {
975 match process_mode {
976 ProcessMode::SequentialWithTimeouts => Some("Disabling multi-processing for debugging."),
977 ProcessMode::SequentialWithoutTimeouts => {
978 Some("Disabling multi-processing and multi-threading for debugging.")
979 }
980 ProcessMode::Parallel(_) => None,
981 }
982}
983
984fn progress_mode_from_cli(cli: &Cli) -> ProgressMode {
985 if cli.quiet {
986 ProgressMode::Quiet
987 } else if cli.verbose {
988 ProgressMode::Verbose
989 } else {
990 ProgressMode::Default
991 }
992}
993
994fn configured_scan_names(cli: &Cli) -> String {
995 let mut names = Vec::new();
996 if cli.license {
997 names.push("licenses");
998 }
999 if cli.info {
1000 names.push("info");
1001 }
1002 if cli.package {
1003 names.push("packages");
1004 }
1005 if (cli.system_package || cli.package_in_compiled || cli.package_only)
1006 && !names.contains(&"packages")
1007 {
1008 names.push("packages");
1009 }
1010 if cli.copyright {
1011 names.push("copyrights");
1012 }
1013 if cli.email {
1014 names.push("emails");
1015 }
1016 if cli.url {
1017 names.push("urls");
1018 }
1019 names.join(", ")
1020}
1021
1022fn should_include_info_surface(files: &[crate::models::FileInfo], cli: &Cli) -> bool {
1023 cli.info
1024 || files.iter().any(|file| {
1025 file.date.is_some()
1026 || file.sha1.is_some()
1027 || file.md5.is_some()
1028 || file.sha256.is_some()
1029 || file.sha1_git.is_some()
1030 || file.mime_type.is_some()
1031 || file.file_type_label.is_some()
1032 || file.programming_language.is_some()
1033 || file.is_binary.is_some()
1034 || file.is_text.is_some()
1035 || file.is_archive.is_some()
1036 || file.is_media.is_some()
1037 || file.is_source.is_some()
1038 || file.is_script.is_some()
1039 || file.files_count.is_some()
1040 || file.dirs_count.is_some()
1041 || file.size_count.is_some()
1042 })
1043}
1044
1045fn record_detail_timing<T, F>(progress: &Arc<ScanProgress>, name: impl Into<String>, f: F) -> T
1046where
1047 F: FnOnce() -> T,
1048{
1049 let started = Instant::now();
1050 let result = f();
1051 progress.record_detail_timing(name.into(), started.elapsed().as_secs_f64());
1052 result
1053}
1054
1055fn run_with_thread_pool<T, F>(threads: usize, f: F) -> Result<T>
1056where
1057 F: FnOnce() -> Result<T> + Send,
1058 T: Send,
1059{
1060 let pool = rayon::ThreadPoolBuilder::new()
1061 .num_threads(threads.max(1))
1062 .build()?;
1063 pool.install(f)
1064}
1065
1066fn init_license_engine(cache_root: &CacheConfig, cli: &Cli) -> Result<Arc<LicenseDetectionEngine>> {
1067 let cache_config = build_license_cache_config(cache_root, cli);
1068
1069 match &cli.license_dataset_path {
1070 Some(p) => {
1071 let path = PathBuf::from(p);
1072 if !path.exists() {
1073 return Err(anyhow!("License dataset path does not exist: {:?}", path));
1074 }
1075 let engine = LicenseDetectionEngine::from_directory_with_cache(&path, &cache_config)?;
1076 Ok(Arc::new(engine))
1077 }
1078 None => {
1079 let engine = LicenseDetectionEngine::from_embedded_with_cache(&cache_config)?;
1080 Ok(Arc::new(engine))
1081 }
1082 }
1083}
1084
1085fn describe_license_engine_source(
1086 engine: &LicenseDetectionEngine,
1087 rules_path: Option<&str>,
1088) -> String {
1089 match rules_path {
1090 Some(path) => format!(
1091 "License detection engine initialized with {} rules from custom dataset {}",
1092 engine.index().rules_by_rid.len(),
1093 path
1094 ),
1095 None => format!(
1096 "License detection engine initialized with {} rules from embedded artifact",
1097 engine.index().rules_by_rid.len()
1098 ),
1099 }
1100}
1101
1102#[cfg(test)]
1103mod tests;