1use crate::assembly;
2use crate::cache::{
3 CACHE_DIR_ENV_VAR, CacheConfig, IncrementalManifest, IncrementalManifestEntry,
4 build_collection_exclude_patterns, incremental_manifest_path, load_incremental_manifest,
5 manifest_entry_matches_path, metadata_fingerprint, write_incremental_manifest,
6};
7use crate::cli::{Cli, ProcessMode};
8use crate::license_detection::LicenseDetectionEngine;
9use crate::license_detection::dataset::export_embedded_license_dataset;
10use crate::license_detection::license_cache::LicenseCacheConfig;
11use crate::models::{FileInfo, FileType, Sha256Digest};
12use crate::output::{OutputWriteConfig, write_output_file};
13use crate::post_processing::{
14 CreateOutputContext, CreateOutputOptions, DEFAULT_LICENSEDB_URL_TEMPLATE,
15 apply_license_policy_from_file, apply_package_reference_following, build_facet_rules,
16 collect_top_level_license_detections, collect_top_level_license_references, create_output,
17};
18use crate::progress::{ProgressMode, ScanProgress, format_default_scan_error};
19use crate::scan_result_shaping::{
20 apply_cli_path_selection_filter, apply_ignore_resource_filter, apply_mark_source,
21 apply_only_findings_filter, apply_user_path_filters_to_collected, filter_redundant_clues,
22 filter_redundant_clues_with_rules, load_and_merge_json_inputs, normalize_paths,
23 normalize_top_level_output_paths, populate_info_resource_counts,
24 prepare_filter_clue_rule_lookup, resolve_native_scan_inputs, trim_preloaded_assembly_to_files,
25};
26use crate::scanner::{
27 LicenseScanOptions, TextDetectionOptions, collect_paths, process_collected_with_memory_limit,
28 process_collected_with_memory_limit_sequential, scan_options_fingerprint,
29};
30use crate::time::format_scancode_timestamp;
31use crate::utils::hash::calculate_sha256;
32use anyhow::{Result, anyhow};
33use chrono::Utc;
34use clap::Parser;
35use regex::Regex;
36use std::collections::{BTreeMap, HashMap};
37use std::env;
38use std::fs;
39use std::path::{Path, PathBuf};
40use std::sync::Arc;
41use std::time::Instant;
42
43pub fn run() -> Result<()> {
44 #[cfg(feature = "golden-tests")]
45 touch_license_golden_symbols();
46
47 let cli = Cli::parse();
48
49 validate_scan_option_compatibility(&cli)?;
50
51 if cli.show_attribution {
52 print!("{}", include_str!("../../../NOTICE"));
53 return Ok(());
54 }
55
56 if let Some(export_dir) = cli.export_license_dataset.as_deref() {
57 export_embedded_license_dataset(Path::new(export_dir))?;
58 return Ok(());
59 }
60
61 let start_time = Utc::now();
62 let progress = Arc::new(ScanProgress::new(progress_mode_from_cli(&cli)));
63 progress.set_processes(cli.processes);
64 progress.set_scan_names(configured_scan_names(&cli));
65 progress.init_logging_bridge();
66 let mut shared_license_cache_config: Option<LicenseCacheConfig> = None;
67
68 progress.start_setup();
69 let facet_rules = build_facet_rules(&cli.facet)?;
70
71 let ignore_author_patterns = compile_regex_patterns("--ignore-author", &cli.ignore_author)?;
72 let ignore_copyright_holder_patterns =
73 compile_regex_patterns("--ignore-copyright-holder", &cli.ignore_copyright_holder)?;
74 progress.finish_setup();
75
76 progress.start_discovery();
77
78 let mut shared_cache_config = if cli.from_json {
79 let cache_config = prepare_cache_config(None, &cli)?;
80 shared_license_cache_config = Some(build_license_cache_config(&cache_config, &cli));
81 Some(cache_config)
82 } else {
83 None
84 };
85
86 let (
87 mut scan_result,
88 total_dirs,
89 mut preloaded_assembly,
90 preloaded_license_detections,
91 preloaded_license_references,
92 preloaded_license_rule_references,
93 preloaded_extra_errors,
94 imported_spdx_license_list_version,
95 imported_license_index_provenance,
96 mut active_license_engine,
97 ) = if cli.from_json {
98 let loaded = load_and_merge_json_inputs(&cli.dir_path, cli.strip_root, cli.full_root)?;
99 let directories_count = loaded.directory_count();
100 let files_count = loaded.file_count();
101 let size_count = loaded.file_size_count();
102 progress.finish_discovery(
103 files_count,
104 directories_count,
105 size_count,
106 loaded.excluded_count,
107 );
108 let (
109 process_result,
110 assembly_result,
111 license_detections,
112 license_references,
113 license_rule_references,
114 extra_errors,
115 imported_spdx_license_list_version,
116 imported_license_index_provenance,
117 ) = loaded.into_parts()?;
118 (
119 process_result,
120 directories_count,
121 assembly_result,
122 license_detections,
123 license_references,
124 license_rule_references,
125 extra_errors,
126 imported_spdx_license_list_version,
127 imported_license_index_provenance,
128 None,
129 )
130 } else {
131 let (scan_path, native_input_includes) = resolve_native_scan_inputs(&cli.dir_path)?;
132 let mut native_include_patterns = cli.include.clone();
133 native_include_patterns.extend(native_input_includes);
134
135 let cache_config = prepare_cache_config(Some(Path::new(&scan_path)), &cli)?;
136 shared_license_cache_config = Some(build_license_cache_config(&cache_config, &cli));
137 shared_cache_config = Some(cache_config.clone());
138 let collection_exclude_patterns =
139 build_collection_exclude_patterns(Path::new(&scan_path), cache_config.root_dir());
140
141 let mut collected = collect_paths(&scan_path, cli.max_depth, &collection_exclude_patterns);
142 let user_excluded_count = apply_user_path_filters_to_collected(
143 &mut collected,
144 Path::new(&scan_path),
145 &native_include_patterns,
146 &cli.exclude,
147 );
148 let total_files = collected.file_count();
149 let total_dirs = collected.directory_count();
150 let total_size = collected.total_file_bytes;
151 let excluded_count = collected.excluded_count + user_excluded_count;
152 let all_collected_files = collected.files.clone();
153 let ordered_file_paths: Vec<PathBuf> = collected
154 .files
155 .iter()
156 .map(|(path, _)| path.clone())
157 .collect();
158 let runtime_errors = collected
159 .collection_errors
160 .iter()
161 .map(|(path, err)| format_default_scan_error(path, err))
162 .collect();
163 for (path, err) in &collected.collection_errors {
164 progress.record_runtime_error(path, err);
165 }
166 progress.finish_discovery(total_files, total_dirs, total_size, excluded_count);
167 if !cli.quiet {
168 progress.output_written(&format!(
169 "Found {} files in {} directories ({} items excluded)",
170 total_files, total_dirs, excluded_count
171 ));
172 }
173
174 let license_engine = if cli.license {
175 progress.start_setup();
176 progress.start_license_detection_engine_creation();
177 let engine = init_license_engine(
178 shared_cache_config
179 .as_ref()
180 .expect("cache config should be prepared before license engine init"),
181 &cli,
182 )?;
183 progress.finish_license_detection_engine_creation("setup_scan:licenses");
184 progress.finish_setup();
185 progress.output_written(&describe_license_engine_source(
186 &engine,
187 cli.license_dataset_path.as_deref(),
188 ));
189 Some(engine)
190 } else {
191 None
192 };
193
194 let enable_application_packages = cli.package || cli.package_only;
195 let enable_system_packages = cli.system_package || cli.package_only;
196 let enable_packages =
197 enable_application_packages || enable_system_packages || cli.package_in_compiled;
198 let (detect_copyrights, detect_emails, detect_urls, detect_generated) = if cli.package_only
199 {
200 (false, cli.email, cli.url, cli.generated)
201 } else {
202 (cli.copyright, cli.email, cli.url, cli.generated)
203 };
204 let process_mode = cli.processes;
205
206 let text_options = TextDetectionOptions {
207 collect_info: cli.info,
208 detect_packages: enable_packages,
209 detect_application_packages: enable_application_packages,
210 detect_system_packages: enable_system_packages,
211 detect_packages_in_compiled: cli.package_in_compiled,
212 detect_copyrights,
213 detect_generated,
214 detect_emails,
215 detect_urls,
216 max_emails: cli.max_email,
217 max_urls: cli.max_url,
218 timeout_seconds: effective_timeout_seconds(process_mode, cli.timeout),
219 };
220
221 let license_options = LicenseScanOptions {
222 include_text: cli.license_text,
223 include_text_diagnostics: cli.license_text_diagnostics,
224 include_diagnostics: cli.license_diagnostics,
225 unknown_licenses: cli.unknown_licenses,
226 min_score: cli.license_score,
227 };
228 let options_fingerprint =
229 scan_options_fingerprint(&text_options, license_options, license_engine.as_deref());
230
231 if cli.incremental {
232 let manifest_path = incremental_manifest_path(
233 cache_config.root_dir(),
234 &incremental_manifest_key(Path::new(&scan_path), &options_fingerprint),
235 );
236 let previous_manifest =
237 load_incremental_manifest(&manifest_path, &options_fingerprint)?;
238 let reused_files = partition_incremental_files(
239 &mut collected.files,
240 Path::new(&scan_path),
241 previous_manifest.as_ref(),
242 );
243 progress.record_incremental_reused(reused_files.len());
244 }
245
246 if let Some(message) = process_mode_message(process_mode) {
247 progress.output_written(message);
248 }
249 progress.start_scan(collected.file_count());
250 let mut result = match process_mode {
251 ProcessMode::Parallel(thread_count) => run_with_thread_pool(thread_count, || {
252 Ok(process_collected_with_memory_limit(
253 &collected,
254 Arc::clone(&progress),
255 license_engine.clone(),
256 license_options,
257 &text_options,
258 cli.max_in_memory,
259 ))
260 })?,
261 ProcessMode::SequentialWithTimeouts | ProcessMode::SequentialWithoutTimeouts => {
262 process_collected_with_memory_limit_sequential(
263 &collected,
264 Arc::clone(&progress),
265 license_engine.clone(),
266 license_options,
267 &text_options,
268 cli.max_in_memory,
269 )
270 }
271 };
272
273 if cli.incremental {
274 let manifest_path = incremental_manifest_path(
275 cache_config.root_dir(),
276 &incremental_manifest_key(Path::new(&scan_path), &options_fingerprint),
277 );
278 let reused_files = partition_incremental_files(
279 &mut all_collected_files.clone(),
280 Path::new(&scan_path),
281 load_incremental_manifest(&manifest_path, &options_fingerprint)?.as_ref(),
282 );
283 result.files =
284 merge_incremental_file_results(result.files, reused_files, &ordered_file_paths);
285
286 let manifest = build_incremental_manifest(
287 Path::new(&scan_path),
288 &all_collected_files,
289 &result.files,
290 &options_fingerprint,
291 );
292 write_incremental_manifest(cache_config.root_dir(), &manifest_path, &manifest)?;
293 }
294
295 result.excluded_count = excluded_count;
296 progress.finish_scan();
297
298 (
299 result,
300 total_dirs,
301 assembly::AssemblyResult {
302 packages: Vec::new(),
303 dependencies: Vec::new(),
304 },
305 Vec::new(),
306 Vec::new(),
307 Vec::new(),
308 runtime_errors,
309 None,
310 None,
311 license_engine,
312 )
313 };
314
315 progress.start_post_scan();
316
317 if cli.filter_clues {
318 progress.post_scan_step("Filtering redundant clues...");
319 let clue_rule_lookup = record_detail_timing(&progress, "post-scan:filter-clues", || {
320 prepare_filter_clue_rule_lookup(
321 &scan_result.files,
322 active_license_engine.as_deref(),
323 cli.license_dataset_path.as_deref(),
324 shared_license_cache_config.as_ref(),
325 )
326 })?;
327 if let Some(clue_rule_lookup) = clue_rule_lookup.as_ref() {
328 filter_redundant_clues_with_rules(&mut scan_result.files, Some(clue_rule_lookup));
329 } else {
330 filter_redundant_clues(&mut scan_result.files);
331 }
332 }
333
334 if !ignore_author_patterns.is_empty() || !ignore_copyright_holder_patterns.is_empty() {
335 progress.post_scan_step("Applying ignore-resource filters...");
336 record_detail_timing(&progress, "post-scan:ignore-resource", || {
337 apply_ignore_resource_filter(
338 &mut scan_result.files,
339 &ignore_copyright_holder_patterns,
340 &ignore_author_patterns,
341 );
342 });
343 }
344
345 if cli.from_json && (!cli.include.is_empty() || !cli.exclude.is_empty()) {
346 progress.post_scan_step("Applying path selection filters...");
347 record_detail_timing(&progress, "output-filter:path-selection", || {
348 apply_cli_path_selection_filter(&mut scan_result.files, &cli.include, &cli.exclude);
349 });
350 }
351
352 if cli.only_findings {
353 progress.post_scan_step("Filtering to files with findings...");
354 record_detail_timing(&progress, "output-filter:only-findings", || {
355 apply_only_findings_for_mode(&mut scan_result.files, cli.from_json);
356 });
357 }
358
359 if cli.info && cli.mark_source {
360 progress.post_scan_step("Marking source files...");
361 record_detail_timing(&progress, "post-scan:mark-source", || {
362 apply_mark_source(&mut scan_result.files);
363 });
364 }
365
366 if should_include_info_surface(&scan_result.files, &cli) {
367 progress.post_scan_step("Populating info resource counts...");
368 record_detail_timing(&progress, "post-scan:info-resource-counts", || {
369 populate_info_resource_counts(&mut scan_result.files);
370 });
371 }
372
373 progress.post_scan_step("Backfilling license provenance...");
374 record_detail_timing(&progress, "post-scan:license-provenance", || {
375 for file in &mut scan_result.files {
376 file.backfill_license_provenance();
377 }
378 });
379
380 if cli.from_json {
381 for err in &preloaded_extra_errors {
382 progress.record_additional_error(err);
383 }
384 }
385
386 let mut extra_errors = preloaded_extra_errors;
387 if let Some(policy_path) = cli.license_policy.as_deref() {
388 progress.post_scan_step("Applying license policy...");
389 let license_policy_errors =
390 record_detail_timing(&progress, "post-scan:license-policy", || {
391 apply_license_policy_from_file(&mut scan_result.files, Path::new(policy_path))
392 })?;
393 for err in &license_policy_errors {
394 progress.record_additional_error(err);
395 }
396 extra_errors.extend(license_policy_errors);
397 }
398
399 if cli.from_json {
400 progress.post_scan_step("Trimming preloaded assembly to filtered files...");
401 record_detail_timing(&progress, "post-scan:trim-preloaded-assembly", || {
402 trim_preloaded_assembly_to_files(
403 &scan_result.files,
404 &mut preloaded_assembly.packages,
405 &mut preloaded_assembly.dependencies,
406 );
407 });
408 }
409
410 progress.finish_post_scan();
411
412 let manifests_seen = scan_result
413 .files
414 .iter()
415 .map(|file| file.package_data.len())
416 .sum();
417 let skip_assembly = cli.no_assemble || cli.package_only;
418
419 let mut assembly_result = if skip_assembly {
420 assembly::AssemblyResult {
421 packages: Vec::new(),
422 dependencies: Vec::new(),
423 }
424 } else {
425 progress.start_assembly();
426
427 let mut result = if cli.from_json
428 && (!preloaded_assembly.packages.is_empty()
429 || !preloaded_assembly.dependencies.is_empty())
430 {
431 progress.assembly_step("Using preloaded assembly...");
432 preloaded_assembly
433 } else {
434 assembly::assemble(&mut scan_result.files)
435 };
436
437 progress.assembly_step("Backfilling package license provenance...");
438 record_detail_timing(&progress, "assembly:package-license-provenance", || {
439 for package in &mut result.packages {
440 package.backfill_license_provenance();
441 }
442 });
443
444 progress.assembly_step("Applying package reference following...");
445 record_detail_timing(&progress, "assembly:package-reference-following", || {
446 apply_package_reference_following(&mut scan_result.files, &mut result.packages);
447 });
448
449 progress.finish_assembly(result.packages.len(), manifests_seen);
450 result
451 };
452
453 progress.start_finalize();
454
455 if !cli.from_json && (cli.strip_root || cli.full_root) {
456 let root_path = cli
457 .dir_path
458 .first()
459 .ok_or_else(|| anyhow!("No input path available for path normalization"))?;
460 progress.finalize_step("Normalizing paths...");
461 record_detail_timing(&progress, "finalize:path-normalization", || {
462 normalize_paths(
463 &mut scan_result.files,
464 root_path,
465 cli.strip_root,
466 cli.full_root,
467 );
468 normalize_top_level_output_paths(
469 &mut assembly_result.packages,
470 &mut assembly_result.dependencies,
471 root_path,
472 cli.strip_root,
473 );
474 });
475 }
476
477 progress.finalize_step("Collecting license detections...");
478 let license_detections = record_detail_timing(&progress, "finalize:license-detections", || {
479 let preserve_preloaded_top_level_detections = cli.from_json
480 && (cli.only_findings || !cli.include.is_empty() || !cli.exclude.is_empty());
481 collect_top_level_license_detections_for_mode(
482 &scan_result.files,
483 preloaded_license_detections,
484 preserve_preloaded_top_level_detections,
485 cli.from_json && cli.dir_path.len() > 1,
486 )
487 });
488
489 let should_recompute_license_references = cli.from_json
490 && (!preloaded_license_references.is_empty()
491 || !preloaded_license_rule_references.is_empty()
492 || cli.license_references
493 || (cli.license_url_template != DEFAULT_LICENSEDB_URL_TEMPLATE
494 && !preloaded_license_references.is_empty()));
495
496 if should_recompute_license_references && active_license_engine.is_none() {
497 progress.start_license_detection_engine_creation();
498 active_license_engine = Some(init_license_engine(
499 shared_cache_config
500 .as_ref()
501 .expect("cache config should be prepared before license engine init"),
502 &cli,
503 )?);
504 progress.finish_license_detection_engine_creation("finalize:license-engine-creation");
505 }
506
507 progress.finalize_step("Collecting license references...");
508 let (license_references, license_rule_references) =
509 record_detail_timing(&progress, "finalize:license-references", || {
510 if cli.from_json && !should_recompute_license_references {
511 (
512 preloaded_license_references,
513 preloaded_license_rule_references,
514 )
515 } else if cli.license_references || should_recompute_license_references {
516 if let Some(engine) = active_license_engine.as_deref() {
517 collect_top_level_license_references(
518 &scan_result.files,
519 &assembly_result.packages,
520 engine.index(),
521 &cli.license_url_template,
522 )
523 } else {
524 (Vec::new(), Vec::new())
525 }
526 } else {
527 (Vec::new(), Vec::new())
528 }
529 });
530
531 let end_time = Utc::now();
532 let spdx_license_list_version = active_license_engine
533 .as_ref()
534 .and_then(|engine| engine.spdx_license_list_version().map(ToOwned::to_owned))
535 .or(imported_spdx_license_list_version)
536 .unwrap_or(LicenseDetectionEngine::embedded_spdx_license_list_version()?);
537 let license_index_provenance = active_license_engine
538 .as_ref()
539 .and_then(|engine| engine.license_index_provenance().cloned())
540 .or(imported_license_index_provenance);
541
542 progress.finalize_step("Preparing output...");
543 let output = record_detail_timing(&progress, "finalize:output-prepare", || {
544 create_output(
545 start_time,
546 end_time,
547 scan_result,
548 CreateOutputContext {
549 total_dirs,
550 assembly_result,
551 license_detections,
552 license_references,
553 license_rule_references,
554 spdx_license_list_version,
555 license_index_provenance,
556 extra_errors,
557 extra_warnings: Vec::new(),
558 header_options: cli.output_header_options(),
559 options: CreateOutputOptions {
560 facet_rules: &facet_rules,
561 include_classify: cli.classify,
562 include_summary: cli.summary,
563 include_license_clarity_score: cli.license_clarity_score,
564 include_tallies: cli.tallies,
565 include_tallies_of_key_files: cli.tallies_key_files,
566 include_tallies_with_details: cli.tallies_with_details,
567 include_tallies_by_facet: cli.tallies_by_facet,
568 include_generated: cli.generated,
569 verbose: cli.verbose,
570 },
571 },
572 )
573 });
574 progress.finish_finalize();
575
576 let output_schema_output = crate::output_schema::Output::from(&output);
577 progress.start_output();
578 for target in cli.output_targets() {
579 let output_config = OutputWriteConfig {
580 format: target.format,
581 custom_template: target.custom_template.clone(),
582 scanned_path: if cli.dir_path.len() == 1 {
583 cli.dir_path.first().cloned()
584 } else {
585 None
586 },
587 };
588
589 let timing_name = format!("output:{:?}", target.format).to_lowercase();
590 record_detail_timing(&progress, timing_name, || {
591 write_output_file(&target.file, &output_schema_output, &output_config)
592 })?;
593 progress.output_written(&format!(
594 "{:?} output written to {}",
595 target.format, target.file
596 ));
597 }
598 progress.record_final_counts(&output.files);
599 progress.finish_output();
600
601 let summary_end = Utc::now();
602 progress.display_summary(
603 &format_scancode_timestamp(&start_time),
604 &format_scancode_timestamp(&summary_end),
605 );
606
607 Ok(())
608}
609
610fn apply_only_findings_for_mode(files: &mut Vec<FileInfo>, from_json: bool) {
611 if from_json {
612 files.clear();
613 } else {
614 apply_only_findings_filter(files);
615 }
616}
617
618fn collect_top_level_license_detections_for_mode(
619 files: &[FileInfo],
620 preloaded: Vec<crate::models::TopLevelLicenseDetection>,
621 preserve_preloaded: bool,
622 clear_for_multi_input_replay: bool,
623) -> Vec<crate::models::TopLevelLicenseDetection> {
624 if clear_for_multi_input_replay {
625 Vec::new()
626 } else if preserve_preloaded {
627 preloaded
628 } else {
629 collect_top_level_license_detections(files)
630 }
631}
632
/// References the golden-test helper functions without calling them.
// NOTE(review): presumably this keeps the helpers from being reported as unused
// when the `golden-tests` feature is enabled — confirm.
#[cfg(feature = "golden-tests")]
fn touch_license_golden_symbols() {
    let _ = (
        crate::license_detection::golden_utils::read_golden_input_content,
        crate::license_detection::golden_utils::detect_matches_for_golden,
        crate::license_detection::golden_utils::detect_license_expressions_for_golden,
        crate::license_detection::LicenseDetectionEngine::detect_matches_with_kind,
    );
}
640
641fn validate_scan_option_compatibility(cli: &Cli) -> Result<()> {
642 if cli.show_attribution {
643 return Ok(());
644 }
645
646 if cli.export_license_dataset.is_some() {
647 if !cli.dir_path.is_empty() {
648 return Err(anyhow!(
649 "--export-license-dataset does not accept scan input paths"
650 ));
651 }
652
653 if cli.from_json
654 || cli.license
655 || cli.package
656 || cli.system_package
657 || cli.package_in_compiled
658 || cli.package_only
659 || cli.copyright
660 || cli.email
661 || cli.url
662 || cli.generated
663 || cli.info
664 || cli.incremental
665 || cli.reindex
666 || cli.no_license_index_cache
667 || cli.license_dataset_path.is_some()
668 {
669 return Err(anyhow!(
670 "--export-license-dataset is a standalone mode and cannot be combined with scan or license-index flags"
671 ));
672 }
673
674 return Ok(());
675 }
676
677 if cli.from_json
678 && (cli.package
679 || cli.system_package
680 || cli.package_in_compiled
681 || cli.package_only
682 || cli.copyright
683 || cli.email
684 || cli.url
685 || cli.generated)
686 {
687 return Err(anyhow!(
688 "When using --from-json, file scan options like --package/--copyright/--email/--url/--generated are not allowed"
689 ));
690 }
691
692 if cli.from_json && cli.incremental {
693 return Err(anyhow!(
694 "--incremental is only supported for directory scan mode, not --from-json"
695 ));
696 }
697
698 if !cli.from_json && cli.dir_path.is_empty() {
699 return Err(anyhow!("Directory path is required for scan operations"));
700 }
701
702 if cli.tallies_by_facet && cli.facet.is_empty() {
703 return Err(anyhow!(
704 "--tallies-by-facet requires at least one --facet <facet>=<pattern> definition"
705 ));
706 }
707
708 if cli.mark_source && !cli.info {
709 return Err(anyhow!("--mark-source requires --info"));
710 }
711
712 Ok(())
713}
714
715fn prepare_cache_config(scan_root: Option<&Path>, cli: &Cli) -> Result<CacheConfig> {
716 let env_cache_dir = env::var_os(CACHE_DIR_ENV_VAR).map(PathBuf::from);
717 let config = CacheConfig::from_overrides(
718 scan_root,
719 cli.cache_dir.as_deref().map(Path::new),
720 env_cache_dir.as_deref(),
721 cli.incremental,
722 );
723
724 if cli.cache_clear {
725 crate::cache::locking::with_exclusive_cache_lock(config.root_dir(), || {
726 config.clear_contents()
727 })?;
728 }
729
730 if config.incremental_enabled() {
731 config.ensure_dirs()?;
732 }
733
734 Ok(config)
735}
736
737fn build_license_cache_config(cache_root: &CacheConfig, cli: &Cli) -> LicenseCacheConfig {
738 LicenseCacheConfig::new(
739 cache_root.root_dir().to_path_buf(),
740 cli.reindex,
741 !cli.no_license_index_cache,
742 )
743}
744
745fn partition_incremental_files(
746 collected_files: &mut Vec<(PathBuf, fs::Metadata)>,
747 scan_root: &Path,
748 manifest: Option<&IncrementalManifest>,
749) -> Vec<FileInfo> {
750 let Some(manifest) = manifest else {
751 return Vec::new();
752 };
753
754 let mut files_to_scan = Vec::new();
755 let mut reused_files = Vec::new();
756
757 for (path, metadata) in collected_files.drain(..) {
758 let relative_path = normalize_relative_scan_path(&path, scan_root);
759 let Some(entry) = manifest.entry(&relative_path) else {
760 files_to_scan.push((path, metadata));
761 continue;
762 };
763
764 match manifest_entry_matches_path(entry, &path, &metadata) {
765 Ok(true) => reused_files.push(entry.file_info.clone()),
766 Ok(false) | Err(_) => files_to_scan.push((path, metadata)),
767 }
768 }
769
770 *collected_files = files_to_scan;
771 reused_files
772}
773
774fn merge_incremental_file_results(
775 processed_files: Vec<FileInfo>,
776 reused_files: Vec<FileInfo>,
777 ordered_file_paths: &[PathBuf],
778) -> Vec<FileInfo> {
779 let mut processed_file_entries = HashMap::new();
780 let mut directory_entries = Vec::new();
781 for file in processed_files {
782 if file.file_type == FileType::File {
783 processed_file_entries.insert(file.path.clone(), file);
784 } else {
785 directory_entries.push(file);
786 }
787 }
788
789 let mut reused_file_entries: HashMap<_, _> = reused_files
790 .into_iter()
791 .map(|file| (file.path.clone(), file))
792 .collect();
793
794 let mut merged_files = Vec::new();
795 for path in ordered_file_paths {
796 let path_string = path.to_string_lossy().to_string();
797 if let Some(file) = processed_file_entries.remove(&path_string) {
798 merged_files.push(file);
799 continue;
800 }
801
802 if let Some(file) = reused_file_entries.remove(&path_string) {
803 merged_files.push(file);
804 }
805 }
806
807 merged_files.extend(processed_file_entries.into_values());
808 merged_files.extend(reused_file_entries.into_values());
809 merged_files.extend(directory_entries);
810 merged_files
811}
812
813fn build_incremental_manifest(
814 scan_root: &Path,
815 collected_files: &[(PathBuf, fs::Metadata)],
816 files: &[FileInfo],
817 options_fingerprint: &str,
818) -> IncrementalManifest {
819 let files_by_relative_path: HashMap<_, _> = files
820 .iter()
821 .filter(|file| file.file_type == FileType::File)
822 .map(|file| {
823 (
824 normalize_relative_scan_path(Path::new(&file.path), scan_root),
825 file.clone(),
826 )
827 })
828 .collect();
829
830 let entries = collected_files
831 .iter()
832 .filter_map(|(path, metadata)| {
833 let relative_path = normalize_relative_scan_path(path, scan_root);
834 let state = metadata_fingerprint(metadata)?;
835 let file_info = files_by_relative_path.get(&relative_path)?.clone();
836 let content_sha256 = file_info.sha256.unwrap_or_else(|| {
837 fs::read(path)
838 .map(|bytes| calculate_sha256(&bytes))
839 .unwrap_or_else(|_| {
840 Sha256Digest::from_hex(
841 "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
842 )
843 .unwrap()
844 })
845 });
846 Some((
847 relative_path,
848 IncrementalManifestEntry {
849 state,
850 content_sha256,
851 file_info,
852 },
853 ))
854 })
855 .collect::<BTreeMap<_, _>>();
856
857 IncrementalManifest::new(options_fingerprint.to_string(), entries)
858}
859
860fn incremental_manifest_key(scan_root: &Path, options_fingerprint: &str) -> String {
861 let canonical_root = fs::canonicalize(scan_root).unwrap_or_else(|_| scan_root.to_path_buf());
862 calculate_sha256(
863 format!(
864 "{}\n{options_fingerprint}",
865 canonical_root.to_string_lossy()
866 )
867 .as_bytes(),
868 )
869 .as_hex()
870}
871
/// Returns `path` relative to `scan_root` as a string with `/` separators.
///
/// When `path` is not located under `scan_root` the full path is used. Invalid
/// Unicode is replaced lossily and every backslash becomes a forward slash.
fn normalize_relative_scan_path(path: &Path, scan_root: &Path) -> String {
    let relative = match path.strip_prefix(scan_root) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };
    let lossy = relative.to_string_lossy();

    lossy.replace('\\', "/")
}
878
879fn compile_regex_patterns(option_name: &str, patterns: &[String]) -> Result<Vec<Regex>> {
880 patterns
881 .iter()
882 .map(|pattern| {
883 Regex::new(pattern).map_err(|err| {
884 anyhow!("Invalid regex for {option_name} pattern \"{pattern}\": {err}")
885 })
886 })
887 .collect()
888}
889
890fn effective_timeout_seconds(process_mode: ProcessMode, timeout_seconds: f64) -> f64 {
891 match process_mode {
892 ProcessMode::SequentialWithoutTimeouts => 0.0,
893 ProcessMode::Parallel(_) | ProcessMode::SequentialWithTimeouts => timeout_seconds,
894 }
895}
896
897fn process_mode_message(process_mode: ProcessMode) -> Option<&'static str> {
898 match process_mode {
899 ProcessMode::SequentialWithTimeouts => Some("Disabling multi-processing for debugging."),
900 ProcessMode::SequentialWithoutTimeouts => {
901 Some("Disabling multi-processing and multi-threading for debugging.")
902 }
903 ProcessMode::Parallel(_) => None,
904 }
905}
906
907fn progress_mode_from_cli(cli: &Cli) -> ProgressMode {
908 if cli.quiet {
909 ProgressMode::Quiet
910 } else if cli.verbose {
911 ProgressMode::Verbose
912 } else {
913 ProgressMode::Default
914 }
915}
916
917fn configured_scan_names(cli: &Cli) -> String {
918 let mut names = Vec::new();
919 if cli.license {
920 names.push("licenses");
921 }
922 if cli.info {
923 names.push("info");
924 }
925 if cli.package {
926 names.push("packages");
927 }
928 if (cli.system_package || cli.package_in_compiled || cli.package_only)
929 && !names.contains(&"packages")
930 {
931 names.push("packages");
932 }
933 if cli.copyright {
934 names.push("copyrights");
935 }
936 if cli.email {
937 names.push("emails");
938 }
939 if cli.url {
940 names.push("urls");
941 }
942 names.join(", ")
943}
944
945fn should_include_info_surface(files: &[crate::models::FileInfo], cli: &Cli) -> bool {
946 cli.info
947 || files.iter().any(|file| {
948 file.date.is_some()
949 || file.sha1.is_some()
950 || file.md5.is_some()
951 || file.sha256.is_some()
952 || file.sha1_git.is_some()
953 || file.mime_type.is_some()
954 || file.file_type_label.is_some()
955 || file.programming_language.is_some()
956 || file.is_binary.is_some()
957 || file.is_text.is_some()
958 || file.is_archive.is_some()
959 || file.is_media.is_some()
960 || file.is_source.is_some()
961 || file.is_script.is_some()
962 || file.files_count.is_some()
963 || file.dirs_count.is_some()
964 || file.size_count.is_some()
965 })
966}
967
968fn record_detail_timing<T, F>(progress: &Arc<ScanProgress>, name: impl Into<String>, f: F) -> T
969where
970 F: FnOnce() -> T,
971{
972 let started = Instant::now();
973 let result = f();
974 progress.record_detail_timing(name.into(), started.elapsed().as_secs_f64());
975 result
976}
977
978fn run_with_thread_pool<T, F>(threads: usize, f: F) -> Result<T>
979where
980 F: FnOnce() -> Result<T> + Send,
981 T: Send,
982{
983 let pool = rayon::ThreadPoolBuilder::new()
984 .num_threads(threads.max(1))
985 .build()?;
986 pool.install(f)
987}
988
989fn init_license_engine(cache_root: &CacheConfig, cli: &Cli) -> Result<Arc<LicenseDetectionEngine>> {
990 let cache_config = build_license_cache_config(cache_root, cli);
991
992 match &cli.license_dataset_path {
993 Some(p) => {
994 let path = PathBuf::from(p);
995 if !path.exists() {
996 return Err(anyhow!("License dataset path does not exist: {:?}", path));
997 }
998 let engine = LicenseDetectionEngine::from_directory_with_cache(&path, &cache_config)?;
999 Ok(Arc::new(engine))
1000 }
1001 None => {
1002 let engine = LicenseDetectionEngine::from_embedded_with_cache(&cache_config)?;
1003 Ok(Arc::new(engine))
1004 }
1005 }
1006}
1007
1008fn describe_license_engine_source(
1009 engine: &LicenseDetectionEngine,
1010 rules_path: Option<&str>,
1011) -> String {
1012 match rules_path {
1013 Some(path) => format!(
1014 "License detection engine initialized with {} rules from custom dataset {}",
1015 engine.index().rules_by_rid.len(),
1016 path
1017 ),
1018 None => format!(
1019 "License detection engine initialized with {} rules from embedded artifact",
1020 engine.index().rules_by_rid.len()
1021 ),
1022 }
1023}
1024
1025#[cfg(test)]
1026mod tests;