1use crate::assembly;
5use crate::cache::{
6 CACHE_DIR_ENV_VAR, CacheConfig, IncrementalManifest, IncrementalManifestEntry,
7 build_collection_exclude_patterns, incremental_manifest_path, load_incremental_manifest,
8 manifest_entry_matches_path, metadata_fingerprint, write_incremental_manifest,
9};
10use crate::cli::{Cli, ProcessMode};
11use crate::license_detection::LicenseDetectionEngine;
12use crate::license_detection::dataset::export_embedded_license_dataset;
13use crate::license_detection::license_cache::LicenseCacheConfig;
14use crate::models::{FileInfo, FileType, Sha256Digest};
15use crate::output::{OutputWriteConfig, write_output_file};
16use crate::post_processing::{
17 CreateOutputContext, CreateOutputOptions, DEFAULT_LICENSEDB_URL_TEMPLATE,
18 apply_license_policy_from_file, apply_package_reference_following, build_facet_rules,
19 collect_top_level_license_detections, collect_top_level_license_references, create_output,
20};
21use crate::progress::{ProgressMode, ScanProgress, format_default_scan_error};
22use crate::scan_result_shaping::{
23 apply_cli_path_selection_filter, apply_ignore_resource_filter, apply_mark_source,
24 apply_only_findings_filter, apply_user_path_filters_to_collected, filter_redundant_clues,
25 filter_redundant_clues_with_rules, load_and_merge_json_inputs, normalize_paths,
26 normalize_top_level_output_paths, populate_info_resource_counts,
27 prepare_filter_clue_rule_lookup, resolve_native_scan_inputs, trim_preloaded_assembly_to_files,
28};
29use crate::scanner::{
30 LicenseScanOptions, TextDetectionOptions, collect_paths, process_collected_with_memory_limit,
31 process_collected_with_memory_limit_sequential, scan_options_fingerprint,
32};
33use crate::time::format_scancode_timestamp;
34use crate::utils::hash::calculate_sha256;
35use anyhow::{Result, anyhow};
36use chrono::Utc;
37use clap::Parser;
38use regex::Regex;
39use std::collections::{BTreeMap, HashMap};
40use std::env;
41use std::fs;
42use std::path::{Path, PathBuf};
43use std::sync::Arc;
44use std::time::Instant;
45
46pub fn run() -> Result<()> {
47 #[cfg(feature = "golden-tests")]
48 touch_license_golden_symbols();
49
50 let cli = Cli::parse();
51
52 validate_scan_option_compatibility(&cli)?;
53
54 if cli.show_attribution {
55 print!("{}", include_str!("../../../NOTICE"));
56 return Ok(());
57 }
58
59 if let Some(export_dir) = cli.export_license_dataset.as_deref() {
60 export_embedded_license_dataset(Path::new(export_dir))?;
61 return Ok(());
62 }
63
64 let start_time = Utc::now();
65 let progress = Arc::new(ScanProgress::new(progress_mode_from_cli(&cli)));
66 progress.set_processes(cli.processes);
67 progress.set_scan_names(configured_scan_names(&cli));
68 progress.init_logging_bridge();
69 let mut shared_license_cache_config: Option<LicenseCacheConfig> = None;
70
71 progress.start_setup();
72 let facet_rules = build_facet_rules(&cli.facet)?;
73
74 let ignore_author_patterns = compile_regex_patterns("--ignore-author", &cli.ignore_author)?;
75 let ignore_copyright_holder_patterns =
76 compile_regex_patterns("--ignore-copyright-holder", &cli.ignore_copyright_holder)?;
77 progress.finish_setup();
78
79 progress.start_discovery();
80
81 let mut shared_cache_config = if cli.from_json {
82 let cache_config = prepare_cache_config(None, &cli)?;
83 shared_license_cache_config = Some(build_license_cache_config(&cache_config, &cli));
84 Some(cache_config)
85 } else {
86 None
87 };
88
89 let (
90 mut scan_result,
91 total_dirs,
92 mut preloaded_assembly,
93 preloaded_license_detections,
94 preloaded_license_references,
95 preloaded_license_rule_references,
96 preloaded_extra_errors,
97 imported_spdx_license_list_version,
98 imported_license_index_provenance,
99 mut active_license_engine,
100 ) = if cli.from_json {
101 let loaded = load_and_merge_json_inputs(&cli.dir_path, cli.strip_root, cli.full_root)?;
102 let directories_count = loaded.directory_count();
103 let files_count = loaded.file_count();
104 let size_count = loaded.file_size_count();
105 progress.finish_discovery(
106 files_count,
107 directories_count,
108 size_count,
109 loaded.excluded_count,
110 );
111 let (
112 process_result,
113 assembly_result,
114 license_detections,
115 license_references,
116 license_rule_references,
117 extra_errors,
118 imported_spdx_license_list_version,
119 imported_license_index_provenance,
120 ) = loaded.into_parts()?;
121 (
122 process_result,
123 directories_count,
124 assembly_result,
125 license_detections,
126 license_references,
127 license_rule_references,
128 extra_errors,
129 imported_spdx_license_list_version,
130 imported_license_index_provenance,
131 None,
132 )
133 } else {
134 let (scan_path, native_input_includes) = resolve_native_scan_inputs(&cli.dir_path)?;
135 let mut native_include_patterns = cli.include.clone();
136 native_include_patterns.extend(native_input_includes);
137
138 let cache_config = prepare_cache_config(Some(Path::new(&scan_path)), &cli)?;
139 shared_license_cache_config = Some(build_license_cache_config(&cache_config, &cli));
140 shared_cache_config = Some(cache_config.clone());
141 let collection_exclude_patterns =
142 build_collection_exclude_patterns(Path::new(&scan_path), cache_config.root_dir());
143
144 let mut collected = collect_paths(&scan_path, cli.max_depth, &collection_exclude_patterns);
145 let user_excluded_count = apply_user_path_filters_to_collected(
146 &mut collected,
147 Path::new(&scan_path),
148 &native_include_patterns,
149 &cli.exclude,
150 );
151 let total_files = collected.file_count();
152 let total_dirs = collected.directory_count();
153 let total_size = collected.total_file_bytes;
154 let excluded_count = collected.excluded_count + user_excluded_count;
155 let all_collected_files = collected.files.clone();
156 let ordered_file_paths: Vec<PathBuf> = collected
157 .files
158 .iter()
159 .map(|(path, _)| path.clone())
160 .collect();
161 let runtime_errors = collected
162 .collection_errors
163 .iter()
164 .map(|(path, err)| format_default_scan_error(path, err))
165 .collect();
166 for (path, err) in &collected.collection_errors {
167 progress.record_runtime_error(path, err);
168 }
169 progress.finish_discovery(total_files, total_dirs, total_size, excluded_count);
170 if !cli.quiet {
171 progress.output_written(&format!(
172 "Found {} files in {} directories ({} items excluded)",
173 total_files, total_dirs, excluded_count
174 ));
175 }
176
177 let license_engine = if cli.license {
178 progress.start_setup();
179 progress.start_license_detection_engine_creation();
180 let engine = init_license_engine(
181 shared_cache_config
182 .as_ref()
183 .expect("cache config should be prepared before license engine init"),
184 &cli,
185 )?;
186 progress.finish_license_detection_engine_creation("setup_scan:licenses");
187 progress.finish_setup();
188 progress.output_written(&describe_license_engine_source(
189 &engine,
190 cli.license_dataset_path.as_deref(),
191 ));
192 Some(engine)
193 } else {
194 None
195 };
196
197 let enable_application_packages = cli.package || cli.package_only;
198 let enable_system_packages = cli.system_package || cli.package_only;
199 let enable_packages =
200 enable_application_packages || enable_system_packages || cli.package_in_compiled;
201 let (detect_copyrights, detect_emails, detect_urls, detect_generated) = if cli.package_only
202 {
203 (false, cli.email, cli.url, cli.generated)
204 } else {
205 (cli.copyright, cli.email, cli.url, cli.generated)
206 };
207 let process_mode = cli.processes;
208
209 let text_options = TextDetectionOptions {
210 collect_info: cli.info,
211 detect_packages: enable_packages,
212 detect_application_packages: enable_application_packages,
213 detect_system_packages: enable_system_packages,
214 detect_packages_in_compiled: cli.package_in_compiled,
215 detect_copyrights,
216 detect_generated,
217 detect_emails,
218 detect_urls,
219 max_emails: cli.max_email,
220 max_urls: cli.max_url,
221 timeout_seconds: effective_timeout_seconds(process_mode, cli.timeout),
222 };
223
224 let license_options = LicenseScanOptions {
225 include_text: cli.license_text,
226 include_text_diagnostics: cli.license_text_diagnostics,
227 include_diagnostics: cli.license_diagnostics,
228 unknown_licenses: cli.unknown_licenses,
229 min_score: cli.license_score,
230 };
231 let options_fingerprint =
232 scan_options_fingerprint(&text_options, license_options, license_engine.as_deref());
233
234 if cli.incremental {
235 let manifest_path = incremental_manifest_path(
236 cache_config.root_dir(),
237 &incremental_manifest_key(Path::new(&scan_path), &options_fingerprint),
238 );
239 let previous_manifest =
240 load_incremental_manifest(&manifest_path, &options_fingerprint)?;
241 let reused_files = partition_incremental_files(
242 &mut collected.files,
243 Path::new(&scan_path),
244 previous_manifest.as_ref(),
245 );
246 progress.record_incremental_reused(reused_files.len());
247 }
248
249 if let Some(message) = process_mode_message(process_mode) {
250 progress.output_written(message);
251 }
252 progress.start_scan(collected.file_count());
253 let mut result = match process_mode {
254 ProcessMode::Parallel(thread_count) => run_with_thread_pool(thread_count, || {
255 Ok(process_collected_with_memory_limit(
256 &collected,
257 Arc::clone(&progress),
258 license_engine.clone(),
259 license_options,
260 &text_options,
261 cli.max_in_memory,
262 ))
263 })?,
264 ProcessMode::SequentialWithTimeouts | ProcessMode::SequentialWithoutTimeouts => {
265 process_collected_with_memory_limit_sequential(
266 &collected,
267 Arc::clone(&progress),
268 license_engine.clone(),
269 license_options,
270 &text_options,
271 cli.max_in_memory,
272 )
273 }
274 };
275
276 if cli.incremental {
277 let manifest_path = incremental_manifest_path(
278 cache_config.root_dir(),
279 &incremental_manifest_key(Path::new(&scan_path), &options_fingerprint),
280 );
281 let reused_files = partition_incremental_files(
282 &mut all_collected_files.clone(),
283 Path::new(&scan_path),
284 load_incremental_manifest(&manifest_path, &options_fingerprint)?.as_ref(),
285 );
286 result.files =
287 merge_incremental_file_results(result.files, reused_files, &ordered_file_paths);
288
289 let manifest = build_incremental_manifest(
290 Path::new(&scan_path),
291 &all_collected_files,
292 &result.files,
293 &options_fingerprint,
294 );
295 write_incremental_manifest(cache_config.root_dir(), &manifest_path, &manifest)?;
296 }
297
298 result.excluded_count = excluded_count;
299 progress.finish_scan();
300
301 (
302 result,
303 total_dirs,
304 assembly::AssemblyResult {
305 packages: Vec::new(),
306 dependencies: Vec::new(),
307 },
308 Vec::new(),
309 Vec::new(),
310 Vec::new(),
311 runtime_errors,
312 None,
313 None,
314 license_engine,
315 )
316 };
317
318 progress.start_post_scan();
319
320 if cli.filter_clues {
321 progress.post_scan_step("Filtering redundant clues...");
322 let clue_rule_lookup = record_detail_timing(&progress, "post-scan:filter-clues", || {
323 prepare_filter_clue_rule_lookup(
324 &scan_result.files,
325 active_license_engine.as_deref(),
326 cli.license_dataset_path.as_deref(),
327 shared_license_cache_config.as_ref(),
328 )
329 })?;
330 if let Some(clue_rule_lookup) = clue_rule_lookup.as_ref() {
331 filter_redundant_clues_with_rules(&mut scan_result.files, Some(clue_rule_lookup));
332 } else {
333 filter_redundant_clues(&mut scan_result.files);
334 }
335 }
336
337 if !ignore_author_patterns.is_empty() || !ignore_copyright_holder_patterns.is_empty() {
338 progress.post_scan_step("Applying ignore-resource filters...");
339 record_detail_timing(&progress, "post-scan:ignore-resource", || {
340 apply_ignore_resource_filter(
341 &mut scan_result.files,
342 &ignore_copyright_holder_patterns,
343 &ignore_author_patterns,
344 );
345 });
346 }
347
348 if cli.from_json && (!cli.include.is_empty() || !cli.exclude.is_empty()) {
349 progress.post_scan_step("Applying path selection filters...");
350 record_detail_timing(&progress, "output-filter:path-selection", || {
351 apply_cli_path_selection_filter(&mut scan_result.files, &cli.include, &cli.exclude);
352 });
353 }
354
355 if cli.only_findings {
356 progress.post_scan_step("Filtering to files with findings...");
357 record_detail_timing(&progress, "output-filter:only-findings", || {
358 apply_only_findings_for_mode(&mut scan_result.files, cli.from_json);
359 });
360 }
361
362 if cli.info && cli.mark_source {
363 progress.post_scan_step("Marking source files...");
364 record_detail_timing(&progress, "post-scan:mark-source", || {
365 apply_mark_source(&mut scan_result.files);
366 });
367 }
368
369 if should_include_info_surface(&scan_result.files, &cli) {
370 progress.post_scan_step("Populating info resource counts...");
371 record_detail_timing(&progress, "post-scan:info-resource-counts", || {
372 populate_info_resource_counts(&mut scan_result.files);
373 });
374 }
375
376 progress.post_scan_step("Backfilling license provenance...");
377 record_detail_timing(&progress, "post-scan:license-provenance", || {
378 for file in &mut scan_result.files {
379 file.backfill_license_provenance();
380 }
381 });
382
383 if cli.from_json {
384 for err in &preloaded_extra_errors {
385 progress.record_additional_error(err);
386 }
387 }
388
389 let mut extra_errors = preloaded_extra_errors;
390 if let Some(policy_path) = cli.license_policy.as_deref() {
391 progress.post_scan_step("Applying license policy...");
392 let license_policy_errors =
393 record_detail_timing(&progress, "post-scan:license-policy", || {
394 apply_license_policy_from_file(&mut scan_result.files, Path::new(policy_path))
395 })?;
396 for err in &license_policy_errors {
397 progress.record_additional_error(err);
398 }
399 extra_errors.extend(license_policy_errors);
400 }
401
402 if cli.from_json {
403 progress.post_scan_step("Trimming preloaded assembly to filtered files...");
404 record_detail_timing(&progress, "post-scan:trim-preloaded-assembly", || {
405 trim_preloaded_assembly_to_files(
406 &scan_result.files,
407 &mut preloaded_assembly.packages,
408 &mut preloaded_assembly.dependencies,
409 );
410 });
411 }
412
413 progress.finish_post_scan();
414
415 let manifests_seen = scan_result
416 .files
417 .iter()
418 .map(|file| file.package_data.len())
419 .sum();
420 let skip_assembly = cli.no_assemble || cli.package_only;
421
422 let mut assembly_result = if skip_assembly {
423 assembly::AssemblyResult {
424 packages: Vec::new(),
425 dependencies: Vec::new(),
426 }
427 } else {
428 progress.start_assembly();
429
430 let mut result = if cli.from_json
431 && (!preloaded_assembly.packages.is_empty()
432 || !preloaded_assembly.dependencies.is_empty())
433 {
434 progress.assembly_step("Using preloaded assembly...");
435 preloaded_assembly
436 } else {
437 assembly::assemble(&mut scan_result.files)
438 };
439
440 progress.assembly_step("Backfilling package license provenance...");
441 record_detail_timing(&progress, "assembly:package-license-provenance", || {
442 for package in &mut result.packages {
443 package.backfill_license_provenance();
444 }
445 });
446
447 progress.assembly_step("Applying package reference following...");
448 record_detail_timing(&progress, "assembly:package-reference-following", || {
449 apply_package_reference_following(&mut scan_result.files, &mut result.packages);
450 });
451
452 progress.finish_assembly(result.packages.len(), manifests_seen);
453 result
454 };
455
456 progress.start_finalize();
457
458 if !cli.from_json && (cli.strip_root || cli.full_root) {
459 let root_path = cli
460 .dir_path
461 .first()
462 .ok_or_else(|| anyhow!("No input path available for path normalization"))?;
463 progress.finalize_step("Normalizing paths...");
464 record_detail_timing(&progress, "finalize:path-normalization", || {
465 normalize_paths(
466 &mut scan_result.files,
467 root_path,
468 cli.strip_root,
469 cli.full_root,
470 );
471 normalize_top_level_output_paths(
472 &mut assembly_result.packages,
473 &mut assembly_result.dependencies,
474 root_path,
475 cli.strip_root,
476 );
477 });
478 }
479
480 progress.finalize_step("Collecting license detections...");
481 let license_detections = record_detail_timing(&progress, "finalize:license-detections", || {
482 let preserve_preloaded_top_level_detections = cli.from_json
483 && (cli.only_findings || !cli.include.is_empty() || !cli.exclude.is_empty());
484 collect_top_level_license_detections_for_mode(
485 &scan_result.files,
486 preloaded_license_detections,
487 preserve_preloaded_top_level_detections,
488 cli.from_json && cli.dir_path.len() > 1,
489 )
490 });
491
492 let should_recompute_license_references = cli.from_json
493 && (!preloaded_license_references.is_empty()
494 || !preloaded_license_rule_references.is_empty()
495 || cli.license_references
496 || (cli.license_url_template != DEFAULT_LICENSEDB_URL_TEMPLATE
497 && !preloaded_license_references.is_empty()));
498
499 if should_recompute_license_references && active_license_engine.is_none() {
500 progress.start_license_detection_engine_creation();
501 active_license_engine = Some(init_license_engine(
502 shared_cache_config
503 .as_ref()
504 .expect("cache config should be prepared before license engine init"),
505 &cli,
506 )?);
507 progress.finish_license_detection_engine_creation("finalize:license-engine-creation");
508 }
509
510 progress.finalize_step("Collecting license references...");
511 let (license_references, license_rule_references) =
512 record_detail_timing(&progress, "finalize:license-references", || {
513 if cli.from_json && !should_recompute_license_references {
514 (
515 preloaded_license_references,
516 preloaded_license_rule_references,
517 )
518 } else if cli.license_references || should_recompute_license_references {
519 if let Some(engine) = active_license_engine.as_deref() {
520 collect_top_level_license_references(
521 &scan_result.files,
522 &assembly_result.packages,
523 engine.index(),
524 &cli.license_url_template,
525 )
526 } else {
527 (Vec::new(), Vec::new())
528 }
529 } else {
530 (Vec::new(), Vec::new())
531 }
532 });
533
534 let end_time = Utc::now();
535 let spdx_license_list_version = active_license_engine
536 .as_ref()
537 .and_then(|engine| engine.spdx_license_list_version().map(ToOwned::to_owned))
538 .or(imported_spdx_license_list_version)
539 .unwrap_or(LicenseDetectionEngine::embedded_spdx_license_list_version()?);
540 let license_index_provenance = active_license_engine
541 .as_ref()
542 .and_then(|engine| engine.license_index_provenance().cloned())
543 .or(imported_license_index_provenance);
544
545 progress.finalize_step("Preparing output...");
546 let output = record_detail_timing(&progress, "finalize:output-prepare", || {
547 create_output(
548 start_time,
549 end_time,
550 scan_result,
551 CreateOutputContext {
552 total_dirs,
553 assembly_result,
554 license_detections,
555 license_references,
556 license_rule_references,
557 spdx_license_list_version,
558 license_index_provenance,
559 extra_errors,
560 extra_warnings: Vec::new(),
561 header_options: cli.output_header_options(),
562 options: CreateOutputOptions {
563 facet_rules: &facet_rules,
564 include_classify: cli.classify,
565 include_summary: cli.summary,
566 include_license_clarity_score: cli.license_clarity_score,
567 include_tallies: cli.tallies,
568 include_tallies_of_key_files: cli.tallies_key_files,
569 include_tallies_with_details: cli.tallies_with_details,
570 include_tallies_by_facet: cli.tallies_by_facet,
571 include_generated: cli.generated,
572 verbose: cli.verbose,
573 },
574 },
575 )
576 });
577 progress.finish_finalize();
578
579 let output_schema_output = crate::output_schema::Output::from(&output);
580 progress.start_output();
581 for target in cli.output_targets() {
582 let output_config = OutputWriteConfig {
583 format: target.format,
584 custom_template: target.custom_template.clone(),
585 scanned_path: if cli.dir_path.len() == 1 {
586 cli.dir_path.first().cloned()
587 } else {
588 None
589 },
590 };
591
592 let timing_name = format!("output:{:?}", target.format).to_lowercase();
593 record_detail_timing(&progress, timing_name, || {
594 write_output_file(&target.file, &output_schema_output, &output_config)
595 })?;
596 progress.output_written(&format!(
597 "{:?} output written to {}",
598 target.format, target.file
599 ));
600 }
601 progress.record_final_counts(&output.files);
602 progress.record_final_header_counts(&output.headers);
603 progress.finish_output();
604
605 let summary_end = Utc::now();
606 progress.display_summary(
607 &format_scancode_timestamp(&start_time),
608 &format_scancode_timestamp(&summary_end),
609 );
610
611 Ok(())
612}
613
614fn apply_only_findings_for_mode(files: &mut Vec<FileInfo>, from_json: bool) {
615 if from_json {
616 files.clear();
617 } else {
618 apply_only_findings_filter(files);
619 }
620}
621
622fn collect_top_level_license_detections_for_mode(
623 files: &[FileInfo],
624 preloaded: Vec<crate::models::TopLevelLicenseDetection>,
625 preserve_preloaded: bool,
626 clear_for_multi_input_replay: bool,
627) -> Vec<crate::models::TopLevelLicenseDetection> {
628 if clear_for_multi_input_replay {
629 Vec::new()
630 } else if preserve_preloaded {
631 preloaded
632 } else {
633 collect_top_level_license_detections(files)
634 }
635}
636
/// References the golden-test helper functions so they count as used when the
/// `golden-tests` feature is enabled. Nothing is called; the function items are only named.
#[cfg(feature = "golden-tests")]
fn touch_license_golden_symbols() {
    let _ = (
        crate::license_detection::golden_utils::read_golden_input_content,
        crate::license_detection::golden_utils::detect_matches_for_golden,
        crate::license_detection::golden_utils::detect_license_expressions_for_golden,
        crate::license_detection::LicenseDetectionEngine::detect_matches_with_kind,
    );
}
644
645fn validate_scan_option_compatibility(cli: &Cli) -> Result<()> {
646 if cli.show_attribution {
647 return Ok(());
648 }
649
650 if cli.export_license_dataset.is_some() {
651 if !cli.dir_path.is_empty() {
652 return Err(anyhow!(
653 "--export-license-dataset does not accept scan input paths"
654 ));
655 }
656
657 if cli.from_json
658 || cli.license
659 || cli.package
660 || cli.system_package
661 || cli.package_in_compiled
662 || cli.package_only
663 || cli.copyright
664 || cli.email
665 || cli.url
666 || cli.generated
667 || cli.info
668 || cli.incremental
669 || cli.reindex
670 || cli.no_license_index_cache
671 || cli.license_dataset_path.is_some()
672 {
673 return Err(anyhow!(
674 "--export-license-dataset is a standalone mode and cannot be combined with scan or license-index flags"
675 ));
676 }
677
678 return Ok(());
679 }
680
681 if cli.from_json
682 && (cli.package
683 || cli.system_package
684 || cli.package_in_compiled
685 || cli.package_only
686 || cli.copyright
687 || cli.email
688 || cli.url
689 || cli.generated)
690 {
691 return Err(anyhow!(
692 "When using --from-json, file scan options like --package/--copyright/--email/--url/--generated are not allowed"
693 ));
694 }
695
696 if cli.from_json && cli.incremental {
697 return Err(anyhow!(
698 "--incremental is only supported for directory scan mode, not --from-json"
699 ));
700 }
701
702 if !cli.from_json && cli.dir_path.is_empty() {
703 return Err(anyhow!("Directory path is required for scan operations"));
704 }
705
706 if cli.tallies_by_facet && cli.facet.is_empty() {
707 return Err(anyhow!(
708 "--tallies-by-facet requires at least one --facet <facet>=<pattern> definition"
709 ));
710 }
711
712 if cli.mark_source && !cli.info {
713 return Err(anyhow!("--mark-source requires --info"));
714 }
715
716 Ok(())
717}
718
719fn prepare_cache_config(scan_root: Option<&Path>, cli: &Cli) -> Result<CacheConfig> {
720 let env_cache_dir = env::var_os(CACHE_DIR_ENV_VAR).map(PathBuf::from);
721 let config = CacheConfig::from_overrides(
722 scan_root,
723 cli.cache_dir.as_deref().map(Path::new),
724 env_cache_dir.as_deref(),
725 cli.incremental,
726 );
727
728 if cli.cache_clear {
729 crate::cache::locking::with_exclusive_cache_lock(config.root_dir(), || {
730 config.clear_contents()
731 })?;
732 }
733
734 if config.incremental_enabled() {
735 config.ensure_dirs()?;
736 }
737
738 Ok(config)
739}
740
741fn build_license_cache_config(cache_root: &CacheConfig, cli: &Cli) -> LicenseCacheConfig {
742 LicenseCacheConfig::new(
743 cache_root.root_dir().to_path_buf(),
744 cli.reindex,
745 !cli.no_license_index_cache,
746 )
747}
748
749fn partition_incremental_files(
750 collected_files: &mut Vec<(PathBuf, fs::Metadata)>,
751 scan_root: &Path,
752 manifest: Option<&IncrementalManifest>,
753) -> Vec<FileInfo> {
754 let Some(manifest) = manifest else {
755 return Vec::new();
756 };
757
758 let mut files_to_scan = Vec::new();
759 let mut reused_files = Vec::new();
760
761 for (path, metadata) in collected_files.drain(..) {
762 let relative_path = normalize_relative_scan_path(&path, scan_root);
763 let Some(entry) = manifest.entry(&relative_path) else {
764 files_to_scan.push((path, metadata));
765 continue;
766 };
767
768 match manifest_entry_matches_path(entry, &path, &metadata) {
769 Ok(true) => reused_files.push(entry.file_info.clone()),
770 Ok(false) | Err(_) => files_to_scan.push((path, metadata)),
771 }
772 }
773
774 *collected_files = files_to_scan;
775 reused_files
776}
777
778fn merge_incremental_file_results(
779 processed_files: Vec<FileInfo>,
780 reused_files: Vec<FileInfo>,
781 ordered_file_paths: &[PathBuf],
782) -> Vec<FileInfo> {
783 let mut processed_file_entries = HashMap::new();
784 let mut directory_entries = Vec::new();
785 for file in processed_files {
786 if file.file_type == FileType::File {
787 processed_file_entries.insert(file.path.clone(), file);
788 } else {
789 directory_entries.push(file);
790 }
791 }
792
793 let mut reused_file_entries: HashMap<_, _> = reused_files
794 .into_iter()
795 .map(|file| (file.path.clone(), file))
796 .collect();
797
798 let mut merged_files = Vec::new();
799 for path in ordered_file_paths {
800 let path_string = path.to_string_lossy().to_string();
801 if let Some(file) = processed_file_entries.remove(&path_string) {
802 merged_files.push(file);
803 continue;
804 }
805
806 if let Some(file) = reused_file_entries.remove(&path_string) {
807 merged_files.push(file);
808 }
809 }
810
811 merged_files.extend(processed_file_entries.into_values());
812 merged_files.extend(reused_file_entries.into_values());
813 merged_files.extend(directory_entries);
814 merged_files
815}
816
817fn build_incremental_manifest(
818 scan_root: &Path,
819 collected_files: &[(PathBuf, fs::Metadata)],
820 files: &[FileInfo],
821 options_fingerprint: &str,
822) -> IncrementalManifest {
823 let files_by_relative_path: HashMap<_, _> = files
824 .iter()
825 .filter(|file| file.file_type == FileType::File)
826 .map(|file| {
827 (
828 normalize_relative_scan_path(Path::new(&file.path), scan_root),
829 file.clone(),
830 )
831 })
832 .collect();
833
834 let entries = collected_files
835 .iter()
836 .filter_map(|(path, metadata)| {
837 let relative_path = normalize_relative_scan_path(path, scan_root);
838 let state = metadata_fingerprint(metadata)?;
839 let file_info = files_by_relative_path.get(&relative_path)?.clone();
840 let content_sha256 = file_info.sha256.unwrap_or_else(|| {
841 fs::read(path)
842 .map(|bytes| calculate_sha256(&bytes))
843 .unwrap_or_else(|_| {
844 Sha256Digest::from_hex(
845 "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
846 )
847 .unwrap()
848 })
849 });
850 Some((
851 relative_path,
852 IncrementalManifestEntry {
853 state,
854 content_sha256,
855 file_info,
856 },
857 ))
858 })
859 .collect::<BTreeMap<_, _>>();
860
861 IncrementalManifest::new(options_fingerprint.to_string(), entries)
862}
863
864fn incremental_manifest_key(scan_root: &Path, options_fingerprint: &str) -> String {
865 let canonical_root = fs::canonicalize(scan_root).unwrap_or_else(|_| scan_root.to_path_buf());
866 calculate_sha256(
867 format!(
868 "{}\n{options_fingerprint}",
869 canonical_root.to_string_lossy()
870 )
871 .as_bytes(),
872 )
873 .as_hex()
874}
875
/// Returns `path` relative to `scan_root` as a string with forward slashes.
///
/// When `path` does not start with `scan_root` the whole path is used instead. Every
/// backslash in the lossy string form is replaced by `/`.
fn normalize_relative_scan_path(path: &Path, scan_root: &Path) -> String {
    let relative = match path.strip_prefix(scan_root) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };

    let displayed = relative.to_string_lossy();
    displayed.replace('\\', "/")
}
882
883fn compile_regex_patterns(option_name: &str, patterns: &[String]) -> Result<Vec<Regex>> {
884 patterns
885 .iter()
886 .map(|pattern| {
887 Regex::new(pattern).map_err(|err| {
888 anyhow!("Invalid regex for {option_name} pattern \"{pattern}\": {err}")
889 })
890 })
891 .collect()
892}
893
894fn effective_timeout_seconds(process_mode: ProcessMode, timeout_seconds: f64) -> f64 {
895 match process_mode {
896 ProcessMode::SequentialWithoutTimeouts => 0.0,
897 ProcessMode::Parallel(_) | ProcessMode::SequentialWithTimeouts => timeout_seconds,
898 }
899}
900
901fn process_mode_message(process_mode: ProcessMode) -> Option<&'static str> {
902 match process_mode {
903 ProcessMode::SequentialWithTimeouts => Some("Disabling multi-processing for debugging."),
904 ProcessMode::SequentialWithoutTimeouts => {
905 Some("Disabling multi-processing and multi-threading for debugging.")
906 }
907 ProcessMode::Parallel(_) => None,
908 }
909}
910
911fn progress_mode_from_cli(cli: &Cli) -> ProgressMode {
912 if cli.quiet {
913 ProgressMode::Quiet
914 } else if cli.verbose {
915 ProgressMode::Verbose
916 } else {
917 ProgressMode::Default
918 }
919}
920
921fn configured_scan_names(cli: &Cli) -> String {
922 let mut names = Vec::new();
923 if cli.license {
924 names.push("licenses");
925 }
926 if cli.info {
927 names.push("info");
928 }
929 if cli.package {
930 names.push("packages");
931 }
932 if (cli.system_package || cli.package_in_compiled || cli.package_only)
933 && !names.contains(&"packages")
934 {
935 names.push("packages");
936 }
937 if cli.copyright {
938 names.push("copyrights");
939 }
940 if cli.email {
941 names.push("emails");
942 }
943 if cli.url {
944 names.push("urls");
945 }
946 names.join(", ")
947}
948
949fn should_include_info_surface(files: &[crate::models::FileInfo], cli: &Cli) -> bool {
950 cli.info
951 || files.iter().any(|file| {
952 file.date.is_some()
953 || file.sha1.is_some()
954 || file.md5.is_some()
955 || file.sha256.is_some()
956 || file.sha1_git.is_some()
957 || file.mime_type.is_some()
958 || file.file_type_label.is_some()
959 || file.programming_language.is_some()
960 || file.is_binary.is_some()
961 || file.is_text.is_some()
962 || file.is_archive.is_some()
963 || file.is_media.is_some()
964 || file.is_source.is_some()
965 || file.is_script.is_some()
966 || file.files_count.is_some()
967 || file.dirs_count.is_some()
968 || file.size_count.is_some()
969 })
970}
971
972fn record_detail_timing<T, F>(progress: &Arc<ScanProgress>, name: impl Into<String>, f: F) -> T
973where
974 F: FnOnce() -> T,
975{
976 let started = Instant::now();
977 let result = f();
978 progress.record_detail_timing(name.into(), started.elapsed().as_secs_f64());
979 result
980}
981
982fn run_with_thread_pool<T, F>(threads: usize, f: F) -> Result<T>
983where
984 F: FnOnce() -> Result<T> + Send,
985 T: Send,
986{
987 let pool = rayon::ThreadPoolBuilder::new()
988 .num_threads(threads.max(1))
989 .build()?;
990 pool.install(f)
991}
992
993fn init_license_engine(cache_root: &CacheConfig, cli: &Cli) -> Result<Arc<LicenseDetectionEngine>> {
994 let cache_config = build_license_cache_config(cache_root, cli);
995
996 match &cli.license_dataset_path {
997 Some(p) => {
998 let path = PathBuf::from(p);
999 if !path.exists() {
1000 return Err(anyhow!("License dataset path does not exist: {:?}", path));
1001 }
1002 let engine = LicenseDetectionEngine::from_directory_with_cache(&path, &cache_config)?;
1003 Ok(Arc::new(engine))
1004 }
1005 None => {
1006 let engine = LicenseDetectionEngine::from_embedded_with_cache(&cache_config)?;
1007 Ok(Arc::new(engine))
1008 }
1009 }
1010}
1011
1012fn describe_license_engine_source(
1013 engine: &LicenseDetectionEngine,
1014 rules_path: Option<&str>,
1015) -> String {
1016 match rules_path {
1017 Some(path) => format!(
1018 "License detection engine initialized with {} rules from custom dataset {}",
1019 engine.index().rules_by_rid.len(),
1020 path
1021 ),
1022 None => format!(
1023 "License detection engine initialized with {} rules from embedded artifact",
1024 engine.index().rules_by_rid.len()
1025 ),
1026 }
1027}
1028
1029#[cfg(test)]
1030mod tests;