Skip to main content

provenant/cli/run/
mod.rs

1use crate::assembly;
2use crate::cache::{
3    CACHE_DIR_ENV_VAR, CacheConfig, IncrementalManifest, IncrementalManifestEntry,
4    build_collection_exclude_patterns, incremental_manifest_path, load_incremental_manifest,
5    manifest_entry_matches_path, metadata_fingerprint, write_incremental_manifest,
6};
7use crate::cli::{Cli, ProcessMode};
8use crate::license_detection::LicenseDetectionEngine;
9use crate::license_detection::dataset::export_embedded_license_dataset;
10use crate::license_detection::license_cache::LicenseCacheConfig;
11use crate::models::{FileInfo, FileType, Sha256Digest};
12use crate::output::{OutputWriteConfig, write_output_file};
13use crate::post_processing::{
14    CreateOutputContext, CreateOutputOptions, DEFAULT_LICENSEDB_URL_TEMPLATE,
15    apply_license_policy_from_file, apply_package_reference_following, build_facet_rules,
16    collect_top_level_license_detections, collect_top_level_license_references, create_output,
17};
18use crate::progress::{ProgressMode, ScanProgress, format_default_scan_error};
19use crate::scan_result_shaping::{
20    apply_cli_path_selection_filter, apply_ignore_resource_filter, apply_mark_source,
21    apply_only_findings_filter, apply_user_path_filters_to_collected, filter_redundant_clues,
22    filter_redundant_clues_with_rules, load_and_merge_json_inputs, normalize_paths,
23    normalize_top_level_output_paths, populate_info_resource_counts,
24    prepare_filter_clue_rule_lookup, resolve_native_scan_inputs, trim_preloaded_assembly_to_files,
25};
26use crate::scanner::{
27    LicenseScanOptions, TextDetectionOptions, collect_paths, process_collected_with_memory_limit,
28    process_collected_with_memory_limit_sequential, scan_options_fingerprint,
29};
30use crate::time::format_scancode_timestamp;
31use crate::utils::hash::calculate_sha256;
32use anyhow::{Result, anyhow};
33use chrono::Utc;
34use clap::Parser;
35use regex::Regex;
36use std::collections::{BTreeMap, HashMap};
37use std::env;
38use std::fs;
39use std::path::{Path, PathBuf};
40use std::sync::Arc;
41use std::time::Instant;
42
43pub fn run() -> Result<()> {
44    #[cfg(feature = "golden-tests")]
45    touch_license_golden_symbols();
46
47    let cli = Cli::parse();
48
49    validate_scan_option_compatibility(&cli)?;
50
51    if cli.show_attribution {
52        print!("{}", include_str!("../../../NOTICE"));
53        return Ok(());
54    }
55
56    if let Some(export_dir) = cli.export_license_dataset.as_deref() {
57        export_embedded_license_dataset(Path::new(export_dir))?;
58        return Ok(());
59    }
60
61    let start_time = Utc::now();
62    let progress = Arc::new(ScanProgress::new(progress_mode_from_cli(&cli)));
63    progress.set_processes(cli.processes);
64    progress.set_scan_names(configured_scan_names(&cli));
65    progress.init_logging_bridge();
66    let mut shared_license_cache_config: Option<LicenseCacheConfig> = None;
67
68    progress.start_setup();
69    let facet_rules = build_facet_rules(&cli.facet)?;
70
71    let ignore_author_patterns = compile_regex_patterns("--ignore-author", &cli.ignore_author)?;
72    let ignore_copyright_holder_patterns =
73        compile_regex_patterns("--ignore-copyright-holder", &cli.ignore_copyright_holder)?;
74    progress.finish_setup();
75
76    progress.start_discovery();
77
78    let mut shared_cache_config = if cli.from_json {
79        let cache_config = prepare_cache_config(None, &cli)?;
80        shared_license_cache_config = Some(build_license_cache_config(&cache_config, &cli));
81        Some(cache_config)
82    } else {
83        None
84    };
85
86    let (
87        mut scan_result,
88        total_dirs,
89        mut preloaded_assembly,
90        preloaded_license_detections,
91        preloaded_license_references,
92        preloaded_license_rule_references,
93        preloaded_extra_errors,
94        imported_spdx_license_list_version,
95        imported_license_index_provenance,
96        mut active_license_engine,
97    ) = if cli.from_json {
98        let loaded = load_and_merge_json_inputs(&cli.dir_path, cli.strip_root, cli.full_root)?;
99        let directories_count = loaded.directory_count();
100        let files_count = loaded.file_count();
101        let size_count = loaded.file_size_count();
102        progress.finish_discovery(
103            files_count,
104            directories_count,
105            size_count,
106            loaded.excluded_count,
107        );
108        let (
109            process_result,
110            assembly_result,
111            license_detections,
112            license_references,
113            license_rule_references,
114            extra_errors,
115            imported_spdx_license_list_version,
116            imported_license_index_provenance,
117        ) = loaded.into_parts()?;
118        (
119            process_result,
120            directories_count,
121            assembly_result,
122            license_detections,
123            license_references,
124            license_rule_references,
125            extra_errors,
126            imported_spdx_license_list_version,
127            imported_license_index_provenance,
128            None,
129        )
130    } else {
131        let (scan_path, native_input_includes) = resolve_native_scan_inputs(&cli.dir_path)?;
132        let mut native_include_patterns = cli.include.clone();
133        native_include_patterns.extend(native_input_includes);
134
135        let cache_config = prepare_cache_config(Some(Path::new(&scan_path)), &cli)?;
136        shared_license_cache_config = Some(build_license_cache_config(&cache_config, &cli));
137        shared_cache_config = Some(cache_config.clone());
138        let collection_exclude_patterns =
139            build_collection_exclude_patterns(Path::new(&scan_path), cache_config.root_dir());
140
141        let mut collected = collect_paths(&scan_path, cli.max_depth, &collection_exclude_patterns);
142        let user_excluded_count = apply_user_path_filters_to_collected(
143            &mut collected,
144            Path::new(&scan_path),
145            &native_include_patterns,
146            &cli.exclude,
147        );
148        let total_files = collected.file_count();
149        let total_dirs = collected.directory_count();
150        let total_size = collected.total_file_bytes;
151        let excluded_count = collected.excluded_count + user_excluded_count;
152        let all_collected_files = collected.files.clone();
153        let ordered_file_paths: Vec<PathBuf> = collected
154            .files
155            .iter()
156            .map(|(path, _)| path.clone())
157            .collect();
158        let runtime_errors = collected
159            .collection_errors
160            .iter()
161            .map(|(path, err)| format_default_scan_error(path, err))
162            .collect();
163        for (path, err) in &collected.collection_errors {
164            progress.record_runtime_error(path, err);
165        }
166        progress.finish_discovery(total_files, total_dirs, total_size, excluded_count);
167        if !cli.quiet {
168            progress.output_written(&format!(
169                "Found {} files in {} directories ({} items excluded)",
170                total_files, total_dirs, excluded_count
171            ));
172        }
173
174        let license_engine = if cli.license {
175            progress.start_setup();
176            progress.start_license_detection_engine_creation();
177            let engine = init_license_engine(
178                shared_cache_config
179                    .as_ref()
180                    .expect("cache config should be prepared before license engine init"),
181                &cli,
182            )?;
183            progress.finish_license_detection_engine_creation("setup_scan:licenses");
184            progress.finish_setup();
185            progress.output_written(&describe_license_engine_source(
186                &engine,
187                cli.license_dataset_path.as_deref(),
188            ));
189            Some(engine)
190        } else {
191            None
192        };
193
194        let enable_application_packages = cli.package || cli.package_only;
195        let enable_system_packages = cli.system_package || cli.package_only;
196        let enable_packages =
197            enable_application_packages || enable_system_packages || cli.package_in_compiled;
198        let (detect_copyrights, detect_emails, detect_urls, detect_generated) = if cli.package_only
199        {
200            (false, cli.email, cli.url, cli.generated)
201        } else {
202            (cli.copyright, cli.email, cli.url, cli.generated)
203        };
204        let process_mode = cli.processes;
205
206        let text_options = TextDetectionOptions {
207            collect_info: cli.info,
208            detect_packages: enable_packages,
209            detect_application_packages: enable_application_packages,
210            detect_system_packages: enable_system_packages,
211            detect_packages_in_compiled: cli.package_in_compiled,
212            detect_copyrights,
213            detect_generated,
214            detect_emails,
215            detect_urls,
216            max_emails: cli.max_email,
217            max_urls: cli.max_url,
218            timeout_seconds: effective_timeout_seconds(process_mode, cli.timeout),
219        };
220
221        let license_options = LicenseScanOptions {
222            include_text: cli.license_text,
223            include_text_diagnostics: cli.license_text_diagnostics,
224            include_diagnostics: cli.license_diagnostics,
225            unknown_licenses: cli.unknown_licenses,
226            min_score: cli.license_score,
227        };
228        let options_fingerprint =
229            scan_options_fingerprint(&text_options, license_options, license_engine.as_deref());
230
231        if cli.incremental {
232            let manifest_path = incremental_manifest_path(
233                cache_config.root_dir(),
234                &incremental_manifest_key(Path::new(&scan_path), &options_fingerprint),
235            );
236            let previous_manifest =
237                load_incremental_manifest(&manifest_path, &options_fingerprint)?;
238            let reused_files = partition_incremental_files(
239                &mut collected.files,
240                Path::new(&scan_path),
241                previous_manifest.as_ref(),
242            );
243            progress.record_incremental_reused(reused_files.len());
244        }
245
246        if let Some(message) = process_mode_message(process_mode) {
247            progress.output_written(message);
248        }
249        progress.start_scan(collected.file_count());
250        let mut result = match process_mode {
251            ProcessMode::Parallel(thread_count) => run_with_thread_pool(thread_count, || {
252                Ok(process_collected_with_memory_limit(
253                    &collected,
254                    Arc::clone(&progress),
255                    license_engine.clone(),
256                    license_options,
257                    &text_options,
258                    cli.max_in_memory,
259                ))
260            })?,
261            ProcessMode::SequentialWithTimeouts | ProcessMode::SequentialWithoutTimeouts => {
262                process_collected_with_memory_limit_sequential(
263                    &collected,
264                    Arc::clone(&progress),
265                    license_engine.clone(),
266                    license_options,
267                    &text_options,
268                    cli.max_in_memory,
269                )
270            }
271        };
272
273        if cli.incremental {
274            let manifest_path = incremental_manifest_path(
275                cache_config.root_dir(),
276                &incremental_manifest_key(Path::new(&scan_path), &options_fingerprint),
277            );
278            let reused_files = partition_incremental_files(
279                &mut all_collected_files.clone(),
280                Path::new(&scan_path),
281                load_incremental_manifest(&manifest_path, &options_fingerprint)?.as_ref(),
282            );
283            result.files =
284                merge_incremental_file_results(result.files, reused_files, &ordered_file_paths);
285
286            let manifest = build_incremental_manifest(
287                Path::new(&scan_path),
288                &all_collected_files,
289                &result.files,
290                &options_fingerprint,
291            );
292            write_incremental_manifest(cache_config.root_dir(), &manifest_path, &manifest)?;
293        }
294
295        result.excluded_count = excluded_count;
296        progress.finish_scan();
297
298        (
299            result,
300            total_dirs,
301            assembly::AssemblyResult {
302                packages: Vec::new(),
303                dependencies: Vec::new(),
304            },
305            Vec::new(),
306            Vec::new(),
307            Vec::new(),
308            runtime_errors,
309            None,
310            None,
311            license_engine,
312        )
313    };
314
315    progress.start_post_scan();
316
317    if cli.filter_clues {
318        progress.post_scan_step("Filtering redundant clues...");
319        let clue_rule_lookup = record_detail_timing(&progress, "post-scan:filter-clues", || {
320            prepare_filter_clue_rule_lookup(
321                &scan_result.files,
322                active_license_engine.as_deref(),
323                cli.license_dataset_path.as_deref(),
324                shared_license_cache_config.as_ref(),
325            )
326        })?;
327        if let Some(clue_rule_lookup) = clue_rule_lookup.as_ref() {
328            filter_redundant_clues_with_rules(&mut scan_result.files, Some(clue_rule_lookup));
329        } else {
330            filter_redundant_clues(&mut scan_result.files);
331        }
332    }
333
334    if !ignore_author_patterns.is_empty() || !ignore_copyright_holder_patterns.is_empty() {
335        progress.post_scan_step("Applying ignore-resource filters...");
336        record_detail_timing(&progress, "post-scan:ignore-resource", || {
337            apply_ignore_resource_filter(
338                &mut scan_result.files,
339                &ignore_copyright_holder_patterns,
340                &ignore_author_patterns,
341            );
342        });
343    }
344
345    if cli.from_json && (!cli.include.is_empty() || !cli.exclude.is_empty()) {
346        progress.post_scan_step("Applying path selection filters...");
347        record_detail_timing(&progress, "output-filter:path-selection", || {
348            apply_cli_path_selection_filter(&mut scan_result.files, &cli.include, &cli.exclude);
349        });
350    }
351
352    if cli.only_findings {
353        progress.post_scan_step("Filtering to files with findings...");
354        record_detail_timing(&progress, "output-filter:only-findings", || {
355            apply_only_findings_for_mode(&mut scan_result.files, cli.from_json);
356        });
357    }
358
359    if cli.info && cli.mark_source {
360        progress.post_scan_step("Marking source files...");
361        record_detail_timing(&progress, "post-scan:mark-source", || {
362            apply_mark_source(&mut scan_result.files);
363        });
364    }
365
366    if should_include_info_surface(&scan_result.files, &cli) {
367        progress.post_scan_step("Populating info resource counts...");
368        record_detail_timing(&progress, "post-scan:info-resource-counts", || {
369            populate_info_resource_counts(&mut scan_result.files);
370        });
371    }
372
373    progress.post_scan_step("Backfilling license provenance...");
374    record_detail_timing(&progress, "post-scan:license-provenance", || {
375        for file in &mut scan_result.files {
376            file.backfill_license_provenance();
377        }
378    });
379
380    if cli.from_json {
381        for err in &preloaded_extra_errors {
382            progress.record_additional_error(err);
383        }
384    }
385
386    let mut extra_errors = preloaded_extra_errors;
387    if let Some(policy_path) = cli.license_policy.as_deref() {
388        progress.post_scan_step("Applying license policy...");
389        let license_policy_errors =
390            record_detail_timing(&progress, "post-scan:license-policy", || {
391                apply_license_policy_from_file(&mut scan_result.files, Path::new(policy_path))
392            })?;
393        for err in &license_policy_errors {
394            progress.record_additional_error(err);
395        }
396        extra_errors.extend(license_policy_errors);
397    }
398
399    if cli.from_json {
400        progress.post_scan_step("Trimming preloaded assembly to filtered files...");
401        record_detail_timing(&progress, "post-scan:trim-preloaded-assembly", || {
402            trim_preloaded_assembly_to_files(
403                &scan_result.files,
404                &mut preloaded_assembly.packages,
405                &mut preloaded_assembly.dependencies,
406            );
407        });
408    }
409
410    progress.finish_post_scan();
411
412    let manifests_seen = scan_result
413        .files
414        .iter()
415        .map(|file| file.package_data.len())
416        .sum();
417    let skip_assembly = cli.no_assemble || cli.package_only;
418
419    let mut assembly_result = if skip_assembly {
420        assembly::AssemblyResult {
421            packages: Vec::new(),
422            dependencies: Vec::new(),
423        }
424    } else {
425        progress.start_assembly();
426
427        let mut result = if cli.from_json
428            && (!preloaded_assembly.packages.is_empty()
429                || !preloaded_assembly.dependencies.is_empty())
430        {
431            progress.assembly_step("Using preloaded assembly...");
432            preloaded_assembly
433        } else {
434            assembly::assemble(&mut scan_result.files)
435        };
436
437        progress.assembly_step("Backfilling package license provenance...");
438        record_detail_timing(&progress, "assembly:package-license-provenance", || {
439            for package in &mut result.packages {
440                package.backfill_license_provenance();
441            }
442        });
443
444        progress.assembly_step("Applying package reference following...");
445        record_detail_timing(&progress, "assembly:package-reference-following", || {
446            apply_package_reference_following(&mut scan_result.files, &mut result.packages);
447        });
448
449        progress.finish_assembly(result.packages.len(), manifests_seen);
450        result
451    };
452
453    progress.start_finalize();
454
455    if !cli.from_json && (cli.strip_root || cli.full_root) {
456        let root_path = cli
457            .dir_path
458            .first()
459            .ok_or_else(|| anyhow!("No input path available for path normalization"))?;
460        progress.finalize_step("Normalizing paths...");
461        record_detail_timing(&progress, "finalize:path-normalization", || {
462            normalize_paths(
463                &mut scan_result.files,
464                root_path,
465                cli.strip_root,
466                cli.full_root,
467            );
468            normalize_top_level_output_paths(
469                &mut assembly_result.packages,
470                &mut assembly_result.dependencies,
471                root_path,
472                cli.strip_root,
473            );
474        });
475    }
476
477    progress.finalize_step("Collecting license detections...");
478    let license_detections = record_detail_timing(&progress, "finalize:license-detections", || {
479        let preserve_preloaded_top_level_detections = cli.from_json
480            && (cli.only_findings || !cli.include.is_empty() || !cli.exclude.is_empty());
481        collect_top_level_license_detections_for_mode(
482            &scan_result.files,
483            preloaded_license_detections,
484            preserve_preloaded_top_level_detections,
485            cli.from_json && cli.dir_path.len() > 1,
486        )
487    });
488
489    let should_recompute_license_references = cli.from_json
490        && (!preloaded_license_references.is_empty()
491            || !preloaded_license_rule_references.is_empty()
492            || cli.license_references
493            || (cli.license_url_template != DEFAULT_LICENSEDB_URL_TEMPLATE
494                && !preloaded_license_references.is_empty()));
495
496    if should_recompute_license_references && active_license_engine.is_none() {
497        progress.start_license_detection_engine_creation();
498        active_license_engine = Some(init_license_engine(
499            shared_cache_config
500                .as_ref()
501                .expect("cache config should be prepared before license engine init"),
502            &cli,
503        )?);
504        progress.finish_license_detection_engine_creation("finalize:license-engine-creation");
505    }
506
507    progress.finalize_step("Collecting license references...");
508    let (license_references, license_rule_references) =
509        record_detail_timing(&progress, "finalize:license-references", || {
510            if cli.from_json && !should_recompute_license_references {
511                (
512                    preloaded_license_references,
513                    preloaded_license_rule_references,
514                )
515            } else if cli.license_references || should_recompute_license_references {
516                if let Some(engine) = active_license_engine.as_deref() {
517                    collect_top_level_license_references(
518                        &scan_result.files,
519                        &assembly_result.packages,
520                        engine.index(),
521                        &cli.license_url_template,
522                    )
523                } else {
524                    (Vec::new(), Vec::new())
525                }
526            } else {
527                (Vec::new(), Vec::new())
528            }
529        });
530
531    let end_time = Utc::now();
532    let spdx_license_list_version = active_license_engine
533        .as_ref()
534        .and_then(|engine| engine.spdx_license_list_version().map(ToOwned::to_owned))
535        .or(imported_spdx_license_list_version)
536        .unwrap_or(LicenseDetectionEngine::embedded_spdx_license_list_version()?);
537    let license_index_provenance = active_license_engine
538        .as_ref()
539        .and_then(|engine| engine.license_index_provenance().cloned())
540        .or(imported_license_index_provenance);
541
542    progress.finalize_step("Preparing output...");
543    let output = record_detail_timing(&progress, "finalize:output-prepare", || {
544        create_output(
545            start_time,
546            end_time,
547            scan_result,
548            CreateOutputContext {
549                total_dirs,
550                assembly_result,
551                license_detections,
552                license_references,
553                license_rule_references,
554                spdx_license_list_version,
555                license_index_provenance,
556                extra_errors,
557                extra_warnings: Vec::new(),
558                header_options: cli.output_header_options(),
559                options: CreateOutputOptions {
560                    facet_rules: &facet_rules,
561                    include_classify: cli.classify,
562                    include_summary: cli.summary,
563                    include_license_clarity_score: cli.license_clarity_score,
564                    include_tallies: cli.tallies,
565                    include_tallies_of_key_files: cli.tallies_key_files,
566                    include_tallies_with_details: cli.tallies_with_details,
567                    include_tallies_by_facet: cli.tallies_by_facet,
568                    include_generated: cli.generated,
569                    verbose: cli.verbose,
570                },
571            },
572        )
573    });
574    progress.finish_finalize();
575
576    let output_schema_output = crate::output_schema::Output::from(&output);
577    progress.start_output();
578    for target in cli.output_targets() {
579        let output_config = OutputWriteConfig {
580            format: target.format,
581            custom_template: target.custom_template.clone(),
582            scanned_path: if cli.dir_path.len() == 1 {
583                cli.dir_path.first().cloned()
584            } else {
585                None
586            },
587        };
588
589        let timing_name = format!("output:{:?}", target.format).to_lowercase();
590        record_detail_timing(&progress, timing_name, || {
591            write_output_file(&target.file, &output_schema_output, &output_config)
592        })?;
593        progress.output_written(&format!(
594            "{:?} output written to {}",
595            target.format, target.file
596        ));
597    }
598    progress.record_final_counts(&output.files);
599    progress.finish_output();
600
601    let summary_end = Utc::now();
602    progress.display_summary(
603        &format_scancode_timestamp(&start_time),
604        &format_scancode_timestamp(&summary_end),
605    );
606
607    Ok(())
608}
609
610fn apply_only_findings_for_mode(files: &mut Vec<FileInfo>, from_json: bool) {
611    if from_json {
612        files.clear();
613    } else {
614        apply_only_findings_filter(files);
615    }
616}
617
618fn collect_top_level_license_detections_for_mode(
619    files: &[FileInfo],
620    preloaded: Vec<crate::models::TopLevelLicenseDetection>,
621    preserve_preloaded: bool,
622    clear_for_multi_input_replay: bool,
623) -> Vec<crate::models::TopLevelLicenseDetection> {
624    if clear_for_multi_input_replay {
625        Vec::new()
626    } else if preserve_preloaded {
627        preloaded
628    } else {
629        collect_top_level_license_detections(files)
630    }
631}
632
/// Names a handful of license-detection items without calling them.
///
/// Compiled only with the `golden-tests` feature; each statement merely
/// evaluates the item path and discards the value.
// NOTE(review): presumably this keeps the golden-test helpers from being
// reported as unused in this build configuration — confirm.
#[cfg(feature = "golden-tests")]
fn touch_license_golden_symbols() {
    _ = crate::license_detection::golden_utils::read_golden_input_content;
    _ = crate::license_detection::golden_utils::detect_matches_for_golden;
    _ = crate::license_detection::golden_utils::detect_license_expressions_for_golden;
    _ = crate::license_detection::LicenseDetectionEngine::detect_matches_with_kind;
}
640
641fn validate_scan_option_compatibility(cli: &Cli) -> Result<()> {
642    if cli.show_attribution {
643        return Ok(());
644    }
645
646    if cli.export_license_dataset.is_some() {
647        if !cli.dir_path.is_empty() {
648            return Err(anyhow!(
649                "--export-license-dataset does not accept scan input paths"
650            ));
651        }
652
653        if cli.from_json
654            || cli.license
655            || cli.package
656            || cli.system_package
657            || cli.package_in_compiled
658            || cli.package_only
659            || cli.copyright
660            || cli.email
661            || cli.url
662            || cli.generated
663            || cli.info
664            || cli.incremental
665            || cli.reindex
666            || cli.no_license_index_cache
667            || cli.license_dataset_path.is_some()
668        {
669            return Err(anyhow!(
670                "--export-license-dataset is a standalone mode and cannot be combined with scan or license-index flags"
671            ));
672        }
673
674        return Ok(());
675    }
676
677    if cli.from_json
678        && (cli.package
679            || cli.system_package
680            || cli.package_in_compiled
681            || cli.package_only
682            || cli.copyright
683            || cli.email
684            || cli.url
685            || cli.generated)
686    {
687        return Err(anyhow!(
688            "When using --from-json, file scan options like --package/--copyright/--email/--url/--generated are not allowed"
689        ));
690    }
691
692    if cli.from_json && cli.incremental {
693        return Err(anyhow!(
694            "--incremental is only supported for directory scan mode, not --from-json"
695        ));
696    }
697
698    if !cli.from_json && cli.dir_path.is_empty() {
699        return Err(anyhow!("Directory path is required for scan operations"));
700    }
701
702    if cli.tallies_by_facet && cli.facet.is_empty() {
703        return Err(anyhow!(
704            "--tallies-by-facet requires at least one --facet <facet>=<pattern> definition"
705        ));
706    }
707
708    if cli.mark_source && !cli.info {
709        return Err(anyhow!("--mark-source requires --info"));
710    }
711
712    Ok(())
713}
714
715fn prepare_cache_config(scan_root: Option<&Path>, cli: &Cli) -> Result<CacheConfig> {
716    let env_cache_dir = env::var_os(CACHE_DIR_ENV_VAR).map(PathBuf::from);
717    let config = CacheConfig::from_overrides(
718        scan_root,
719        cli.cache_dir.as_deref().map(Path::new),
720        env_cache_dir.as_deref(),
721        cli.incremental,
722    );
723
724    if cli.cache_clear {
725        crate::cache::locking::with_exclusive_cache_lock(config.root_dir(), || {
726            config.clear_contents()
727        })?;
728    }
729
730    if config.incremental_enabled() {
731        config.ensure_dirs()?;
732    }
733
734    Ok(config)
735}
736
737fn build_license_cache_config(cache_root: &CacheConfig, cli: &Cli) -> LicenseCacheConfig {
738    LicenseCacheConfig::new(
739        cache_root.root_dir().to_path_buf(),
740        cli.reindex,
741        !cli.no_license_index_cache,
742    )
743}
744
745fn partition_incremental_files(
746    collected_files: &mut Vec<(PathBuf, fs::Metadata)>,
747    scan_root: &Path,
748    manifest: Option<&IncrementalManifest>,
749) -> Vec<FileInfo> {
750    let Some(manifest) = manifest else {
751        return Vec::new();
752    };
753
754    let mut files_to_scan = Vec::new();
755    let mut reused_files = Vec::new();
756
757    for (path, metadata) in collected_files.drain(..) {
758        let relative_path = normalize_relative_scan_path(&path, scan_root);
759        let Some(entry) = manifest.entry(&relative_path) else {
760            files_to_scan.push((path, metadata));
761            continue;
762        };
763
764        match manifest_entry_matches_path(entry, &path, &metadata) {
765            Ok(true) => reused_files.push(entry.file_info.clone()),
766            Ok(false) | Err(_) => files_to_scan.push((path, metadata)),
767        }
768    }
769
770    *collected_files = files_to_scan;
771    reused_files
772}
773
774fn merge_incremental_file_results(
775    processed_files: Vec<FileInfo>,
776    reused_files: Vec<FileInfo>,
777    ordered_file_paths: &[PathBuf],
778) -> Vec<FileInfo> {
779    let mut processed_file_entries = HashMap::new();
780    let mut directory_entries = Vec::new();
781    for file in processed_files {
782        if file.file_type == FileType::File {
783            processed_file_entries.insert(file.path.clone(), file);
784        } else {
785            directory_entries.push(file);
786        }
787    }
788
789    let mut reused_file_entries: HashMap<_, _> = reused_files
790        .into_iter()
791        .map(|file| (file.path.clone(), file))
792        .collect();
793
794    let mut merged_files = Vec::new();
795    for path in ordered_file_paths {
796        let path_string = path.to_string_lossy().to_string();
797        if let Some(file) = processed_file_entries.remove(&path_string) {
798            merged_files.push(file);
799            continue;
800        }
801
802        if let Some(file) = reused_file_entries.remove(&path_string) {
803            merged_files.push(file);
804        }
805    }
806
807    merged_files.extend(processed_file_entries.into_values());
808    merged_files.extend(reused_file_entries.into_values());
809    merged_files.extend(directory_entries);
810    merged_files
811}
812
813fn build_incremental_manifest(
814    scan_root: &Path,
815    collected_files: &[(PathBuf, fs::Metadata)],
816    files: &[FileInfo],
817    options_fingerprint: &str,
818) -> IncrementalManifest {
819    let files_by_relative_path: HashMap<_, _> = files
820        .iter()
821        .filter(|file| file.file_type == FileType::File)
822        .map(|file| {
823            (
824                normalize_relative_scan_path(Path::new(&file.path), scan_root),
825                file.clone(),
826            )
827        })
828        .collect();
829
830    let entries = collected_files
831        .iter()
832        .filter_map(|(path, metadata)| {
833            let relative_path = normalize_relative_scan_path(path, scan_root);
834            let state = metadata_fingerprint(metadata)?;
835            let file_info = files_by_relative_path.get(&relative_path)?.clone();
836            let content_sha256 = file_info.sha256.unwrap_or_else(|| {
837                fs::read(path)
838                    .map(|bytes| calculate_sha256(&bytes))
839                    .unwrap_or_else(|_| {
840                        Sha256Digest::from_hex(
841                            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
842                        )
843                        .unwrap()
844                    })
845            });
846            Some((
847                relative_path,
848                IncrementalManifestEntry {
849                    state,
850                    content_sha256,
851                    file_info,
852                },
853            ))
854        })
855        .collect::<BTreeMap<_, _>>();
856
857    IncrementalManifest::new(options_fingerprint.to_string(), entries)
858}
859
860fn incremental_manifest_key(scan_root: &Path, options_fingerprint: &str) -> String {
861    let canonical_root = fs::canonicalize(scan_root).unwrap_or_else(|_| scan_root.to_path_buf());
862    calculate_sha256(
863        format!(
864            "{}\n{options_fingerprint}",
865            canonical_root.to_string_lossy()
866        )
867        .as_bytes(),
868    )
869    .as_hex()
870}
871
/// Returns `path` relative to `scan_root` as a string with `/` separators.
///
/// When `path` does not start with `scan_root` the whole path is used
/// unchanged. Every backslash in the resulting text is replaced by a forward
/// slash.
fn normalize_relative_scan_path(path: &Path, scan_root: &Path) -> String {
    let relative = match path.strip_prefix(scan_root) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };
    let text = relative.to_string_lossy();
    text.replace('\\', "/")
}
878
879fn compile_regex_patterns(option_name: &str, patterns: &[String]) -> Result<Vec<Regex>> {
880    patterns
881        .iter()
882        .map(|pattern| {
883            Regex::new(pattern).map_err(|err| {
884                anyhow!("Invalid regex for {option_name} pattern \"{pattern}\": {err}")
885            })
886        })
887        .collect()
888}
889
890fn effective_timeout_seconds(process_mode: ProcessMode, timeout_seconds: f64) -> f64 {
891    match process_mode {
892        ProcessMode::SequentialWithoutTimeouts => 0.0,
893        ProcessMode::Parallel(_) | ProcessMode::SequentialWithTimeouts => timeout_seconds,
894    }
895}
896
897fn process_mode_message(process_mode: ProcessMode) -> Option<&'static str> {
898    match process_mode {
899        ProcessMode::SequentialWithTimeouts => Some("Disabling multi-processing for debugging."),
900        ProcessMode::SequentialWithoutTimeouts => {
901            Some("Disabling multi-processing and multi-threading for debugging.")
902        }
903        ProcessMode::Parallel(_) => None,
904    }
905}
906
907fn progress_mode_from_cli(cli: &Cli) -> ProgressMode {
908    if cli.quiet {
909        ProgressMode::Quiet
910    } else if cli.verbose {
911        ProgressMode::Verbose
912    } else {
913        ProgressMode::Default
914    }
915}
916
917fn configured_scan_names(cli: &Cli) -> String {
918    let mut names = Vec::new();
919    if cli.license {
920        names.push("licenses");
921    }
922    if cli.info {
923        names.push("info");
924    }
925    if cli.package {
926        names.push("packages");
927    }
928    if (cli.system_package || cli.package_in_compiled || cli.package_only)
929        && !names.contains(&"packages")
930    {
931        names.push("packages");
932    }
933    if cli.copyright {
934        names.push("copyrights");
935    }
936    if cli.email {
937        names.push("emails");
938    }
939    if cli.url {
940        names.push("urls");
941    }
942    names.join(", ")
943}
944
945fn should_include_info_surface(files: &[crate::models::FileInfo], cli: &Cli) -> bool {
946    cli.info
947        || files.iter().any(|file| {
948            file.date.is_some()
949                || file.sha1.is_some()
950                || file.md5.is_some()
951                || file.sha256.is_some()
952                || file.sha1_git.is_some()
953                || file.mime_type.is_some()
954                || file.file_type_label.is_some()
955                || file.programming_language.is_some()
956                || file.is_binary.is_some()
957                || file.is_text.is_some()
958                || file.is_archive.is_some()
959                || file.is_media.is_some()
960                || file.is_source.is_some()
961                || file.is_script.is_some()
962                || file.files_count.is_some()
963                || file.dirs_count.is_some()
964                || file.size_count.is_some()
965        })
966}
967
968fn record_detail_timing<T, F>(progress: &Arc<ScanProgress>, name: impl Into<String>, f: F) -> T
969where
970    F: FnOnce() -> T,
971{
972    let started = Instant::now();
973    let result = f();
974    progress.record_detail_timing(name.into(), started.elapsed().as_secs_f64());
975    result
976}
977
978fn run_with_thread_pool<T, F>(threads: usize, f: F) -> Result<T>
979where
980    F: FnOnce() -> Result<T> + Send,
981    T: Send,
982{
983    let pool = rayon::ThreadPoolBuilder::new()
984        .num_threads(threads.max(1))
985        .build()?;
986    pool.install(f)
987}
988
989fn init_license_engine(cache_root: &CacheConfig, cli: &Cli) -> Result<Arc<LicenseDetectionEngine>> {
990    let cache_config = build_license_cache_config(cache_root, cli);
991
992    match &cli.license_dataset_path {
993        Some(p) => {
994            let path = PathBuf::from(p);
995            if !path.exists() {
996                return Err(anyhow!("License dataset path does not exist: {:?}", path));
997            }
998            let engine = LicenseDetectionEngine::from_directory_with_cache(&path, &cache_config)?;
999            Ok(Arc::new(engine))
1000        }
1001        None => {
1002            let engine = LicenseDetectionEngine::from_embedded_with_cache(&cache_config)?;
1003            Ok(Arc::new(engine))
1004        }
1005    }
1006}
1007
1008fn describe_license_engine_source(
1009    engine: &LicenseDetectionEngine,
1010    rules_path: Option<&str>,
1011) -> String {
1012    match rules_path {
1013        Some(path) => format!(
1014            "License detection engine initialized with {} rules from custom dataset {}",
1015            engine.index().rules_by_rid.len(),
1016            path
1017        ),
1018        None => format!(
1019            "License detection engine initialized with {} rules from embedded artifact",
1020            engine.index().rules_by_rid.len()
1021        ),
1022    }
1023}
1024
1025#[cfg(test)]
1026mod tests;