Skip to main content

provenant/cli/run/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use crate::assembly;
5use crate::cache::{
6    CACHE_DIR_ENV_VAR, CacheConfig, IncrementalManifest, IncrementalManifestEntry,
7    build_collection_exclude_patterns, incremental_manifest_path, load_incremental_manifest,
8    manifest_entry_matches_path, metadata_fingerprint, write_incremental_manifest,
9};
10use crate::cli::{Cli, ProcessMode};
11use crate::license_detection::LicenseDetectionEngine;
12use crate::license_detection::dataset::export_embedded_license_dataset;
13use crate::license_detection::license_cache::LicenseCacheConfig;
14use crate::models::{FileInfo, FileType, Sha256Digest};
15use crate::output::{OutputWriteConfig, write_output_file};
16use crate::post_processing::{
17    CreateOutputContext, CreateOutputOptions, DEFAULT_LICENSEDB_URL_TEMPLATE,
18    apply_license_policy_from_file, apply_package_reference_following, build_facet_rules,
19    collect_top_level_license_detections, collect_top_level_license_references, create_output,
20};
21use crate::progress::{ProgressMode, ScanProgress, format_default_scan_error};
22use crate::scan_result_shaping::{
23    apply_cli_path_selection_filter, apply_ignore_resource_filter, apply_mark_source,
24    apply_only_findings_filter, apply_user_path_filters_to_collected, filter_redundant_clues,
25    filter_redundant_clues_with_rules, load_and_merge_json_inputs, normalize_paths,
26    normalize_top_level_output_paths, populate_info_resource_counts,
27    prepare_filter_clue_rule_lookup, resolve_native_scan_inputs, trim_preloaded_assembly_to_files,
28};
29use crate::scanner::{
30    LicenseScanOptions, TextDetectionOptions, collect_paths, process_collected_with_memory_limit,
31    process_collected_with_memory_limit_sequential, scan_options_fingerprint,
32};
33use crate::time::format_scancode_timestamp;
34use crate::utils::hash::calculate_sha256;
35use anyhow::{Result, anyhow};
36use chrono::Utc;
37use clap::Parser;
38use regex::Regex;
39use std::collections::{BTreeMap, HashMap};
40use std::env;
41use std::fs;
42use std::path::{Path, PathBuf};
43use std::sync::Arc;
44use std::time::Instant;
45
46pub fn run() -> Result<()> {
47    #[cfg(feature = "golden-tests")]
48    touch_license_golden_symbols();
49
50    let cli = Cli::parse();
51
52    validate_scan_option_compatibility(&cli)?;
53
54    if cli.show_attribution {
55        print!("{}", include_str!("../../../NOTICE"));
56        return Ok(());
57    }
58
59    if let Some(export_dir) = cli.export_license_dataset.as_deref() {
60        export_embedded_license_dataset(Path::new(export_dir))?;
61        return Ok(());
62    }
63
64    let start_time = Utc::now();
65    let progress = Arc::new(ScanProgress::new(progress_mode_from_cli(&cli)));
66    progress.set_processes(cli.processes);
67    progress.set_scan_names(configured_scan_names(&cli));
68    progress.init_logging_bridge();
69    let mut shared_license_cache_config: Option<LicenseCacheConfig> = None;
70
71    progress.start_setup();
72    let facet_rules = build_facet_rules(&cli.facet)?;
73
74    let ignore_author_patterns = compile_regex_patterns("--ignore-author", &cli.ignore_author)?;
75    let ignore_copyright_holder_patterns =
76        compile_regex_patterns("--ignore-copyright-holder", &cli.ignore_copyright_holder)?;
77    progress.finish_setup();
78
79    progress.start_discovery();
80
81    let mut shared_cache_config = if cli.from_json {
82        let cache_config = prepare_cache_config(None, &cli)?;
83        shared_license_cache_config = Some(build_license_cache_config(&cache_config, &cli));
84        Some(cache_config)
85    } else {
86        None
87    };
88
89    let (
90        mut scan_result,
91        total_dirs,
92        mut preloaded_assembly,
93        preloaded_license_detections,
94        preloaded_license_references,
95        preloaded_license_rule_references,
96        preloaded_extra_errors,
97        imported_spdx_license_list_version,
98        imported_license_index_provenance,
99        mut active_license_engine,
100    ) = if cli.from_json {
101        let loaded = load_and_merge_json_inputs(&cli.dir_path, cli.strip_root, cli.full_root)?;
102        let directories_count = loaded.directory_count();
103        let files_count = loaded.file_count();
104        let size_count = loaded.file_size_count();
105        progress.finish_discovery(
106            files_count,
107            directories_count,
108            size_count,
109            loaded.excluded_count,
110        );
111        let (
112            process_result,
113            assembly_result,
114            license_detections,
115            license_references,
116            license_rule_references,
117            extra_errors,
118            imported_spdx_license_list_version,
119            imported_license_index_provenance,
120        ) = loaded.into_parts()?;
121        (
122            process_result,
123            directories_count,
124            assembly_result,
125            license_detections,
126            license_references,
127            license_rule_references,
128            extra_errors,
129            imported_spdx_license_list_version,
130            imported_license_index_provenance,
131            None,
132        )
133    } else {
134        let (scan_path, native_input_includes) = resolve_native_scan_inputs(&cli.dir_path)?;
135        let mut native_include_patterns = cli.include.clone();
136        native_include_patterns.extend(native_input_includes);
137
138        let cache_config = prepare_cache_config(Some(Path::new(&scan_path)), &cli)?;
139        shared_license_cache_config = Some(build_license_cache_config(&cache_config, &cli));
140        shared_cache_config = Some(cache_config.clone());
141        let collection_exclude_patterns =
142            build_collection_exclude_patterns(Path::new(&scan_path), cache_config.root_dir());
143
144        let mut collected = collect_paths(&scan_path, cli.max_depth, &collection_exclude_patterns);
145        let user_excluded_count = apply_user_path_filters_to_collected(
146            &mut collected,
147            Path::new(&scan_path),
148            &native_include_patterns,
149            &cli.exclude,
150        );
151        let total_files = collected.file_count();
152        let total_dirs = collected.directory_count();
153        let total_size = collected.total_file_bytes;
154        let excluded_count = collected.excluded_count + user_excluded_count;
155        let all_collected_files = collected.files.clone();
156        let ordered_file_paths: Vec<PathBuf> = collected
157            .files
158            .iter()
159            .map(|(path, _)| path.clone())
160            .collect();
161        let runtime_errors = collected
162            .collection_errors
163            .iter()
164            .map(|(path, err)| format_default_scan_error(path, err))
165            .collect();
166        for (path, err) in &collected.collection_errors {
167            progress.record_runtime_error(path, err);
168        }
169        progress.finish_discovery(total_files, total_dirs, total_size, excluded_count);
170        if !cli.quiet {
171            progress.output_written(&format!(
172                "Found {} files in {} directories ({} items excluded)",
173                total_files, total_dirs, excluded_count
174            ));
175        }
176
177        let license_engine = if cli.license {
178            progress.start_setup();
179            progress.start_license_detection_engine_creation();
180            let engine = init_license_engine(
181                shared_cache_config
182                    .as_ref()
183                    .expect("cache config should be prepared before license engine init"),
184                &cli,
185            )?;
186            progress.finish_license_detection_engine_creation("setup_scan:licenses");
187            progress.finish_setup();
188            progress.output_written(&describe_license_engine_source(
189                &engine,
190                cli.license_dataset_path.as_deref(),
191            ));
192            Some(engine)
193        } else {
194            None
195        };
196
197        let enable_application_packages = cli.package || cli.package_only;
198        let enable_system_packages = cli.system_package || cli.package_only;
199        let enable_packages =
200            enable_application_packages || enable_system_packages || cli.package_in_compiled;
201        let (detect_copyrights, detect_emails, detect_urls, detect_generated) = if cli.package_only
202        {
203            (false, cli.email, cli.url, cli.generated)
204        } else {
205            (cli.copyright, cli.email, cli.url, cli.generated)
206        };
207        let process_mode = cli.processes;
208
209        let text_options = TextDetectionOptions {
210            collect_info: cli.info,
211            detect_packages: enable_packages,
212            detect_application_packages: enable_application_packages,
213            detect_system_packages: enable_system_packages,
214            detect_packages_in_compiled: cli.package_in_compiled,
215            detect_copyrights,
216            detect_generated,
217            detect_emails,
218            detect_urls,
219            max_emails: cli.max_email,
220            max_urls: cli.max_url,
221            timeout_seconds: effective_timeout_seconds(process_mode, cli.timeout),
222        };
223
224        let license_options = LicenseScanOptions {
225            include_text: cli.license_text,
226            include_text_diagnostics: cli.license_text_diagnostics,
227            include_diagnostics: cli.license_diagnostics,
228            unknown_licenses: cli.unknown_licenses,
229            min_score: cli.license_score,
230        };
231        let options_fingerprint =
232            scan_options_fingerprint(&text_options, license_options, license_engine.as_deref());
233
234        if cli.incremental {
235            let manifest_path = incremental_manifest_path(
236                cache_config.root_dir(),
237                &incremental_manifest_key(Path::new(&scan_path), &options_fingerprint),
238            );
239            let previous_manifest =
240                load_incremental_manifest(&manifest_path, &options_fingerprint)?;
241            let reused_files = partition_incremental_files(
242                &mut collected.files,
243                Path::new(&scan_path),
244                previous_manifest.as_ref(),
245            );
246            progress.record_incremental_reused(reused_files.len());
247        }
248
249        if let Some(message) = process_mode_message(process_mode) {
250            progress.output_written(message);
251        }
252        progress.start_scan(collected.file_count());
253        let mut result = match process_mode {
254            ProcessMode::Parallel(thread_count) => run_with_thread_pool(thread_count, || {
255                Ok(process_collected_with_memory_limit(
256                    &collected,
257                    Arc::clone(&progress),
258                    license_engine.clone(),
259                    license_options,
260                    &text_options,
261                    cli.max_in_memory,
262                ))
263            })?,
264            ProcessMode::SequentialWithTimeouts | ProcessMode::SequentialWithoutTimeouts => {
265                process_collected_with_memory_limit_sequential(
266                    &collected,
267                    Arc::clone(&progress),
268                    license_engine.clone(),
269                    license_options,
270                    &text_options,
271                    cli.max_in_memory,
272                )
273            }
274        };
275
276        if cli.incremental {
277            let manifest_path = incremental_manifest_path(
278                cache_config.root_dir(),
279                &incremental_manifest_key(Path::new(&scan_path), &options_fingerprint),
280            );
281            let reused_files = partition_incremental_files(
282                &mut all_collected_files.clone(),
283                Path::new(&scan_path),
284                load_incremental_manifest(&manifest_path, &options_fingerprint)?.as_ref(),
285            );
286            result.files =
287                merge_incremental_file_results(result.files, reused_files, &ordered_file_paths);
288
289            let manifest = build_incremental_manifest(
290                Path::new(&scan_path),
291                &all_collected_files,
292                &result.files,
293                &options_fingerprint,
294            );
295            write_incremental_manifest(cache_config.root_dir(), &manifest_path, &manifest)?;
296        }
297
298        result.excluded_count = excluded_count;
299        progress.finish_scan();
300
301        (
302            result,
303            total_dirs,
304            assembly::AssemblyResult {
305                packages: Vec::new(),
306                dependencies: Vec::new(),
307            },
308            Vec::new(),
309            Vec::new(),
310            Vec::new(),
311            runtime_errors,
312            None,
313            None,
314            license_engine,
315        )
316    };
317
318    progress.start_post_scan();
319
320    if cli.filter_clues {
321        progress.post_scan_step("Filtering redundant clues...");
322        let clue_rule_lookup = record_detail_timing(&progress, "post-scan:filter-clues", || {
323            prepare_filter_clue_rule_lookup(
324                &scan_result.files,
325                active_license_engine.as_deref(),
326                cli.license_dataset_path.as_deref(),
327                shared_license_cache_config.as_ref(),
328            )
329        })?;
330        if let Some(clue_rule_lookup) = clue_rule_lookup.as_ref() {
331            filter_redundant_clues_with_rules(&mut scan_result.files, Some(clue_rule_lookup));
332        } else {
333            filter_redundant_clues(&mut scan_result.files);
334        }
335    }
336
337    if !ignore_author_patterns.is_empty() || !ignore_copyright_holder_patterns.is_empty() {
338        progress.post_scan_step("Applying ignore-resource filters...");
339        record_detail_timing(&progress, "post-scan:ignore-resource", || {
340            apply_ignore_resource_filter(
341                &mut scan_result.files,
342                &ignore_copyright_holder_patterns,
343                &ignore_author_patterns,
344            );
345        });
346    }
347
348    if cli.from_json && (!cli.include.is_empty() || !cli.exclude.is_empty()) {
349        progress.post_scan_step("Applying path selection filters...");
350        record_detail_timing(&progress, "output-filter:path-selection", || {
351            apply_cli_path_selection_filter(&mut scan_result.files, &cli.include, &cli.exclude);
352        });
353    }
354
355    if cli.only_findings {
356        progress.post_scan_step("Filtering to files with findings...");
357        record_detail_timing(&progress, "output-filter:only-findings", || {
358            apply_only_findings_for_mode(&mut scan_result.files, cli.from_json);
359        });
360    }
361
362    if cli.info && cli.mark_source {
363        progress.post_scan_step("Marking source files...");
364        record_detail_timing(&progress, "post-scan:mark-source", || {
365            apply_mark_source(&mut scan_result.files);
366        });
367    }
368
369    if should_include_info_surface(&scan_result.files, &cli) {
370        progress.post_scan_step("Populating info resource counts...");
371        record_detail_timing(&progress, "post-scan:info-resource-counts", || {
372            populate_info_resource_counts(&mut scan_result.files);
373        });
374    }
375
376    progress.post_scan_step("Backfilling license provenance...");
377    record_detail_timing(&progress, "post-scan:license-provenance", || {
378        for file in &mut scan_result.files {
379            file.backfill_license_provenance();
380        }
381    });
382
383    if cli.from_json {
384        for err in &preloaded_extra_errors {
385            progress.record_additional_error(err);
386        }
387    }
388
389    let mut extra_errors = preloaded_extra_errors;
390    if let Some(policy_path) = cli.license_policy.as_deref() {
391        progress.post_scan_step("Applying license policy...");
392        let license_policy_errors =
393            record_detail_timing(&progress, "post-scan:license-policy", || {
394                apply_license_policy_from_file(&mut scan_result.files, Path::new(policy_path))
395            })?;
396        for err in &license_policy_errors {
397            progress.record_additional_error(err);
398        }
399        extra_errors.extend(license_policy_errors);
400    }
401
402    if cli.from_json {
403        progress.post_scan_step("Trimming preloaded assembly to filtered files...");
404        record_detail_timing(&progress, "post-scan:trim-preloaded-assembly", || {
405            trim_preloaded_assembly_to_files(
406                &scan_result.files,
407                &mut preloaded_assembly.packages,
408                &mut preloaded_assembly.dependencies,
409            );
410        });
411    }
412
413    progress.finish_post_scan();
414
415    let manifests_seen = scan_result
416        .files
417        .iter()
418        .map(|file| file.package_data.len())
419        .sum();
420    let skip_assembly = cli.no_assemble || cli.package_only;
421
422    let mut assembly_result = if skip_assembly {
423        assembly::AssemblyResult {
424            packages: Vec::new(),
425            dependencies: Vec::new(),
426        }
427    } else {
428        progress.start_assembly();
429
430        let mut result = if cli.from_json
431            && (!preloaded_assembly.packages.is_empty()
432                || !preloaded_assembly.dependencies.is_empty())
433        {
434            progress.assembly_step("Using preloaded assembly...");
435            preloaded_assembly
436        } else {
437            assembly::assemble(&mut scan_result.files)
438        };
439
440        progress.assembly_step("Backfilling package license provenance...");
441        record_detail_timing(&progress, "assembly:package-license-provenance", || {
442            for package in &mut result.packages {
443                package.backfill_license_provenance();
444            }
445        });
446
447        progress.assembly_step("Applying package reference following...");
448        record_detail_timing(&progress, "assembly:package-reference-following", || {
449            apply_package_reference_following(&mut scan_result.files, &mut result.packages);
450        });
451
452        progress.finish_assembly(result.packages.len(), manifests_seen);
453        result
454    };
455
456    progress.start_finalize();
457
458    if !cli.from_json && (cli.strip_root || cli.full_root) {
459        let root_path = cli
460            .dir_path
461            .first()
462            .ok_or_else(|| anyhow!("No input path available for path normalization"))?;
463        progress.finalize_step("Normalizing paths...");
464        record_detail_timing(&progress, "finalize:path-normalization", || {
465            normalize_paths(
466                &mut scan_result.files,
467                root_path,
468                cli.strip_root,
469                cli.full_root,
470            );
471            normalize_top_level_output_paths(
472                &mut assembly_result.packages,
473                &mut assembly_result.dependencies,
474                root_path,
475                cli.strip_root,
476            );
477        });
478    }
479
480    progress.finalize_step("Collecting license detections...");
481    let license_detections = record_detail_timing(&progress, "finalize:license-detections", || {
482        let preserve_preloaded_top_level_detections = cli.from_json
483            && (cli.only_findings || !cli.include.is_empty() || !cli.exclude.is_empty());
484        collect_top_level_license_detections_for_mode(
485            &scan_result.files,
486            preloaded_license_detections,
487            preserve_preloaded_top_level_detections,
488            cli.from_json && cli.dir_path.len() > 1,
489        )
490    });
491
492    let should_recompute_license_references = cli.from_json
493        && (!preloaded_license_references.is_empty()
494            || !preloaded_license_rule_references.is_empty()
495            || cli.license_references
496            || (cli.license_url_template != DEFAULT_LICENSEDB_URL_TEMPLATE
497                && !preloaded_license_references.is_empty()));
498
499    if should_recompute_license_references && active_license_engine.is_none() {
500        progress.start_license_detection_engine_creation();
501        active_license_engine = Some(init_license_engine(
502            shared_cache_config
503                .as_ref()
504                .expect("cache config should be prepared before license engine init"),
505            &cli,
506        )?);
507        progress.finish_license_detection_engine_creation("finalize:license-engine-creation");
508    }
509
510    progress.finalize_step("Collecting license references...");
511    let (license_references, license_rule_references) =
512        record_detail_timing(&progress, "finalize:license-references", || {
513            if cli.from_json && !should_recompute_license_references {
514                (
515                    preloaded_license_references,
516                    preloaded_license_rule_references,
517                )
518            } else if cli.license_references || should_recompute_license_references {
519                if let Some(engine) = active_license_engine.as_deref() {
520                    collect_top_level_license_references(
521                        &scan_result.files,
522                        &assembly_result.packages,
523                        engine.index(),
524                        &cli.license_url_template,
525                    )
526                } else {
527                    (Vec::new(), Vec::new())
528                }
529            } else {
530                (Vec::new(), Vec::new())
531            }
532        });
533
534    let end_time = Utc::now();
535    let spdx_license_list_version = active_license_engine
536        .as_ref()
537        .and_then(|engine| engine.spdx_license_list_version().map(ToOwned::to_owned))
538        .or(imported_spdx_license_list_version)
539        .unwrap_or(LicenseDetectionEngine::embedded_spdx_license_list_version()?);
540    let license_index_provenance = active_license_engine
541        .as_ref()
542        .and_then(|engine| engine.license_index_provenance().cloned())
543        .or(imported_license_index_provenance);
544
545    progress.finalize_step("Preparing output...");
546    let output = record_detail_timing(&progress, "finalize:output-prepare", || {
547        create_output(
548            start_time,
549            end_time,
550            scan_result,
551            CreateOutputContext {
552                total_dirs,
553                assembly_result,
554                license_detections,
555                license_references,
556                license_rule_references,
557                spdx_license_list_version,
558                license_index_provenance,
559                extra_errors,
560                extra_warnings: Vec::new(),
561                header_options: cli.output_header_options(),
562                options: CreateOutputOptions {
563                    facet_rules: &facet_rules,
564                    include_classify: cli.classify,
565                    include_summary: cli.summary,
566                    include_license_clarity_score: cli.license_clarity_score,
567                    include_tallies: cli.tallies,
568                    include_tallies_of_key_files: cli.tallies_key_files,
569                    include_tallies_with_details: cli.tallies_with_details,
570                    include_tallies_by_facet: cli.tallies_by_facet,
571                    include_generated: cli.generated,
572                    verbose: cli.verbose,
573                },
574            },
575        )
576    });
577    progress.finish_finalize();
578
579    let output_schema_output = crate::output_schema::Output::from(&output);
580    progress.start_output();
581    for target in cli.output_targets() {
582        let output_config = OutputWriteConfig {
583            format: target.format,
584            custom_template: target.custom_template.clone(),
585            scanned_path: if cli.dir_path.len() == 1 {
586                cli.dir_path.first().cloned()
587            } else {
588                None
589            },
590        };
591
592        let timing_name = format!("output:{:?}", target.format).to_lowercase();
593        record_detail_timing(&progress, timing_name, || {
594            write_output_file(&target.file, &output_schema_output, &output_config)
595        })?;
596        progress.output_written(&format!(
597            "{:?} output written to {}",
598            target.format, target.file
599        ));
600    }
601    progress.record_final_counts(&output.files);
602    progress.record_final_header_counts(&output.headers);
603    progress.finish_output();
604
605    let summary_end = Utc::now();
606    progress.display_summary(
607        &format_scancode_timestamp(&start_time),
608        &format_scancode_timestamp(&summary_end),
609    );
610
611    Ok(())
612}
613
614fn apply_only_findings_for_mode(files: &mut Vec<FileInfo>, from_json: bool) {
615    if from_json {
616        files.clear();
617    } else {
618        apply_only_findings_filter(files);
619    }
620}
621
622fn collect_top_level_license_detections_for_mode(
623    files: &[FileInfo],
624    preloaded: Vec<crate::models::TopLevelLicenseDetection>,
625    preserve_preloaded: bool,
626    clear_for_multi_input_replay: bool,
627) -> Vec<crate::models::TopLevelLicenseDetection> {
628    if clear_for_multi_input_replay {
629        Vec::new()
630    } else if preserve_preloaded {
631        preloaded
632    } else {
633        collect_top_level_license_detections(files)
634    }
635}
636
/// References the golden-test helper functions without calling them.
///
/// Only compiled when the `golden-tests` feature is enabled.
// NOTE(review): presumably this exists so the helpers count as used in this
// build configuration — confirm against the feature's setup.
#[cfg(feature = "golden-tests")]
fn touch_license_golden_symbols() {
    let _ = (
        crate::license_detection::golden_utils::read_golden_input_content,
        crate::license_detection::golden_utils::detect_matches_for_golden,
        crate::license_detection::golden_utils::detect_license_expressions_for_golden,
        crate::license_detection::LicenseDetectionEngine::detect_matches_with_kind,
    );
}
644
645fn validate_scan_option_compatibility(cli: &Cli) -> Result<()> {
646    if cli.show_attribution {
647        return Ok(());
648    }
649
650    if cli.export_license_dataset.is_some() {
651        if !cli.dir_path.is_empty() {
652            return Err(anyhow!(
653                "--export-license-dataset does not accept scan input paths"
654            ));
655        }
656
657        if cli.from_json
658            || cli.license
659            || cli.package
660            || cli.system_package
661            || cli.package_in_compiled
662            || cli.package_only
663            || cli.copyright
664            || cli.email
665            || cli.url
666            || cli.generated
667            || cli.info
668            || cli.incremental
669            || cli.reindex
670            || cli.no_license_index_cache
671            || cli.license_dataset_path.is_some()
672        {
673            return Err(anyhow!(
674                "--export-license-dataset is a standalone mode and cannot be combined with scan or license-index flags"
675            ));
676        }
677
678        return Ok(());
679    }
680
681    if cli.from_json
682        && (cli.package
683            || cli.system_package
684            || cli.package_in_compiled
685            || cli.package_only
686            || cli.copyright
687            || cli.email
688            || cli.url
689            || cli.generated)
690    {
691        return Err(anyhow!(
692            "When using --from-json, file scan options like --package/--copyright/--email/--url/--generated are not allowed"
693        ));
694    }
695
696    if cli.from_json && cli.incremental {
697        return Err(anyhow!(
698            "--incremental is only supported for directory scan mode, not --from-json"
699        ));
700    }
701
702    if !cli.from_json && cli.dir_path.is_empty() {
703        return Err(anyhow!("Directory path is required for scan operations"));
704    }
705
706    if cli.tallies_by_facet && cli.facet.is_empty() {
707        return Err(anyhow!(
708            "--tallies-by-facet requires at least one --facet <facet>=<pattern> definition"
709        ));
710    }
711
712    if cli.mark_source && !cli.info {
713        return Err(anyhow!("--mark-source requires --info"));
714    }
715
716    Ok(())
717}
718
719fn prepare_cache_config(scan_root: Option<&Path>, cli: &Cli) -> Result<CacheConfig> {
720    let env_cache_dir = env::var_os(CACHE_DIR_ENV_VAR).map(PathBuf::from);
721    let config = CacheConfig::from_overrides(
722        scan_root,
723        cli.cache_dir.as_deref().map(Path::new),
724        env_cache_dir.as_deref(),
725        cli.incremental,
726    );
727
728    if cli.cache_clear {
729        crate::cache::locking::with_exclusive_cache_lock(config.root_dir(), || {
730            config.clear_contents()
731        })?;
732    }
733
734    if config.incremental_enabled() {
735        config.ensure_dirs()?;
736    }
737
738    Ok(config)
739}
740
741fn build_license_cache_config(cache_root: &CacheConfig, cli: &Cli) -> LicenseCacheConfig {
742    LicenseCacheConfig::new(
743        cache_root.root_dir().to_path_buf(),
744        cli.reindex,
745        !cli.no_license_index_cache,
746    )
747}
748
749fn partition_incremental_files(
750    collected_files: &mut Vec<(PathBuf, fs::Metadata)>,
751    scan_root: &Path,
752    manifest: Option<&IncrementalManifest>,
753) -> Vec<FileInfo> {
754    let Some(manifest) = manifest else {
755        return Vec::new();
756    };
757
758    let mut files_to_scan = Vec::new();
759    let mut reused_files = Vec::new();
760
761    for (path, metadata) in collected_files.drain(..) {
762        let relative_path = normalize_relative_scan_path(&path, scan_root);
763        let Some(entry) = manifest.entry(&relative_path) else {
764            files_to_scan.push((path, metadata));
765            continue;
766        };
767
768        match manifest_entry_matches_path(entry, &path, &metadata) {
769            Ok(true) => reused_files.push(entry.file_info.clone()),
770            Ok(false) | Err(_) => files_to_scan.push((path, metadata)),
771        }
772    }
773
774    *collected_files = files_to_scan;
775    reused_files
776}
777
778fn merge_incremental_file_results(
779    processed_files: Vec<FileInfo>,
780    reused_files: Vec<FileInfo>,
781    ordered_file_paths: &[PathBuf],
782) -> Vec<FileInfo> {
783    let mut processed_file_entries = HashMap::new();
784    let mut directory_entries = Vec::new();
785    for file in processed_files {
786        if file.file_type == FileType::File {
787            processed_file_entries.insert(file.path.clone(), file);
788        } else {
789            directory_entries.push(file);
790        }
791    }
792
793    let mut reused_file_entries: HashMap<_, _> = reused_files
794        .into_iter()
795        .map(|file| (file.path.clone(), file))
796        .collect();
797
798    let mut merged_files = Vec::new();
799    for path in ordered_file_paths {
800        let path_string = path.to_string_lossy().to_string();
801        if let Some(file) = processed_file_entries.remove(&path_string) {
802            merged_files.push(file);
803            continue;
804        }
805
806        if let Some(file) = reused_file_entries.remove(&path_string) {
807            merged_files.push(file);
808        }
809    }
810
811    merged_files.extend(processed_file_entries.into_values());
812    merged_files.extend(reused_file_entries.into_values());
813    merged_files.extend(directory_entries);
814    merged_files
815}
816
817fn build_incremental_manifest(
818    scan_root: &Path,
819    collected_files: &[(PathBuf, fs::Metadata)],
820    files: &[FileInfo],
821    options_fingerprint: &str,
822) -> IncrementalManifest {
823    let files_by_relative_path: HashMap<_, _> = files
824        .iter()
825        .filter(|file| file.file_type == FileType::File)
826        .map(|file| {
827            (
828                normalize_relative_scan_path(Path::new(&file.path), scan_root),
829                file.clone(),
830            )
831        })
832        .collect();
833
834    let entries = collected_files
835        .iter()
836        .filter_map(|(path, metadata)| {
837            let relative_path = normalize_relative_scan_path(path, scan_root);
838            let state = metadata_fingerprint(metadata)?;
839            let file_info = files_by_relative_path.get(&relative_path)?.clone();
840            let content_sha256 = file_info.sha256.unwrap_or_else(|| {
841                fs::read(path)
842                    .map(|bytes| calculate_sha256(&bytes))
843                    .unwrap_or_else(|_| {
844                        Sha256Digest::from_hex(
845                            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
846                        )
847                        .unwrap()
848                    })
849            });
850            Some((
851                relative_path,
852                IncrementalManifestEntry {
853                    state,
854                    content_sha256,
855                    file_info,
856                },
857            ))
858        })
859        .collect::<BTreeMap<_, _>>();
860
861    IncrementalManifest::new(options_fingerprint.to_string(), entries)
862}
863
864fn incremental_manifest_key(scan_root: &Path, options_fingerprint: &str) -> String {
865    let canonical_root = fs::canonicalize(scan_root).unwrap_or_else(|_| scan_root.to_path_buf());
866    calculate_sha256(
867        format!(
868            "{}\n{options_fingerprint}",
869            canonical_root.to_string_lossy()
870        )
871        .as_bytes(),
872    )
873    .as_hex()
874}
875
/// Returns `path` relative to `scan_root` as a string with forward slashes.
///
/// When `path` does not start with `scan_root`, the whole path is used instead.
/// Non-UTF-8 sequences are converted lossily and every backslash is replaced by `/`.
fn normalize_relative_scan_path(path: &Path, scan_root: &Path) -> String {
    let relative = match path.strip_prefix(scan_root) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };
    let lossy = relative.to_string_lossy();
    lossy.replace('\\', "/")
}
882
883fn compile_regex_patterns(option_name: &str, patterns: &[String]) -> Result<Vec<Regex>> {
884    patterns
885        .iter()
886        .map(|pattern| {
887            Regex::new(pattern).map_err(|err| {
888                anyhow!("Invalid regex for {option_name} pattern \"{pattern}\": {err}")
889            })
890        })
891        .collect()
892}
893
894fn effective_timeout_seconds(process_mode: ProcessMode, timeout_seconds: f64) -> f64 {
895    match process_mode {
896        ProcessMode::SequentialWithoutTimeouts => 0.0,
897        ProcessMode::Parallel(_) | ProcessMode::SequentialWithTimeouts => timeout_seconds,
898    }
899}
900
901fn process_mode_message(process_mode: ProcessMode) -> Option<&'static str> {
902    match process_mode {
903        ProcessMode::SequentialWithTimeouts => Some("Disabling multi-processing for debugging."),
904        ProcessMode::SequentialWithoutTimeouts => {
905            Some("Disabling multi-processing and multi-threading for debugging.")
906        }
907        ProcessMode::Parallel(_) => None,
908    }
909}
910
911fn progress_mode_from_cli(cli: &Cli) -> ProgressMode {
912    if cli.quiet {
913        ProgressMode::Quiet
914    } else if cli.verbose {
915        ProgressMode::Verbose
916    } else {
917        ProgressMode::Default
918    }
919}
920
921fn configured_scan_names(cli: &Cli) -> String {
922    let mut names = Vec::new();
923    if cli.license {
924        names.push("licenses");
925    }
926    if cli.info {
927        names.push("info");
928    }
929    if cli.package {
930        names.push("packages");
931    }
932    if (cli.system_package || cli.package_in_compiled || cli.package_only)
933        && !names.contains(&"packages")
934    {
935        names.push("packages");
936    }
937    if cli.copyright {
938        names.push("copyrights");
939    }
940    if cli.email {
941        names.push("emails");
942    }
943    if cli.url {
944        names.push("urls");
945    }
946    names.join(", ")
947}
948
949fn should_include_info_surface(files: &[crate::models::FileInfo], cli: &Cli) -> bool {
950    cli.info
951        || files.iter().any(|file| {
952            file.date.is_some()
953                || file.sha1.is_some()
954                || file.md5.is_some()
955                || file.sha256.is_some()
956                || file.sha1_git.is_some()
957                || file.mime_type.is_some()
958                || file.file_type_label.is_some()
959                || file.programming_language.is_some()
960                || file.is_binary.is_some()
961                || file.is_text.is_some()
962                || file.is_archive.is_some()
963                || file.is_media.is_some()
964                || file.is_source.is_some()
965                || file.is_script.is_some()
966                || file.files_count.is_some()
967                || file.dirs_count.is_some()
968                || file.size_count.is_some()
969        })
970}
971
972fn record_detail_timing<T, F>(progress: &Arc<ScanProgress>, name: impl Into<String>, f: F) -> T
973where
974    F: FnOnce() -> T,
975{
976    let started = Instant::now();
977    let result = f();
978    progress.record_detail_timing(name.into(), started.elapsed().as_secs_f64());
979    result
980}
981
982fn run_with_thread_pool<T, F>(threads: usize, f: F) -> Result<T>
983where
984    F: FnOnce() -> Result<T> + Send,
985    T: Send,
986{
987    let pool = rayon::ThreadPoolBuilder::new()
988        .num_threads(threads.max(1))
989        .build()?;
990    pool.install(f)
991}
992
993fn init_license_engine(cache_root: &CacheConfig, cli: &Cli) -> Result<Arc<LicenseDetectionEngine>> {
994    let cache_config = build_license_cache_config(cache_root, cli);
995
996    match &cli.license_dataset_path {
997        Some(p) => {
998            let path = PathBuf::from(p);
999            if !path.exists() {
1000                return Err(anyhow!("License dataset path does not exist: {:?}", path));
1001            }
1002            let engine = LicenseDetectionEngine::from_directory_with_cache(&path, &cache_config)?;
1003            Ok(Arc::new(engine))
1004        }
1005        None => {
1006            let engine = LicenseDetectionEngine::from_embedded_with_cache(&cache_config)?;
1007            Ok(Arc::new(engine))
1008        }
1009    }
1010}
1011
1012fn describe_license_engine_source(
1013    engine: &LicenseDetectionEngine,
1014    rules_path: Option<&str>,
1015) -> String {
1016    match rules_path {
1017        Some(path) => format!(
1018            "License detection engine initialized with {} rules from custom dataset {}",
1019            engine.index().rules_by_rid.len(),
1020            path
1021        ),
1022        None => format!(
1023            "License detection engine initialized with {} rules from embedded artifact",
1024            engine.index().rules_by_rid.len()
1025        ),
1026    }
1027}
1028
1029#[cfg(test)]
1030mod tests;