Skip to main content

provenant/cli/run/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use crate::assembly;
5use crate::cache::{
6    CACHE_DIR_ENV_VAR, CacheConfig, IncrementalManifest, IncrementalManifestEntry,
7    build_collection_exclude_patterns, incremental_manifest_path, load_incremental_manifest,
8    manifest_entry_matches_path, metadata_fingerprint, write_incremental_manifest,
9};
10use crate::cli::{Cli, ProcessMode};
11use crate::license_detection::LicenseDetectionEngine;
12use crate::license_detection::dataset::export_embedded_license_dataset;
13use crate::license_detection::license_cache::LicenseCacheConfig;
14use crate::models::{FileInfo, FileType, Sha256Digest};
15use crate::output::{OutputWriteConfig, write_output_file};
16use crate::post_processing::{
17    CreateOutputContext, CreateOutputOptions, DEFAULT_LICENSEDB_URL_TEMPLATE,
18    apply_license_policy_from_file, apply_package_reference_following, build_facet_rules,
19    collect_top_level_license_detections, collect_top_level_license_references, create_output,
20};
21use crate::progress::{ProgressMode, ScanProgress, format_default_scan_error};
22use crate::scan_result_shaping::{
23    SelectedPath, apply_cli_path_selection_filter, apply_ignore_resource_filter, apply_mark_source,
24    apply_only_findings_filter, apply_user_path_filters_to_collected, filter_redundant_clues,
25    filter_redundant_clues_with_rules, load_and_merge_json_inputs, normalize_paths,
26    normalize_top_level_output_paths, populate_info_resource_counts,
27    prepare_filter_clue_rule_lookup, resolve_native_scan_inputs, resolve_paths_file_entries,
28    trim_preloaded_assembly_to_files,
29};
30use crate::scanner::{
31    LicenseScanOptions, TextDetectionOptions, collect_paths, process_collected_with_memory_limit,
32    process_collected_with_memory_limit_sequential, scan_options_fingerprint,
33};
34use crate::time::format_scancode_timestamp;
35use crate::utils::hash::calculate_sha256;
36use anyhow::{Result, anyhow};
37use chrono::Utc;
38use clap::Parser;
39use regex::Regex;
40use std::collections::{BTreeMap, HashMap};
41use std::env;
42use std::fs;
43use std::io::Read;
44use std::path::{Path, PathBuf};
45use std::sync::Arc;
46use std::time::Instant;
47
48pub fn run() -> Result<()> {
49    #[cfg(feature = "golden-tests")]
50    touch_license_golden_symbols();
51
52    let cli = Cli::parse();
53
54    validate_scan_option_compatibility(&cli)?;
55
56    if cli.show_attribution {
57        print!("{}", include_str!("../../../NOTICE"));
58        return Ok(());
59    }
60
61    if let Some(export_dir) = cli.export_license_dataset.as_deref() {
62        export_embedded_license_dataset(Path::new(export_dir))?;
63        return Ok(());
64    }
65
66    let start_time = Utc::now();
67    let progress = Arc::new(ScanProgress::new(progress_mode_from_cli(&cli)));
68    progress.set_processes(cli.processes);
69    progress.set_scan_names(configured_scan_names(&cli));
70    progress.init_logging_bridge();
71    let mut shared_license_cache_config: Option<LicenseCacheConfig> = None;
72
73    progress.start_setup();
74    let facet_rules = build_facet_rules(&cli.facet)?;
75
76    let ignore_author_patterns = compile_regex_patterns("--ignore-author", &cli.ignore_author)?;
77    let ignore_copyright_holder_patterns =
78        compile_regex_patterns("--ignore-copyright-holder", &cli.ignore_copyright_holder)?;
79    progress.finish_setup();
80
81    progress.start_discovery();
82
83    let mut shared_cache_config = if cli.from_json {
84        let cache_config = prepare_cache_config(None, &cli)?;
85        shared_license_cache_config = Some(build_license_cache_config(&cache_config, &cli));
86        Some(cache_config)
87    } else {
88        None
89    };
90
91    let (
92        mut scan_result,
93        total_dirs,
94        mut preloaded_assembly,
95        preloaded_license_detections,
96        preloaded_license_references,
97        preloaded_license_rule_references,
98        preloaded_extra_errors,
99        extra_warnings,
100        imported_spdx_license_list_version,
101        imported_license_index_provenance,
102        mut active_license_engine,
103    ) = if cli.from_json {
104        let loaded = load_and_merge_json_inputs(&cli.dir_path, cli.strip_root, cli.full_root)?;
105        let directories_count = loaded.directory_count();
106        let files_count = loaded.file_count();
107        let size_count = loaded.file_size_count();
108        progress.finish_discovery(
109            files_count,
110            directories_count,
111            size_count,
112            loaded.excluded_count,
113        );
114        let (
115            process_result,
116            assembly_result,
117            license_detections,
118            license_references,
119            license_rule_references,
120            extra_errors,
121            imported_spdx_license_list_version,
122            imported_license_index_provenance,
123        ) = loaded.into_parts()?;
124        (
125            process_result,
126            directories_count,
127            assembly_result,
128            license_detections,
129            license_references,
130            license_rule_references,
131            extra_errors,
132            Vec::new(),
133            imported_spdx_license_list_version,
134            imported_license_index_provenance,
135            None,
136        )
137    } else {
138        let (scan_path, selected_paths, missing_paths_file_entries) =
139            resolve_native_scan_selection(&cli)?;
140        let paths_file_warnings = build_paths_file_warning_messages(&missing_paths_file_entries);
141        for warning in &paths_file_warnings {
142            progress.output_written(warning);
143        }
144
145        let cache_config = prepare_cache_config(Some(Path::new(&scan_path)), &cli)?;
146        shared_license_cache_config = Some(build_license_cache_config(&cache_config, &cli));
147        shared_cache_config = Some(cache_config.clone());
148        let collection_exclude_patterns =
149            build_collection_exclude_patterns(Path::new(&scan_path), cache_config.root_dir());
150
151        let mut collected = collect_paths(&scan_path, cli.max_depth, &collection_exclude_patterns);
152        let user_excluded_count = apply_user_path_filters_to_collected(
153            &mut collected,
154            Path::new(&scan_path),
155            &selected_paths,
156            &cli.include,
157            &cli.exclude,
158        );
159        let total_files = collected.file_count();
160        let total_dirs = collected.directory_count();
161        let total_size = collected.total_file_bytes;
162        let excluded_count = collected.excluded_count + user_excluded_count;
163        let all_collected_files = collected.files.clone();
164        let ordered_file_paths: Vec<PathBuf> = collected
165            .files
166            .iter()
167            .map(|(path, _)| path.clone())
168            .collect();
169        let runtime_errors = collected
170            .collection_errors
171            .iter()
172            .map(|(path, err)| format_default_scan_error(path, err))
173            .collect();
174        for (path, err) in &collected.collection_errors {
175            progress.record_runtime_error(path, err);
176        }
177        progress.finish_discovery(total_files, total_dirs, total_size, excluded_count);
178        if !cli.quiet {
179            progress.output_written(&format!(
180                "Found {} files in {} directories ({} items excluded)",
181                total_files, total_dirs, excluded_count
182            ));
183        }
184
185        let license_engine = if cli.license {
186            progress.start_setup();
187            progress.start_license_detection_engine_creation();
188            let engine = init_license_engine(
189                shared_cache_config
190                    .as_ref()
191                    .expect("cache config should be prepared before license engine init"),
192                &cli,
193            )?;
194            progress.finish_license_detection_engine_creation("setup_scan:licenses");
195            progress.finish_setup();
196            progress.output_written(&describe_license_engine_source(
197                &engine,
198                cli.license_dataset_path.as_deref(),
199            ));
200            Some(engine)
201        } else {
202            None
203        };
204
205        let enable_application_packages = cli.package || cli.package_only;
206        let enable_system_packages = cli.system_package || cli.package_only;
207        let enable_packages =
208            enable_application_packages || enable_system_packages || cli.package_in_compiled;
209        let (detect_copyrights, detect_emails, detect_urls, detect_generated) = if cli.package_only
210        {
211            (false, cli.email, cli.url, cli.generated)
212        } else {
213            (cli.copyright, cli.email, cli.url, cli.generated)
214        };
215        let process_mode = cli.processes;
216
217        let text_options = TextDetectionOptions {
218            collect_info: cli.info,
219            detect_packages: enable_packages,
220            detect_application_packages: enable_application_packages,
221            detect_system_packages: enable_system_packages,
222            detect_packages_in_compiled: cli.package_in_compiled,
223            detect_copyrights,
224            detect_generated,
225            detect_emails,
226            detect_urls,
227            max_emails: cli.max_email,
228            max_urls: cli.max_url,
229            timeout_seconds: effective_timeout_seconds(process_mode, cli.timeout),
230        };
231
232        let license_options = LicenseScanOptions {
233            include_text: cli.license_text,
234            include_text_diagnostics: cli.license_text_diagnostics,
235            include_diagnostics: cli.license_diagnostics,
236            unknown_licenses: cli.unknown_licenses,
237            min_score: cli.license_score,
238        };
239        let options_fingerprint =
240            scan_options_fingerprint(&text_options, license_options, license_engine.as_deref());
241
242        if cli.incremental {
243            let manifest_path = incremental_manifest_path(
244                cache_config.root_dir(),
245                &incremental_manifest_key(Path::new(&scan_path), &options_fingerprint),
246            );
247            let previous_manifest =
248                load_incremental_manifest(&manifest_path, &options_fingerprint)?;
249            let reused_files = partition_incremental_files(
250                &mut collected.files,
251                Path::new(&scan_path),
252                previous_manifest.as_ref(),
253            );
254            progress.record_incremental_reused(reused_files.len());
255        }
256
257        if let Some(message) = process_mode_message(process_mode) {
258            progress.output_written(message);
259        }
260        progress.start_scan(collected.file_count());
261        let mut result = match process_mode {
262            ProcessMode::Parallel(thread_count) => run_with_thread_pool(thread_count, || {
263                Ok(process_collected_with_memory_limit(
264                    &collected,
265                    Arc::clone(&progress),
266                    license_engine.clone(),
267                    license_options,
268                    &text_options,
269                    cli.max_in_memory,
270                ))
271            })?,
272            ProcessMode::SequentialWithTimeouts | ProcessMode::SequentialWithoutTimeouts => {
273                process_collected_with_memory_limit_sequential(
274                    &collected,
275                    Arc::clone(&progress),
276                    license_engine.clone(),
277                    license_options,
278                    &text_options,
279                    cli.max_in_memory,
280                )
281            }
282        };
283
284        if cli.incremental {
285            let manifest_path = incremental_manifest_path(
286                cache_config.root_dir(),
287                &incremental_manifest_key(Path::new(&scan_path), &options_fingerprint),
288            );
289            let reused_files = partition_incremental_files(
290                &mut all_collected_files.clone(),
291                Path::new(&scan_path),
292                load_incremental_manifest(&manifest_path, &options_fingerprint)?.as_ref(),
293            );
294            result.files =
295                merge_incremental_file_results(result.files, reused_files, &ordered_file_paths);
296
297            let manifest = build_incremental_manifest(
298                Path::new(&scan_path),
299                &all_collected_files,
300                &result.files,
301                &options_fingerprint,
302            );
303            write_incremental_manifest(cache_config.root_dir(), &manifest_path, &manifest)?;
304        }
305
306        result.excluded_count = excluded_count;
307        progress.finish_scan();
308
309        (
310            result,
311            total_dirs,
312            assembly::AssemblyResult {
313                packages: Vec::new(),
314                dependencies: Vec::new(),
315            },
316            Vec::new(),
317            Vec::new(),
318            Vec::new(),
319            runtime_errors,
320            paths_file_warnings,
321            None,
322            None,
323            license_engine,
324        )
325    };
326
327    progress.start_post_scan();
328
329    if cli.filter_clues {
330        progress.post_scan_step("Filtering redundant clues...");
331        let clue_rule_lookup = record_detail_timing(&progress, "post-scan:filter-clues", || {
332            prepare_filter_clue_rule_lookup(
333                &scan_result.files,
334                active_license_engine.as_deref(),
335                cli.license_dataset_path.as_deref(),
336                shared_license_cache_config.as_ref(),
337            )
338        })?;
339        if let Some(clue_rule_lookup) = clue_rule_lookup.as_ref() {
340            filter_redundant_clues_with_rules(&mut scan_result.files, Some(clue_rule_lookup));
341        } else {
342            filter_redundant_clues(&mut scan_result.files);
343        }
344    }
345
346    if !ignore_author_patterns.is_empty() || !ignore_copyright_holder_patterns.is_empty() {
347        progress.post_scan_step("Applying ignore-resource filters...");
348        record_detail_timing(&progress, "post-scan:ignore-resource", || {
349            apply_ignore_resource_filter(
350                &mut scan_result.files,
351                &ignore_copyright_holder_patterns,
352                &ignore_author_patterns,
353            );
354        });
355    }
356
357    if cli.from_json && (!cli.include.is_empty() || !cli.exclude.is_empty()) {
358        progress.post_scan_step("Applying path selection filters...");
359        record_detail_timing(&progress, "output-filter:path-selection", || {
360            apply_cli_path_selection_filter(&mut scan_result.files, &cli.include, &cli.exclude);
361        });
362    }
363
364    if cli.only_findings {
365        progress.post_scan_step("Filtering to files with findings...");
366        record_detail_timing(&progress, "output-filter:only-findings", || {
367            apply_only_findings_for_mode(&mut scan_result.files, cli.from_json);
368        });
369    }
370
371    if cli.info && cli.mark_source {
372        progress.post_scan_step("Marking source files...");
373        record_detail_timing(&progress, "post-scan:mark-source", || {
374            apply_mark_source(&mut scan_result.files);
375        });
376    }
377
378    if should_include_info_surface(&scan_result.files, &cli) {
379        progress.post_scan_step("Populating info resource counts...");
380        record_detail_timing(&progress, "post-scan:info-resource-counts", || {
381            populate_info_resource_counts(&mut scan_result.files);
382        });
383    }
384
385    progress.post_scan_step("Backfilling license provenance...");
386    record_detail_timing(&progress, "post-scan:license-provenance", || {
387        for file in &mut scan_result.files {
388            file.backfill_license_provenance();
389        }
390    });
391
392    if cli.from_json {
393        for err in &preloaded_extra_errors {
394            progress.record_additional_error(err);
395        }
396    }
397
398    let mut extra_errors = preloaded_extra_errors;
399    if let Some(policy_path) = cli.license_policy.as_deref() {
400        progress.post_scan_step("Applying license policy...");
401        let license_policy_errors =
402            record_detail_timing(&progress, "post-scan:license-policy", || {
403                apply_license_policy_from_file(&mut scan_result.files, Path::new(policy_path))
404            })?;
405        for err in &license_policy_errors {
406            progress.record_additional_error(err);
407        }
408        extra_errors.extend(license_policy_errors);
409    }
410
411    if cli.from_json {
412        progress.post_scan_step("Trimming preloaded assembly to filtered files...");
413        record_detail_timing(&progress, "post-scan:trim-preloaded-assembly", || {
414            trim_preloaded_assembly_to_files(
415                &scan_result.files,
416                &mut preloaded_assembly.packages,
417                &mut preloaded_assembly.dependencies,
418            );
419        });
420    }
421
422    progress.finish_post_scan();
423
424    let manifests_seen = scan_result
425        .files
426        .iter()
427        .map(|file| file.package_data.len())
428        .sum();
429    let skip_assembly = cli.no_assemble || cli.package_only;
430
431    let mut assembly_result = if skip_assembly {
432        assembly::AssemblyResult {
433            packages: Vec::new(),
434            dependencies: Vec::new(),
435        }
436    } else {
437        progress.start_assembly();
438
439        let mut result = if cli.from_json
440            && (!preloaded_assembly.packages.is_empty()
441                || !preloaded_assembly.dependencies.is_empty())
442        {
443            progress.assembly_step("Using preloaded assembly...");
444            preloaded_assembly
445        } else {
446            assembly::assemble(&mut scan_result.files)
447        };
448
449        progress.assembly_step("Backfilling package license provenance...");
450        record_detail_timing(&progress, "assembly:package-license-provenance", || {
451            for package in &mut result.packages {
452                package.backfill_license_provenance();
453            }
454        });
455
456        progress.assembly_step("Applying package reference following...");
457        record_detail_timing(&progress, "assembly:package-reference-following", || {
458            apply_package_reference_following(&mut scan_result.files, &mut result.packages);
459        });
460
461        progress.finish_assembly(result.packages.len(), manifests_seen);
462        result
463    };
464
465    progress.start_finalize();
466
467    if !cli.from_json && (cli.strip_root || cli.full_root) {
468        let root_path = cli
469            .dir_path
470            .first()
471            .ok_or_else(|| anyhow!("No input path available for path normalization"))?;
472        progress.finalize_step("Normalizing paths...");
473        record_detail_timing(&progress, "finalize:path-normalization", || {
474            normalize_paths(
475                &mut scan_result.files,
476                root_path,
477                cli.strip_root,
478                cli.full_root,
479            );
480            normalize_top_level_output_paths(
481                &mut assembly_result.packages,
482                &mut assembly_result.dependencies,
483                root_path,
484                cli.strip_root,
485            );
486        });
487    }
488
489    progress.finalize_step("Collecting license detections...");
490    let license_detections = record_detail_timing(&progress, "finalize:license-detections", || {
491        let preserve_preloaded_top_level_detections = cli.from_json
492            && (cli.only_findings || !cli.include.is_empty() || !cli.exclude.is_empty());
493        collect_top_level_license_detections_for_mode(
494            &scan_result.files,
495            preloaded_license_detections,
496            preserve_preloaded_top_level_detections,
497            cli.from_json && cli.dir_path.len() > 1,
498        )
499    });
500
501    let should_recompute_license_references = cli.from_json
502        && (!preloaded_license_references.is_empty()
503            || !preloaded_license_rule_references.is_empty()
504            || cli.license_references
505            || (cli.license_url_template != DEFAULT_LICENSEDB_URL_TEMPLATE
506                && !preloaded_license_references.is_empty()));
507
508    if should_recompute_license_references && active_license_engine.is_none() {
509        progress.start_license_detection_engine_creation();
510        active_license_engine = Some(init_license_engine(
511            shared_cache_config
512                .as_ref()
513                .expect("cache config should be prepared before license engine init"),
514            &cli,
515        )?);
516        progress.finish_license_detection_engine_creation("finalize:license-engine-creation");
517    }
518
519    progress.finalize_step("Collecting license references...");
520    let (license_references, license_rule_references) =
521        record_detail_timing(&progress, "finalize:license-references", || {
522            if cli.from_json && !should_recompute_license_references {
523                (
524                    preloaded_license_references,
525                    preloaded_license_rule_references,
526                )
527            } else if cli.license_references || should_recompute_license_references {
528                if let Some(engine) = active_license_engine.as_deref() {
529                    collect_top_level_license_references(
530                        &scan_result.files,
531                        &assembly_result.packages,
532                        engine.index(),
533                        &cli.license_url_template,
534                    )
535                } else {
536                    (Vec::new(), Vec::new())
537                }
538            } else {
539                (Vec::new(), Vec::new())
540            }
541        });
542
543    let end_time = Utc::now();
544    let spdx_license_list_version = active_license_engine
545        .as_ref()
546        .and_then(|engine| engine.spdx_license_list_version().map(ToOwned::to_owned))
547        .or(imported_spdx_license_list_version)
548        .unwrap_or(LicenseDetectionEngine::embedded_spdx_license_list_version()?);
549    let license_index_provenance = active_license_engine
550        .as_ref()
551        .and_then(|engine| engine.license_index_provenance().cloned())
552        .or(imported_license_index_provenance);
553
554    progress.finalize_step("Preparing output...");
555    let output = record_detail_timing(&progress, "finalize:output-prepare", || {
556        create_output(
557            start_time,
558            end_time,
559            scan_result,
560            CreateOutputContext {
561                total_dirs,
562                assembly_result,
563                license_detections,
564                license_references,
565                license_rule_references,
566                spdx_license_list_version,
567                license_index_provenance,
568                extra_errors,
569                extra_warnings,
570                header_options: cli.output_header_options(),
571                options: CreateOutputOptions {
572                    facet_rules: &facet_rules,
573                    include_classify: cli.classify,
574                    include_summary: cli.summary,
575                    include_license_clarity_score: cli.license_clarity_score,
576                    include_tallies: cli.tallies,
577                    include_tallies_of_key_files: cli.tallies_key_files,
578                    include_tallies_with_details: cli.tallies_with_details,
579                    include_tallies_by_facet: cli.tallies_by_facet,
580                    include_generated: cli.generated,
581                    verbose: cli.verbose,
582                },
583            },
584        )
585    });
586    progress.finish_finalize();
587
588    let output_schema_output = crate::output_schema::Output::from(&output);
589    progress.start_output();
590    for target in cli.output_targets() {
591        let output_config = OutputWriteConfig {
592            format: target.format,
593            custom_template: target.custom_template.clone(),
594            scanned_path: if cli.dir_path.len() == 1 {
595                cli.dir_path.first().cloned()
596            } else {
597                None
598            },
599        };
600
601        let timing_name = format!("output:{:?}", target.format).to_lowercase();
602        record_detail_timing(&progress, timing_name, || {
603            write_output_file(&target.file, &output_schema_output, &output_config)
604        })?;
605        progress.output_written(&format!(
606            "{:?} output written to {}",
607            target.format, target.file
608        ));
609    }
610    progress.record_final_counts(&output.files);
611    progress.record_final_header_counts(&output.headers);
612    progress.finish_output();
613
614    let summary_end = Utc::now();
615    progress.display_summary(
616        &format_scancode_timestamp(&start_time),
617        &format_scancode_timestamp(&summary_end),
618    );
619
620    Ok(())
621}
622
623fn apply_only_findings_for_mode(files: &mut Vec<FileInfo>, from_json: bool) {
624    if from_json {
625        files.clear();
626    } else {
627        apply_only_findings_filter(files);
628    }
629}
630
631fn collect_top_level_license_detections_for_mode(
632    files: &[FileInfo],
633    preloaded: Vec<crate::models::TopLevelLicenseDetection>,
634    preserve_preloaded: bool,
635    clear_for_multi_input_replay: bool,
636) -> Vec<crate::models::TopLevelLicenseDetection> {
637    if clear_for_multi_input_replay {
638        Vec::new()
639    } else if preserve_preloaded {
640        preloaded
641    } else {
642        collect_top_level_license_detections(files)
643    }
644}
645
/// Names the golden-test helper functions so that they are referenced from this
/// module when the `golden-tests` feature is enabled.
// NOTE(review): presumably this keeps the helpers from being reported as unused in
// non-test builds of the feature — confirm.
#[cfg(feature = "golden-tests")]
fn touch_license_golden_symbols() {
    use crate::license_detection::golden_utils::{
        detect_license_expressions_for_golden, detect_matches_for_golden,
        read_golden_input_content,
    };

    let _ = read_golden_input_content;
    let _ = detect_matches_for_golden;
    let _ = detect_license_expressions_for_golden;
    let _ = crate::license_detection::LicenseDetectionEngine::detect_matches_with_kind;
}
653
654fn resolve_native_scan_selection(cli: &Cli) -> Result<(String, Vec<SelectedPath>, Vec<String>)> {
655    if cli.paths_file.is_empty() {
656        let (scan_path, selected_paths) = resolve_native_scan_inputs(&cli.dir_path)?;
657        return Ok((scan_path, selected_paths, Vec::new()));
658    }
659
660    let scan_path = cli
661        .dir_path
662        .first()
663        .cloned()
664        .ok_or_else(|| anyhow!("--paths-file requires one positional scan root"))?;
665    let path_file_entries = load_paths_file_entries(&cli.paths_file)?;
666    let resolved = resolve_paths_file_entries(Path::new(&scan_path), &path_file_entries)?;
667    if resolved.selections.is_empty() {
668        return Err(anyhow!(
669            "--paths-file did not resolve to any existing files or directories under {:?}",
670            Path::new(&scan_path)
671        ));
672    }
673
674    Ok((scan_path, resolved.selections, resolved.missing_entries))
675}
676
677fn load_paths_file_entries(paths_files: &[String]) -> Result<Vec<String>> {
678    let mut entries = Vec::new();
679    for paths_file in paths_files {
680        let content = read_paths_file_content(paths_file)?;
681        entries.extend(content.lines().map(ToOwned::to_owned));
682    }
683    Ok(entries)
684}
685
686fn read_paths_file_content(paths_file: &str) -> Result<String> {
687    if paths_file == "-" {
688        let mut content = String::new();
689        std::io::stdin()
690            .read_to_string(&mut content)
691            .map_err(|err| anyhow!("Failed to read --paths-file from stdin: {err}"))?;
692        return Ok(content);
693    }
694
695    fs::read_to_string(paths_file)
696        .map_err(|err| anyhow!("Failed to read --paths-file {:?}: {err}", paths_file))
697}
698
/// Builds one warning line per paths-file entry that could not be found,
/// preserving the order of `missing_entries`.
fn build_paths_file_warning_messages(missing_entries: &[String]) -> Vec<String> {
    let mut warnings = Vec::with_capacity(missing_entries.len());
    for entry in missing_entries {
        warnings.push(format!("Skipping missing --paths-file entry: {entry}"));
    }
    warnings
}
705
706fn validate_scan_option_compatibility(cli: &Cli) -> Result<()> {
707    if cli.show_attribution {
708        return Ok(());
709    }
710
711    if cli.export_license_dataset.is_some() {
712        if !cli.dir_path.is_empty() || !cli.paths_file.is_empty() {
713            return Err(anyhow!(
714                "--export-license-dataset does not accept scan input paths or --paths-file"
715            ));
716        }
717
718        if cli.from_json
719            || cli.license
720            || cli.package
721            || cli.system_package
722            || cli.package_in_compiled
723            || cli.package_only
724            || cli.copyright
725            || cli.email
726            || cli.url
727            || cli.generated
728            || cli.info
729            || cli.incremental
730            || cli.reindex
731            || cli.no_license_index_cache
732            || cli.license_dataset_path.is_some()
733        {
734            return Err(anyhow!(
735                "--export-license-dataset is a standalone mode and cannot be combined with scan or license-index flags"
736            ));
737        }
738
739        return Ok(());
740    }
741
742    if cli.from_json
743        && (cli.package
744            || cli.system_package
745            || cli.package_in_compiled
746            || cli.package_only
747            || cli.copyright
748            || cli.email
749            || cli.url
750            || cli.generated)
751    {
752        return Err(anyhow!(
753            "When using --from-json, file scan options like --package/--copyright/--email/--url/--generated are not allowed"
754        ));
755    }
756
757    if cli.from_json && !cli.paths_file.is_empty() {
758        return Err(anyhow!(
759            "--paths-file is only supported for native scan mode, not --from-json"
760        ));
761    }
762
763    if cli.from_json && cli.incremental {
764        return Err(anyhow!(
765            "--incremental is only supported for directory scan mode, not --from-json"
766        ));
767    }
768
769    if !cli.paths_file.is_empty() && cli.dir_path.len() != 1 {
770        return Err(anyhow!(
771            "--paths-file requires exactly one positional scan root"
772        ));
773    }
774
775    if !cli.from_json && cli.dir_path.is_empty() {
776        return Err(anyhow!("Directory path is required for scan operations"));
777    }
778
779    if cli.tallies_by_facet && cli.facet.is_empty() {
780        return Err(anyhow!(
781            "--tallies-by-facet requires at least one --facet <facet>=<pattern> definition"
782        ));
783    }
784
785    if cli.mark_source && !cli.info {
786        return Err(anyhow!("--mark-source requires --info"));
787    }
788
789    Ok(())
790}
791
792fn prepare_cache_config(scan_root: Option<&Path>, cli: &Cli) -> Result<CacheConfig> {
793    let env_cache_dir = env::var_os(CACHE_DIR_ENV_VAR).map(PathBuf::from);
794    let config = CacheConfig::from_overrides(
795        scan_root,
796        cli.cache_dir.as_deref().map(Path::new),
797        env_cache_dir.as_deref(),
798        cli.incremental,
799    );
800
801    if cli.cache_clear {
802        crate::cache::locking::with_exclusive_cache_lock(config.root_dir(), || {
803            config.clear_contents()
804        })?;
805    }
806
807    if config.incremental_enabled() {
808        config.ensure_dirs()?;
809    }
810
811    Ok(config)
812}
813
814fn build_license_cache_config(cache_root: &CacheConfig, cli: &Cli) -> LicenseCacheConfig {
815    LicenseCacheConfig::new(
816        cache_root.root_dir().to_path_buf(),
817        cli.reindex,
818        !cli.no_license_index_cache,
819    )
820}
821
822fn partition_incremental_files(
823    collected_files: &mut Vec<(PathBuf, fs::Metadata)>,
824    scan_root: &Path,
825    manifest: Option<&IncrementalManifest>,
826) -> Vec<FileInfo> {
827    let Some(manifest) = manifest else {
828        return Vec::new();
829    };
830
831    let mut files_to_scan = Vec::new();
832    let mut reused_files = Vec::new();
833
834    for (path, metadata) in collected_files.drain(..) {
835        let relative_path = normalize_relative_scan_path(&path, scan_root);
836        let Some(entry) = manifest.entry(&relative_path) else {
837            files_to_scan.push((path, metadata));
838            continue;
839        };
840
841        match manifest_entry_matches_path(entry, &path, &metadata) {
842            Ok(true) => reused_files.push(entry.file_info.clone()),
843            Ok(false) | Err(_) => files_to_scan.push((path, metadata)),
844        }
845    }
846
847    *collected_files = files_to_scan;
848    reused_files
849}
850
851fn merge_incremental_file_results(
852    processed_files: Vec<FileInfo>,
853    reused_files: Vec<FileInfo>,
854    ordered_file_paths: &[PathBuf],
855) -> Vec<FileInfo> {
856    let mut processed_file_entries = HashMap::new();
857    let mut directory_entries = Vec::new();
858    for file in processed_files {
859        if file.file_type == FileType::File {
860            processed_file_entries.insert(file.path.clone(), file);
861        } else {
862            directory_entries.push(file);
863        }
864    }
865
866    let mut reused_file_entries: HashMap<_, _> = reused_files
867        .into_iter()
868        .map(|file| (file.path.clone(), file))
869        .collect();
870
871    let mut merged_files = Vec::new();
872    for path in ordered_file_paths {
873        let path_string = path.to_string_lossy().to_string();
874        if let Some(file) = processed_file_entries.remove(&path_string) {
875            merged_files.push(file);
876            continue;
877        }
878
879        if let Some(file) = reused_file_entries.remove(&path_string) {
880            merged_files.push(file);
881        }
882    }
883
884    merged_files.extend(processed_file_entries.into_values());
885    merged_files.extend(reused_file_entries.into_values());
886    merged_files.extend(directory_entries);
887    merged_files
888}
889
890fn build_incremental_manifest(
891    scan_root: &Path,
892    collected_files: &[(PathBuf, fs::Metadata)],
893    files: &[FileInfo],
894    options_fingerprint: &str,
895) -> IncrementalManifest {
896    let files_by_relative_path: HashMap<_, _> = files
897        .iter()
898        .filter(|file| file.file_type == FileType::File)
899        .map(|file| {
900            (
901                normalize_relative_scan_path(Path::new(&file.path), scan_root),
902                file.clone(),
903            )
904        })
905        .collect();
906
907    let entries = collected_files
908        .iter()
909        .filter_map(|(path, metadata)| {
910            let relative_path = normalize_relative_scan_path(path, scan_root);
911            let state = metadata_fingerprint(metadata)?;
912            let file_info = files_by_relative_path.get(&relative_path)?.clone();
913            let content_sha256 = file_info.sha256.unwrap_or_else(|| {
914                fs::read(path)
915                    .map(|bytes| calculate_sha256(&bytes))
916                    .unwrap_or_else(|_| {
917                        Sha256Digest::from_hex(
918                            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
919                        )
920                        .unwrap()
921                    })
922            });
923            Some((
924                relative_path,
925                IncrementalManifestEntry {
926                    state,
927                    content_sha256,
928                    file_info,
929                },
930            ))
931        })
932        .collect::<BTreeMap<_, _>>();
933
934    IncrementalManifest::new(options_fingerprint.to_string(), entries)
935}
936
937fn incremental_manifest_key(scan_root: &Path, options_fingerprint: &str) -> String {
938    let canonical_root = fs::canonicalize(scan_root).unwrap_or_else(|_| scan_root.to_path_buf());
939    calculate_sha256(
940        format!(
941            "{}\n{options_fingerprint}",
942            canonical_root.to_string_lossy()
943        )
944        .as_bytes(),
945    )
946    .as_hex()
947}
948
/// Returns `path` relative to `scan_root` as a forward-slash separated string.
///
/// When `path` does not start with `scan_root`, the whole path is used
/// instead. Non-UTF-8 sequences are replaced lossily, and every backslash in
/// the result is turned into `/`.
fn normalize_relative_scan_path(path: &Path, scan_root: &Path) -> String {
    let relative = match path.strip_prefix(scan_root) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };

    relative.to_string_lossy().replace('\\', "/")
}
955
956fn compile_regex_patterns(option_name: &str, patterns: &[String]) -> Result<Vec<Regex>> {
957    patterns
958        .iter()
959        .map(|pattern| {
960            Regex::new(pattern).map_err(|err| {
961                anyhow!("Invalid regex for {option_name} pattern \"{pattern}\": {err}")
962            })
963        })
964        .collect()
965}
966
967fn effective_timeout_seconds(process_mode: ProcessMode, timeout_seconds: f64) -> f64 {
968    match process_mode {
969        ProcessMode::SequentialWithoutTimeouts => 0.0,
970        ProcessMode::Parallel(_) | ProcessMode::SequentialWithTimeouts => timeout_seconds,
971    }
972}
973
974fn process_mode_message(process_mode: ProcessMode) -> Option<&'static str> {
975    match process_mode {
976        ProcessMode::SequentialWithTimeouts => Some("Disabling multi-processing for debugging."),
977        ProcessMode::SequentialWithoutTimeouts => {
978            Some("Disabling multi-processing and multi-threading for debugging.")
979        }
980        ProcessMode::Parallel(_) => None,
981    }
982}
983
984fn progress_mode_from_cli(cli: &Cli) -> ProgressMode {
985    if cli.quiet {
986        ProgressMode::Quiet
987    } else if cli.verbose {
988        ProgressMode::Verbose
989    } else {
990        ProgressMode::Default
991    }
992}
993
994fn configured_scan_names(cli: &Cli) -> String {
995    let mut names = Vec::new();
996    if cli.license {
997        names.push("licenses");
998    }
999    if cli.info {
1000        names.push("info");
1001    }
1002    if cli.package {
1003        names.push("packages");
1004    }
1005    if (cli.system_package || cli.package_in_compiled || cli.package_only)
1006        && !names.contains(&"packages")
1007    {
1008        names.push("packages");
1009    }
1010    if cli.copyright {
1011        names.push("copyrights");
1012    }
1013    if cli.email {
1014        names.push("emails");
1015    }
1016    if cli.url {
1017        names.push("urls");
1018    }
1019    names.join(", ")
1020}
1021
1022fn should_include_info_surface(files: &[crate::models::FileInfo], cli: &Cli) -> bool {
1023    cli.info
1024        || files.iter().any(|file| {
1025            file.date.is_some()
1026                || file.sha1.is_some()
1027                || file.md5.is_some()
1028                || file.sha256.is_some()
1029                || file.sha1_git.is_some()
1030                || file.mime_type.is_some()
1031                || file.file_type_label.is_some()
1032                || file.programming_language.is_some()
1033                || file.is_binary.is_some()
1034                || file.is_text.is_some()
1035                || file.is_archive.is_some()
1036                || file.is_media.is_some()
1037                || file.is_source.is_some()
1038                || file.is_script.is_some()
1039                || file.files_count.is_some()
1040                || file.dirs_count.is_some()
1041                || file.size_count.is_some()
1042        })
1043}
1044
1045fn record_detail_timing<T, F>(progress: &Arc<ScanProgress>, name: impl Into<String>, f: F) -> T
1046where
1047    F: FnOnce() -> T,
1048{
1049    let started = Instant::now();
1050    let result = f();
1051    progress.record_detail_timing(name.into(), started.elapsed().as_secs_f64());
1052    result
1053}
1054
1055fn run_with_thread_pool<T, F>(threads: usize, f: F) -> Result<T>
1056where
1057    F: FnOnce() -> Result<T> + Send,
1058    T: Send,
1059{
1060    let pool = rayon::ThreadPoolBuilder::new()
1061        .num_threads(threads.max(1))
1062        .build()?;
1063    pool.install(f)
1064}
1065
1066fn init_license_engine(cache_root: &CacheConfig, cli: &Cli) -> Result<Arc<LicenseDetectionEngine>> {
1067    let cache_config = build_license_cache_config(cache_root, cli);
1068
1069    match &cli.license_dataset_path {
1070        Some(p) => {
1071            let path = PathBuf::from(p);
1072            if !path.exists() {
1073                return Err(anyhow!("License dataset path does not exist: {:?}", path));
1074            }
1075            let engine = LicenseDetectionEngine::from_directory_with_cache(&path, &cache_config)?;
1076            Ok(Arc::new(engine))
1077        }
1078        None => {
1079            let engine = LicenseDetectionEngine::from_embedded_with_cache(&cache_config)?;
1080            Ok(Arc::new(engine))
1081        }
1082    }
1083}
1084
1085fn describe_license_engine_source(
1086    engine: &LicenseDetectionEngine,
1087    rules_path: Option<&str>,
1088) -> String {
1089    match rules_path {
1090        Some(path) => format!(
1091            "License detection engine initialized with {} rules from custom dataset {}",
1092            engine.index().rules_by_rid.len(),
1093            path
1094        ),
1095        None => format!(
1096            "License detection engine initialized with {} rules from embedded artifact",
1097            engine.index().rules_by_rid.len()
1098        ),
1099    }
1100}
1101
1102#[cfg(test)]
1103mod tests;