Skip to main content

provenant/cli/run/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use crate::assembly;
5use crate::cache::{
6    CACHE_DIR_ENV_VAR, CacheConfig, IncrementalManifest, IncrementalManifestEntry,
7    build_collection_exclude_patterns, incremental_manifest_path, load_incremental_manifest,
8    manifest_entry_matches_path, metadata_fingerprint, write_incremental_manifest,
9};
10use crate::cli::{Cli, Command, ProcessMode, ScanArgs};
11use crate::compare::compare_json_files;
12use crate::license_detection::LicenseDetectionEngine;
13use crate::license_detection::dataset::export_embedded_license_dataset;
14use crate::license_detection::license_cache::LicenseCacheConfig;
15use crate::models::{FileInfo, FileType, Sha256Digest};
16use crate::output::{OutputWriteConfig, write_output_file};
17use crate::post_processing::{
18    CreateOutputContext, CreateOutputOptions, DEFAULT_LICENSEDB_URL_TEMPLATE,
19    apply_license_policy_from_file, apply_package_reference_following, build_facet_rules,
20    collect_top_level_license_detections, collect_top_level_license_references, create_output,
21};
22use crate::progress::{ProgressMode, ScanProgress, format_default_scan_error};
23use crate::scan_result_shaping::{
24    SelectedPath, apply_cli_path_selection_filter, apply_ignore_resource_filter, apply_mark_source,
25    apply_only_findings_filter, apply_user_path_filters_to_collected, filter_redundant_clues,
26    filter_redundant_clues_with_rules, load_and_merge_json_inputs, normalize_paths,
27    normalize_top_level_output_paths, populate_info_resource_counts,
28    prepare_filter_clue_rule_lookup, resolve_native_scan_inputs, resolve_paths_file_entries,
29    trim_preloaded_assembly_to_files,
30};
31use crate::scanner::{
32    CollectionFrontier, LicenseScanOptions, TextDetectionOptions, collect_paths,
33    collect_selected_paths, process_collected_with_memory_limit,
34    process_collected_with_memory_limit_sequential, scan_options_fingerprint,
35};
36use crate::time::format_scancode_timestamp;
37use crate::utils::hash::calculate_sha256;
38use anyhow::{Result, anyhow};
39use chrono::Utc;
40use regex::Regex;
41use std::collections::{BTreeMap, HashMap};
42use std::env;
43use std::fs;
44use std::io::Read;
45use std::path::{Path, PathBuf};
46use std::sync::Arc;
47use std::time::Instant;
48
49pub fn run() -> Result<()> {
50    #[cfg(feature = "golden-tests")]
51    touch_license_golden_symbols();
52
53    let cli = Cli::parse();
54    match &cli.command {
55        Command::ShowAttribution => {
56            print!("{}", include_str!("../../../NOTICE"));
57            return Ok(());
58        }
59        Command::Compare(args) => {
60            let result = compare_json_files(
61                &args.scancode_json,
62                &args.provenant_json,
63                args.artifact_dir.as_deref(),
64            )?;
65            println!("Comparison status: {}", result.comparison_status);
66            println!("Artifacts:");
67            println!("  Artifact directory: {}", result.artifact_dir.display());
68            println!("  Run manifest:       {}", result.manifest_path.display());
69            println!("  Raw ScanCode JSON:  {}", result.scancode_json.display());
70            println!("  Raw Provenant JSON: {}", result.provenant_json.display());
71            println!("  Summary JSON:       {}", result.summary_json.display());
72            println!("  Summary TSV:        {}", result.summary_tsv.display());
73            println!("  Sample artifacts:   {}", result.samples_dir.display());
74            return Ok(());
75        }
76        Command::ExportLicenseDataset(args) => {
77            export_embedded_license_dataset(Path::new(&args.dir))?;
78            return Ok(());
79        }
80        Command::Scan(_) => {}
81    }
82
83    let cli = cli
84        .scan_args()
85        .expect("scan arguments should exist after command dispatch");
86
87    validate_scan_option_compatibility(cli)?;
88
89    let start_time = Utc::now();
90    let progress = Arc::new(ScanProgress::new(progress_mode_from_cli(cli)));
91    progress.set_processes(cli.processes);
92    progress.set_scan_names(configured_scan_names(cli));
93    progress.init_logging_bridge();
94    let mut shared_license_cache_config: Option<LicenseCacheConfig> = None;
95
96    progress.start_setup();
97    let facet_rules = build_facet_rules(&cli.facet)?;
98
99    let ignore_author_patterns = compile_regex_patterns("--ignore-author", &cli.ignore_author)?;
100    let ignore_copyright_holder_patterns =
101        compile_regex_patterns("--ignore-copyright-holder", &cli.ignore_copyright_holder)?;
102    progress.finish_setup();
103
104    progress.start_discovery();
105
106    let mut shared_cache_config = if cli.from_json {
107        let cache_config = prepare_cache_config(None, cli)?;
108        shared_license_cache_config = Some(build_license_cache_config(&cache_config, cli));
109        Some(cache_config)
110    } else {
111        None
112    };
113
114    let (
115        mut scan_result,
116        total_dirs,
117        mut preloaded_assembly,
118        preloaded_license_detections,
119        preloaded_license_references,
120        preloaded_license_rule_references,
121        preloaded_extra_errors,
122        extra_warnings,
123        imported_spdx_license_list_version,
124        imported_license_index_provenance,
125        mut active_license_engine,
126    ) = if cli.from_json {
127        let loaded = load_and_merge_json_inputs(&cli.dir_path, cli.strip_root, cli.full_root)?;
128        let directories_count = loaded.directory_count();
129        let files_count = loaded.file_count();
130        let size_count = loaded.file_size_count();
131        progress.finish_discovery(
132            files_count,
133            directories_count,
134            size_count,
135            loaded.excluded_count,
136        );
137        let (
138            process_result,
139            assembly_result,
140            license_detections,
141            license_references,
142            license_rule_references,
143            extra_errors,
144            imported_spdx_license_list_version,
145            imported_license_index_provenance,
146        ) = loaded.into_parts()?;
147        (
148            process_result,
149            directories_count,
150            assembly_result,
151            license_detections,
152            license_references,
153            license_rule_references,
154            extra_errors,
155            Vec::new(),
156            imported_spdx_license_list_version,
157            imported_license_index_provenance,
158            None,
159        )
160    } else {
161        let NativeScanSelection {
162            scan_path,
163            selected_paths,
164            collection_frontier,
165            missing_entries: missing_paths_file_entries,
166        } = resolve_native_scan_selection(cli)?;
167        let paths_file_warnings = build_paths_file_warning_messages(&missing_paths_file_entries);
168        for warning in &paths_file_warnings {
169            progress.output_written(warning);
170        }
171
172        let cache_config = prepare_cache_config(Some(Path::new(&scan_path)), cli)?;
173        shared_license_cache_config = Some(build_license_cache_config(&cache_config, cli));
174        shared_cache_config = Some(cache_config.clone());
175        let collection_exclude_patterns =
176            build_collection_exclude_patterns(Path::new(&scan_path), cache_config.root_dir());
177
178        let mut collected = if cli.paths_file.is_empty() {
179            collect_paths(&scan_path, cli.max_depth, &collection_exclude_patterns)
180        } else {
181            collect_selected_paths(
182                Path::new(&scan_path),
183                &collection_frontier,
184                cli.max_depth,
185                &collection_exclude_patterns,
186            )
187        };
188        let user_excluded_count = apply_user_path_filters_to_collected(
189            &mut collected,
190            Path::new(&scan_path),
191            &selected_paths,
192            &cli.include,
193            &cli.exclude,
194        );
195        let total_files = collected.file_count();
196        let total_dirs = collected.directory_count();
197        let total_size = collected.total_file_bytes;
198        let excluded_count = collected.excluded_count + user_excluded_count;
199        let all_collected_files = collected.files.clone();
200        let ordered_file_paths: Vec<PathBuf> = collected
201            .files
202            .iter()
203            .map(|(path, _)| path.clone())
204            .collect();
205        let runtime_errors = collected
206            .collection_errors
207            .iter()
208            .map(|(path, err)| format_default_scan_error(path, err))
209            .collect();
210        for (path, err) in &collected.collection_errors {
211            progress.record_runtime_error(path, err);
212        }
213        progress.finish_discovery(total_files, total_dirs, total_size, excluded_count);
214        if !cli.quiet {
215            progress.output_written(&format!(
216                "Found {} files in {} directories ({} items excluded)",
217                total_files, total_dirs, excluded_count
218            ));
219        }
220
221        let license_engine = if cli.license {
222            progress.start_setup();
223            progress.start_license_detection_engine_creation();
224            let engine = init_license_engine(
225                shared_cache_config
226                    .as_ref()
227                    .expect("cache config should be prepared before license engine init"),
228                cli,
229            )?;
230            progress.finish_license_detection_engine_creation("setup_scan:licenses");
231            progress.finish_setup();
232            progress.output_written(&describe_license_engine_source(
233                &engine,
234                cli.license_dataset_path.as_deref(),
235            ));
236            Some(engine)
237        } else {
238            None
239        };
240
241        let enable_application_packages = cli.package || cli.package_only;
242        let enable_system_packages = cli.system_package || cli.package_only;
243        let enable_packages =
244            enable_application_packages || enable_system_packages || cli.package_in_compiled;
245        let (detect_copyrights, detect_emails, detect_urls, detect_generated) = if cli.package_only
246        {
247            (false, cli.email, cli.url, cli.generated)
248        } else {
249            (cli.copyright, cli.email, cli.url, cli.generated)
250        };
251        let process_mode = cli.processes;
252
253        let text_options = TextDetectionOptions {
254            collect_info: cli.info,
255            detect_packages: enable_packages,
256            detect_application_packages: enable_application_packages,
257            detect_system_packages: enable_system_packages,
258            detect_packages_in_compiled: cli.package_in_compiled,
259            detect_copyrights,
260            detect_generated,
261            detect_emails,
262            detect_urls,
263            max_emails: cli.max_email,
264            max_urls: cli.max_url,
265            timeout_seconds: effective_timeout_seconds(process_mode, cli.timeout),
266        };
267
268        let license_options = LicenseScanOptions {
269            include_text: cli.license_text,
270            include_text_diagnostics: cli.license_text_diagnostics,
271            include_diagnostics: cli.license_diagnostics,
272            unknown_licenses: cli.unknown_licenses,
273            min_score: cli.license_score,
274        };
275        let options_fingerprint =
276            scan_options_fingerprint(&text_options, license_options, license_engine.as_deref());
277
278        if cli.incremental {
279            let manifest_path = incremental_manifest_path(
280                cache_config.root_dir(),
281                &incremental_manifest_key(Path::new(&scan_path), &options_fingerprint),
282            );
283            let previous_manifest =
284                load_incremental_manifest(&manifest_path, &options_fingerprint)?;
285            let reused_files = partition_incremental_files(
286                &mut collected.files,
287                Path::new(&scan_path),
288                previous_manifest.as_ref(),
289            );
290            progress.record_incremental_reused(reused_files.len());
291        }
292
293        if let Some(message) = process_mode_message(process_mode) {
294            progress.output_written(message);
295        }
296        progress.start_scan(collected.file_count());
297        let mut result = match process_mode {
298            ProcessMode::Parallel(thread_count) => run_with_thread_pool(thread_count, || {
299                Ok(process_collected_with_memory_limit(
300                    &collected,
301                    Arc::clone(&progress),
302                    license_engine.clone(),
303                    license_options,
304                    &text_options,
305                    cli.max_in_memory,
306                ))
307            })?,
308            ProcessMode::SequentialWithTimeouts | ProcessMode::SequentialWithoutTimeouts => {
309                process_collected_with_memory_limit_sequential(
310                    &collected,
311                    Arc::clone(&progress),
312                    license_engine.clone(),
313                    license_options,
314                    &text_options,
315                    cli.max_in_memory,
316                )
317            }
318        };
319
320        if cli.incremental {
321            let manifest_path = incremental_manifest_path(
322                cache_config.root_dir(),
323                &incremental_manifest_key(Path::new(&scan_path), &options_fingerprint),
324            );
325            let reused_files = partition_incremental_files(
326                &mut all_collected_files.clone(),
327                Path::new(&scan_path),
328                load_incremental_manifest(&manifest_path, &options_fingerprint)?.as_ref(),
329            );
330            result.files =
331                merge_incremental_file_results(result.files, reused_files, &ordered_file_paths);
332
333            let manifest = build_incremental_manifest(
334                Path::new(&scan_path),
335                &all_collected_files,
336                &result.files,
337                &options_fingerprint,
338            );
339            write_incremental_manifest(cache_config.root_dir(), &manifest_path, &manifest)?;
340        }
341
342        result.excluded_count = excluded_count;
343        progress.finish_scan();
344
345        (
346            result,
347            total_dirs,
348            assembly::AssemblyResult {
349                packages: Vec::new(),
350                dependencies: Vec::new(),
351            },
352            Vec::new(),
353            Vec::new(),
354            Vec::new(),
355            runtime_errors,
356            paths_file_warnings,
357            None,
358            None,
359            license_engine,
360        )
361    };
362
363    progress.start_post_scan();
364
365    if cli.filter_clues {
366        progress.post_scan_step("Filtering redundant clues...");
367        let clue_rule_lookup = record_detail_timing(&progress, "post-scan:filter-clues", || {
368            prepare_filter_clue_rule_lookup(
369                &scan_result.files,
370                active_license_engine.as_deref(),
371                cli.license_dataset_path.as_deref(),
372                shared_license_cache_config.as_ref(),
373            )
374        })?;
375        if let Some(clue_rule_lookup) = clue_rule_lookup.as_ref() {
376            filter_redundant_clues_with_rules(&mut scan_result.files, Some(clue_rule_lookup));
377        } else {
378            filter_redundant_clues(&mut scan_result.files);
379        }
380    }
381
382    if !ignore_author_patterns.is_empty() || !ignore_copyright_holder_patterns.is_empty() {
383        progress.post_scan_step("Applying ignore-resource filters...");
384        record_detail_timing(&progress, "post-scan:ignore-resource", || {
385            apply_ignore_resource_filter(
386                &mut scan_result.files,
387                &ignore_copyright_holder_patterns,
388                &ignore_author_patterns,
389            );
390        });
391    }
392
393    if cli.from_json && (!cli.include.is_empty() || !cli.exclude.is_empty()) {
394        progress.post_scan_step("Applying path selection filters...");
395        record_detail_timing(&progress, "output-filter:path-selection", || {
396            apply_cli_path_selection_filter(&mut scan_result.files, &cli.include, &cli.exclude);
397        });
398    }
399
400    if cli.only_findings {
401        progress.post_scan_step("Filtering to resources with findings...");
402        record_detail_timing(&progress, "output-filter:only-findings", || {
403            apply_only_findings_filter(&mut scan_result.files);
404        });
405    }
406
407    if cli.info && cli.mark_source {
408        progress.post_scan_step("Marking source files...");
409        record_detail_timing(&progress, "post-scan:mark-source", || {
410            apply_mark_source(&mut scan_result.files);
411        });
412    }
413
414    if should_include_info_surface(&scan_result.files, cli) {
415        progress.post_scan_step("Populating info resource counts...");
416        record_detail_timing(&progress, "post-scan:info-resource-counts", || {
417            populate_info_resource_counts(&mut scan_result.files);
418        });
419    }
420
421    progress.post_scan_step("Backfilling license provenance...");
422    record_detail_timing(&progress, "post-scan:license-provenance", || {
423        for file in &mut scan_result.files {
424            file.backfill_license_provenance();
425        }
426    });
427
428    if cli.from_json {
429        for err in &preloaded_extra_errors {
430            progress.record_additional_error(err);
431        }
432    }
433
434    let mut extra_errors = preloaded_extra_errors;
435    if let Some(policy_path) = cli.license_policy.as_deref() {
436        progress.post_scan_step("Applying license policy...");
437        let license_policy_errors =
438            record_detail_timing(&progress, "post-scan:license-policy", || {
439                apply_license_policy_from_file(&mut scan_result.files, Path::new(policy_path))
440            })?;
441        for err in &license_policy_errors {
442            progress.record_additional_error(err);
443        }
444        extra_errors.extend(license_policy_errors);
445    }
446
447    if cli.from_json {
448        progress.post_scan_step("Trimming preloaded assembly to filtered files...");
449        record_detail_timing(&progress, "post-scan:trim-preloaded-assembly", || {
450            trim_preloaded_assembly_to_files(
451                &scan_result.files,
452                &mut preloaded_assembly.packages,
453                &mut preloaded_assembly.dependencies,
454            );
455        });
456    }
457
458    progress.finish_post_scan();
459
460    let manifests_seen = scan_result
461        .files
462        .iter()
463        .map(|file| file.package_data.len())
464        .sum();
465    let skip_assembly = cli.no_assemble || cli.package_only;
466
467    let mut assembly_result = if skip_assembly {
468        assembly::AssemblyResult {
469            packages: Vec::new(),
470            dependencies: Vec::new(),
471        }
472    } else {
473        progress.start_assembly();
474
475        let mut result = if cli.from_json
476            && (!preloaded_assembly.packages.is_empty()
477                || !preloaded_assembly.dependencies.is_empty())
478        {
479            progress.assembly_step("Using preloaded assembly...");
480            preloaded_assembly
481        } else {
482            assembly::assemble(&mut scan_result.files)
483        };
484
485        progress.assembly_step("Backfilling package license provenance...");
486        record_detail_timing(&progress, "assembly:package-license-provenance", || {
487            for package in &mut result.packages {
488                package.backfill_license_provenance();
489            }
490        });
491
492        progress.assembly_step("Applying package reference following...");
493        record_detail_timing(&progress, "assembly:package-reference-following", || {
494            apply_package_reference_following(&mut scan_result.files, &mut result.packages);
495        });
496
497        progress.finish_assembly(result.packages.len(), manifests_seen);
498        result
499    };
500
501    progress.start_finalize();
502
503    if !cli.from_json && (cli.strip_root || cli.full_root) {
504        let root_path = cli
505            .dir_path
506            .first()
507            .ok_or_else(|| anyhow!("No input path available for path normalization"))?;
508        progress.finalize_step("Normalizing paths...");
509        record_detail_timing(&progress, "finalize:path-normalization", || {
510            normalize_paths(
511                &mut scan_result.files,
512                root_path,
513                cli.strip_root,
514                cli.full_root,
515            );
516            normalize_top_level_output_paths(
517                &mut assembly_result.packages,
518                &mut assembly_result.dependencies,
519                root_path,
520                cli.strip_root,
521            );
522        });
523    }
524
525    progress.finalize_step("Collecting license detections...");
526    let license_detections = record_detail_timing(&progress, "finalize:license-detections", || {
527        let preserve_preloaded_top_level_detections = cli.from_json
528            && (cli.only_findings || !cli.include.is_empty() || !cli.exclude.is_empty());
529        collect_top_level_license_detections_for_mode(
530            &scan_result.files,
531            preloaded_license_detections,
532            preserve_preloaded_top_level_detections,
533            cli.from_json && cli.dir_path.len() > 1,
534        )
535    });
536
537    let should_recompute_license_references = cli.from_json
538        && (!preloaded_license_references.is_empty()
539            || !preloaded_license_rule_references.is_empty()
540            || cli.license_references
541            || (cli.license_url_template != DEFAULT_LICENSEDB_URL_TEMPLATE
542                && !preloaded_license_references.is_empty()));
543
544    if should_recompute_license_references && active_license_engine.is_none() {
545        progress.start_license_detection_engine_creation();
546        active_license_engine = Some(init_license_engine(
547            shared_cache_config
548                .as_ref()
549                .expect("cache config should be prepared before license engine init"),
550            cli,
551        )?);
552        progress.finish_license_detection_engine_creation("finalize:license-engine-creation");
553    }
554
555    progress.finalize_step("Collecting license references...");
556    let (license_references, license_rule_references) =
557        record_detail_timing(&progress, "finalize:license-references", || {
558            if cli.from_json && !should_recompute_license_references {
559                (
560                    preloaded_license_references,
561                    preloaded_license_rule_references,
562                )
563            } else if cli.license_references || should_recompute_license_references {
564                if let Some(engine) = active_license_engine.as_deref() {
565                    collect_top_level_license_references(
566                        &scan_result.files,
567                        &assembly_result.packages,
568                        engine.index(),
569                        &cli.license_url_template,
570                    )
571                } else {
572                    (Vec::new(), Vec::new())
573                }
574            } else {
575                (Vec::new(), Vec::new())
576            }
577        });
578
579    let end_time = Utc::now();
580    let spdx_license_list_version = active_license_engine
581        .as_ref()
582        .and_then(|engine| engine.spdx_license_list_version().map(ToOwned::to_owned))
583        .or(imported_spdx_license_list_version)
584        .unwrap_or(LicenseDetectionEngine::embedded_spdx_license_list_version()?);
585    let license_index_provenance = active_license_engine
586        .as_ref()
587        .and_then(|engine| engine.license_index_provenance().cloned())
588        .or(imported_license_index_provenance);
589
590    progress.finalize_step("Preparing output...");
591    let output = record_detail_timing(&progress, "finalize:output-prepare", || {
592        create_output(
593            start_time,
594            end_time,
595            scan_result,
596            CreateOutputContext {
597                total_dirs,
598                assembly_result,
599                license_detections,
600                license_references,
601                license_rule_references,
602                spdx_license_list_version,
603                license_index_provenance,
604                extra_errors,
605                extra_warnings,
606                header_options: cli.output_header_options(),
607                options: CreateOutputOptions {
608                    facet_rules: &facet_rules,
609                    include_classify: cli.classify,
610                    include_summary: cli.summary,
611                    include_license_clarity_score: cli.license_clarity_score,
612                    include_tallies: cli.tallies,
613                    include_tallies_of_key_files: cli.tallies_key_files,
614                    include_tallies_with_details: cli.tallies_with_details,
615                    include_tallies_by_facet: cli.tallies_by_facet,
616                    include_generated: cli.generated,
617                    verbose: cli.verbose,
618                },
619            },
620        )
621    });
622    progress.finish_finalize();
623
624    let output_schema_output = crate::output_schema::Output::from(&output);
625    progress.start_output();
626    for target in cli.output_targets() {
627        let output_config = OutputWriteConfig {
628            format: target.format,
629            custom_template: target.custom_template.clone(),
630            scanned_path: if cli.dir_path.len() == 1 {
631                cli.dir_path.first().cloned()
632            } else {
633                None
634            },
635        };
636
637        let timing_name = format!("output:{:?}", target.format).to_lowercase();
638        record_detail_timing(&progress, timing_name, || {
639            write_output_file(&target.file, &output_schema_output, &output_config)
640        })?;
641        progress.output_written(&format!(
642            "{:?} output written to {}",
643            target.format, target.file
644        ));
645    }
646    progress.record_final_counts(&output.files);
647    progress.record_final_header_counts(&output.headers);
648    progress.finish_output();
649
650    let summary_end = Utc::now();
651    progress.display_summary(
652        &format_scancode_timestamp(&start_time),
653        &format_scancode_timestamp(&summary_end),
654    );
655
656    Ok(())
657}
658
659fn collect_top_level_license_detections_for_mode(
660    files: &[FileInfo],
661    preloaded: Vec<crate::models::TopLevelLicenseDetection>,
662    preserve_preloaded: bool,
663    clear_for_multi_input_replay: bool,
664) -> Vec<crate::models::TopLevelLicenseDetection> {
665    if clear_for_multi_input_replay {
666        Vec::new()
667    } else if preserve_preloaded {
668        preloaded
669    } else {
670        collect_top_level_license_detections(files)
671    }
672}
673
/// Takes a reference to each golden-test helper without calling it.
///
/// Only compiled with the `golden-tests` feature. NOTE(review): presumably
/// this exists so the helpers are not reported as unused in this build
/// configuration — confirm before removing.
#[cfg(feature = "golden-tests")]
fn touch_license_golden_symbols() {
    use crate::license_detection::golden_utils;

    let _ = golden_utils::read_golden_input_content;
    let _ = golden_utils::detect_matches_for_golden;
    let _ = golden_utils::detect_license_expressions_for_golden;
    let _ = LicenseDetectionEngine::detect_matches_with_kind;
}
681
682#[derive(Debug)]
683struct NativeScanSelection {
684    scan_path: String,
685    selected_paths: Vec<SelectedPath>,
686    collection_frontier: Vec<CollectionFrontier>,
687    missing_entries: Vec<String>,
688}
689
690fn resolve_native_scan_selection(cli: &ScanArgs) -> Result<NativeScanSelection> {
691    if cli.paths_file.is_empty() {
692        let (scan_path, selected_paths) = resolve_native_scan_inputs(&cli.dir_path)?;
693        return Ok(NativeScanSelection {
694            scan_path,
695            selected_paths,
696            collection_frontier: Vec::new(),
697            missing_entries: Vec::new(),
698        });
699    }
700
701    let scan_path = cli
702        .dir_path
703        .first()
704        .cloned()
705        .ok_or_else(|| anyhow!("--paths-file requires one positional scan root"))?;
706    let path_file_entries = load_paths_file_entries(&cli.paths_file)?;
707    let resolved = resolve_paths_file_entries(Path::new(&scan_path), &path_file_entries)?;
708    if resolved.selections.is_empty() {
709        return Err(anyhow!(
710            "--paths-file did not resolve to any existing files or directories under {:?}",
711            Path::new(&scan_path)
712        ));
713    }
714
715    Ok(NativeScanSelection {
716        scan_path,
717        selected_paths: resolved.selections,
718        collection_frontier: resolved.frontier,
719        missing_entries: resolved.missing_entries,
720    })
721}
722
723fn load_paths_file_entries(paths_files: &[String]) -> Result<Vec<String>> {
724    let mut entries = Vec::new();
725    for paths_file in paths_files {
726        let content = read_paths_file_content(paths_file)?;
727        entries.extend(content.lines().map(ToOwned::to_owned));
728    }
729    Ok(entries)
730}
731
732fn read_paths_file_content(paths_file: &str) -> Result<String> {
733    if paths_file == "-" {
734        let mut content = String::new();
735        std::io::stdin()
736            .read_to_string(&mut content)
737            .map_err(|err| anyhow!("Failed to read --paths-file from stdin: {err}"))?;
738        return Ok(content);
739    }
740
741    fs::read_to_string(paths_file)
742        .map_err(|err| anyhow!("Failed to read --paths-file {:?}: {err}", paths_file))
743}
744
/// Produces one warning line per `--paths-file` entry that was skipped because
/// it is missing, preserving the input order.
fn build_paths_file_warning_messages(missing_entries: &[String]) -> Vec<String> {
    let mut warnings = Vec::with_capacity(missing_entries.len());
    for entry in missing_entries {
        warnings.push(format!("Skipping missing --paths-file entry: {entry}"));
    }
    warnings
}
751
752fn validate_scan_option_compatibility(cli: &ScanArgs) -> Result<()> {
753    if cli.from_json
754        && (cli.package
755            || cli.system_package
756            || cli.package_in_compiled
757            || cli.package_only
758            || cli.copyright
759            || cli.email
760            || cli.url
761            || cli.generated)
762    {
763        return Err(anyhow!(
764            "When using --from-json, file scan options like --package/--copyright/--email/--url/--generated are not allowed"
765        ));
766    }
767
768    if cli.from_json && !cli.paths_file.is_empty() {
769        return Err(anyhow!(
770            "--paths-file is only supported for native scan mode, not --from-json"
771        ));
772    }
773
774    if cli.from_json && cli.incremental {
775        return Err(anyhow!(
776            "--incremental is only supported for directory scan mode, not --from-json"
777        ));
778    }
779
780    if !cli.paths_file.is_empty() && cli.dir_path.len() != 1 {
781        return Err(anyhow!(
782            "--paths-file requires exactly one positional scan root"
783        ));
784    }
785
786    if !cli.from_json && cli.dir_path.is_empty() {
787        return Err(anyhow!("Directory path is required for scan operations"));
788    }
789
790    if cli.tallies_by_facet && cli.facet.is_empty() {
791        return Err(anyhow!(
792            "--tallies-by-facet requires at least one --facet <facet>=<pattern> definition"
793        ));
794    }
795
796    if cli.mark_source && !cli.info {
797        return Err(anyhow!("--mark-source requires --info"));
798    }
799
800    Ok(())
801}
802
803fn prepare_cache_config(scan_root: Option<&Path>, cli: &ScanArgs) -> Result<CacheConfig> {
804    let env_cache_dir = env::var_os(CACHE_DIR_ENV_VAR).map(PathBuf::from);
805    let config = CacheConfig::from_overrides(
806        scan_root,
807        cli.cache_dir.as_deref().map(Path::new),
808        env_cache_dir.as_deref(),
809        cli.incremental,
810    );
811
812    if cli.cache_clear {
813        crate::cache::locking::with_exclusive_cache_lock(config.root_dir(), || {
814            config.clear_contents()
815        })?;
816    }
817
818    if config.incremental_enabled() {
819        config.ensure_dirs()?;
820    }
821
822    Ok(config)
823}
824
825fn build_license_cache_config(cache_root: &CacheConfig, cli: &ScanArgs) -> LicenseCacheConfig {
826    LicenseCacheConfig::new(
827        cache_root.root_dir().to_path_buf(),
828        cli.reindex,
829        !cli.no_license_index_cache,
830    )
831}
832
833fn partition_incremental_files(
834    collected_files: &mut Vec<(PathBuf, fs::Metadata)>,
835    scan_root: &Path,
836    manifest: Option<&IncrementalManifest>,
837) -> Vec<FileInfo> {
838    let Some(manifest) = manifest else {
839        return Vec::new();
840    };
841
842    let mut files_to_scan = Vec::new();
843    let mut reused_files = Vec::new();
844
845    for (path, metadata) in collected_files.drain(..) {
846        let relative_path = normalize_relative_scan_path(&path, scan_root);
847        let Some(entry) = manifest.entry(&relative_path) else {
848            files_to_scan.push((path, metadata));
849            continue;
850        };
851
852        match manifest_entry_matches_path(entry, &path, &metadata) {
853            Ok(true) => reused_files.push(entry.file_info.clone()),
854            Ok(false) | Err(_) => files_to_scan.push((path, metadata)),
855        }
856    }
857
858    *collected_files = files_to_scan;
859    reused_files
860}
861
862fn merge_incremental_file_results(
863    processed_files: Vec<FileInfo>,
864    reused_files: Vec<FileInfo>,
865    ordered_file_paths: &[PathBuf],
866) -> Vec<FileInfo> {
867    let mut processed_file_entries = HashMap::new();
868    let mut directory_entries = Vec::new();
869    for file in processed_files {
870        if file.file_type == FileType::File {
871            processed_file_entries.insert(file.path.clone(), file);
872        } else {
873            directory_entries.push(file);
874        }
875    }
876
877    let mut reused_file_entries: HashMap<_, _> = reused_files
878        .into_iter()
879        .map(|file| (file.path.clone(), file))
880        .collect();
881
882    let mut merged_files = Vec::new();
883    for path in ordered_file_paths {
884        let path_string = path.to_string_lossy().to_string();
885        if let Some(file) = processed_file_entries.remove(&path_string) {
886            merged_files.push(file);
887            continue;
888        }
889
890        if let Some(file) = reused_file_entries.remove(&path_string) {
891            merged_files.push(file);
892        }
893    }
894
895    merged_files.extend(processed_file_entries.into_values());
896    merged_files.extend(reused_file_entries.into_values());
897    merged_files.extend(directory_entries);
898    merged_files
899}
900
901fn build_incremental_manifest(
902    scan_root: &Path,
903    collected_files: &[(PathBuf, fs::Metadata)],
904    files: &[FileInfo],
905    options_fingerprint: &str,
906) -> IncrementalManifest {
907    let files_by_relative_path: HashMap<_, _> = files
908        .iter()
909        .filter(|file| file.file_type == FileType::File)
910        .map(|file| {
911            (
912                normalize_relative_scan_path(Path::new(&file.path), scan_root),
913                file.clone(),
914            )
915        })
916        .collect();
917
918    let entries = collected_files
919        .iter()
920        .filter_map(|(path, metadata)| {
921            let relative_path = normalize_relative_scan_path(path, scan_root);
922            let state = metadata_fingerprint(metadata)?;
923            let file_info = files_by_relative_path.get(&relative_path)?.clone();
924            let content_sha256 = file_info.sha256.unwrap_or_else(|| {
925                fs::read(path)
926                    .map(|bytes| calculate_sha256(&bytes))
927                    .unwrap_or_else(|_| {
928                        Sha256Digest::from_hex(
929                            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
930                        )
931                        .unwrap()
932                    })
933            });
934            Some((
935                relative_path,
936                IncrementalManifestEntry {
937                    state,
938                    content_sha256,
939                    file_info,
940                },
941            ))
942        })
943        .collect::<BTreeMap<_, _>>();
944
945    IncrementalManifest::new(options_fingerprint.to_string(), entries)
946}
947
948fn incremental_manifest_key(scan_root: &Path, options_fingerprint: &str) -> String {
949    let canonical_root = fs::canonicalize(scan_root).unwrap_or_else(|_| scan_root.to_path_buf());
950    calculate_sha256(
951        format!(
952            "{}\n{options_fingerprint}",
953            canonical_root.to_string_lossy()
954        )
955        .as_bytes(),
956    )
957    .as_hex()
958}
959
/// Returns `path` relative to `scan_root` as a string with forward slashes.
///
/// When `path` does not start with `scan_root`, the full `path` is used
/// instead. Non-UTF-8 parts are converted lossily, and every backslash is
/// replaced with `/`.
fn normalize_relative_scan_path(path: &Path, scan_root: &Path) -> String {
    let relative = match path.strip_prefix(scan_root) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };
    let rendered = relative.to_string_lossy();
    rendered.replace('\\', "/")
}
966
967fn compile_regex_patterns(option_name: &str, patterns: &[String]) -> Result<Vec<Regex>> {
968    patterns
969        .iter()
970        .map(|pattern| {
971            Regex::new(pattern).map_err(|err| {
972                anyhow!("Invalid regex for {option_name} pattern \"{pattern}\": {err}")
973            })
974        })
975        .collect()
976}
977
978fn effective_timeout_seconds(process_mode: ProcessMode, timeout_seconds: f64) -> f64 {
979    match process_mode {
980        ProcessMode::SequentialWithoutTimeouts => 0.0,
981        ProcessMode::Parallel(_) | ProcessMode::SequentialWithTimeouts => timeout_seconds,
982    }
983}
984
985fn process_mode_message(process_mode: ProcessMode) -> Option<&'static str> {
986    match process_mode {
987        ProcessMode::SequentialWithTimeouts => Some("Disabling multi-processing for debugging."),
988        ProcessMode::SequentialWithoutTimeouts => {
989            Some("Disabling multi-processing and multi-threading for debugging.")
990        }
991        ProcessMode::Parallel(_) => None,
992    }
993}
994
995fn progress_mode_from_cli(cli: &ScanArgs) -> ProgressMode {
996    if cli.quiet {
997        ProgressMode::Quiet
998    } else if cli.verbose {
999        ProgressMode::Verbose
1000    } else {
1001        ProgressMode::Default
1002    }
1003}
1004
1005fn configured_scan_names(cli: &ScanArgs) -> String {
1006    let mut names = Vec::new();
1007    if cli.license {
1008        names.push("licenses");
1009    }
1010    if cli.info {
1011        names.push("info");
1012    }
1013    if cli.package {
1014        names.push("packages");
1015    }
1016    if (cli.system_package || cli.package_in_compiled || cli.package_only)
1017        && !names.contains(&"packages")
1018    {
1019        names.push("packages");
1020    }
1021    if cli.copyright {
1022        names.push("copyrights");
1023    }
1024    if cli.email {
1025        names.push("emails");
1026    }
1027    if cli.url {
1028        names.push("urls");
1029    }
1030    names.join(", ")
1031}
1032
1033fn should_include_info_surface(files: &[crate::models::FileInfo], cli: &ScanArgs) -> bool {
1034    cli.info
1035        || files.iter().any(|file| {
1036            file.date.is_some()
1037                || file.sha1.is_some()
1038                || file.md5.is_some()
1039                || file.sha256.is_some()
1040                || file.sha1_git.is_some()
1041                || file.mime_type.is_some()
1042                || file.file_type_label.is_some()
1043                || file.programming_language.is_some()
1044                || file.is_binary.is_some()
1045                || file.is_text.is_some()
1046                || file.is_archive.is_some()
1047                || file.is_media.is_some()
1048                || file.is_source.is_some()
1049                || file.is_script.is_some()
1050                || file.files_count.is_some()
1051                || file.dirs_count.is_some()
1052                || file.size_count.is_some()
1053        })
1054}
1055
1056fn record_detail_timing<T, F>(progress: &Arc<ScanProgress>, name: impl Into<String>, f: F) -> T
1057where
1058    F: FnOnce() -> T,
1059{
1060    let started = Instant::now();
1061    let result = f();
1062    progress.record_detail_timing(name.into(), started.elapsed().as_secs_f64());
1063    result
1064}
1065
1066fn run_with_thread_pool<T, F>(threads: usize, f: F) -> Result<T>
1067where
1068    F: FnOnce() -> Result<T> + Send,
1069    T: Send,
1070{
1071    let pool = rayon::ThreadPoolBuilder::new()
1072        .num_threads(threads.max(1))
1073        .build()?;
1074    pool.install(f)
1075}
1076
1077fn init_license_engine(
1078    cache_root: &CacheConfig,
1079    cli: &ScanArgs,
1080) -> Result<Arc<LicenseDetectionEngine>> {
1081    let cache_config = build_license_cache_config(cache_root, cli);
1082
1083    match &cli.license_dataset_path {
1084        Some(p) => {
1085            let path = PathBuf::from(p);
1086            if !path.exists() {
1087                return Err(anyhow!("License dataset path does not exist: {:?}", path));
1088            }
1089            let engine = LicenseDetectionEngine::from_directory_with_cache(&path, &cache_config)?;
1090            Ok(Arc::new(engine))
1091        }
1092        None => {
1093            let engine = LicenseDetectionEngine::from_embedded_with_cache(&cache_config)?;
1094            Ok(Arc::new(engine))
1095        }
1096    }
1097}
1098
1099fn describe_license_engine_source(
1100    engine: &LicenseDetectionEngine,
1101    rules_path: Option<&str>,
1102) -> String {
1103    match rules_path {
1104        Some(path) => format!(
1105            "License detection engine initialized with {} rules from custom dataset {}",
1106            engine.index().rules_by_rid.len(),
1107            path
1108        ),
1109        None => format!(
1110            "License detection engine initialized with {} rules from embedded artifact",
1111            engine.index().rules_by_rid.len()
1112        ),
1113    }
1114}
1115
1116#[cfg(test)]
1117mod tests;