Skip to main content

provenant/cli/run/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use crate::assembly;
5use crate::cache::{
6    CACHE_DIR_ENV_VAR, CacheConfig, IncrementalManifest, IncrementalManifestEntry,
7    build_collection_exclude_patterns, incremental_manifest_path, load_incremental_manifest,
8    manifest_entry_matches_path, metadata_fingerprint, write_incremental_manifest,
9};
10use crate::cli::{Cli, ProcessMode};
11use crate::license_detection::LicenseDetectionEngine;
12use crate::license_detection::dataset::export_embedded_license_dataset;
13use crate::license_detection::license_cache::LicenseCacheConfig;
14use crate::models::{FileInfo, FileType, Sha256Digest};
15use crate::output::{OutputWriteConfig, write_output_file};
16use crate::post_processing::{
17    CreateOutputContext, CreateOutputOptions, DEFAULT_LICENSEDB_URL_TEMPLATE,
18    apply_license_policy_from_file, apply_package_reference_following, build_facet_rules,
19    collect_top_level_license_detections, collect_top_level_license_references, create_output,
20};
21use crate::progress::{ProgressMode, ScanProgress, format_default_scan_error};
22use crate::scan_result_shaping::{
23    SelectedPath, apply_cli_path_selection_filter, apply_ignore_resource_filter, apply_mark_source,
24    apply_only_findings_filter, apply_user_path_filters_to_collected, filter_redundant_clues,
25    filter_redundant_clues_with_rules, load_and_merge_json_inputs, normalize_paths,
26    normalize_top_level_output_paths, populate_info_resource_counts,
27    prepare_filter_clue_rule_lookup, resolve_native_scan_inputs, resolve_paths_file_entries,
28    trim_preloaded_assembly_to_files,
29};
30use crate::scanner::{
31    CollectionFrontier, LicenseScanOptions, TextDetectionOptions, collect_paths,
32    collect_selected_paths, process_collected_with_memory_limit,
33    process_collected_with_memory_limit_sequential, scan_options_fingerprint,
34};
35use crate::time::format_scancode_timestamp;
36use crate::utils::hash::calculate_sha256;
37use anyhow::{Result, anyhow};
38use chrono::Utc;
39use clap::Parser;
40use regex::Regex;
41use std::collections::{BTreeMap, HashMap};
42use std::env;
43use std::fs;
44use std::io::Read;
45use std::path::{Path, PathBuf};
46use std::sync::Arc;
47use std::time::Instant;
48
49pub fn run() -> Result<()> {
50    #[cfg(feature = "golden-tests")]
51    touch_license_golden_symbols();
52
53    let cli = Cli::parse();
54
55    validate_scan_option_compatibility(&cli)?;
56
57    if cli.show_attribution {
58        print!("{}", include_str!("../../../NOTICE"));
59        return Ok(());
60    }
61
62    if let Some(export_dir) = cli.export_license_dataset.as_deref() {
63        export_embedded_license_dataset(Path::new(export_dir))?;
64        return Ok(());
65    }
66
67    let start_time = Utc::now();
68    let progress = Arc::new(ScanProgress::new(progress_mode_from_cli(&cli)));
69    progress.set_processes(cli.processes);
70    progress.set_scan_names(configured_scan_names(&cli));
71    progress.init_logging_bridge();
72    let mut shared_license_cache_config: Option<LicenseCacheConfig> = None;
73
74    progress.start_setup();
75    let facet_rules = build_facet_rules(&cli.facet)?;
76
77    let ignore_author_patterns = compile_regex_patterns("--ignore-author", &cli.ignore_author)?;
78    let ignore_copyright_holder_patterns =
79        compile_regex_patterns("--ignore-copyright-holder", &cli.ignore_copyright_holder)?;
80    progress.finish_setup();
81
82    progress.start_discovery();
83
84    let mut shared_cache_config = if cli.from_json {
85        let cache_config = prepare_cache_config(None, &cli)?;
86        shared_license_cache_config = Some(build_license_cache_config(&cache_config, &cli));
87        Some(cache_config)
88    } else {
89        None
90    };
91
92    let (
93        mut scan_result,
94        total_dirs,
95        mut preloaded_assembly,
96        preloaded_license_detections,
97        preloaded_license_references,
98        preloaded_license_rule_references,
99        preloaded_extra_errors,
100        extra_warnings,
101        imported_spdx_license_list_version,
102        imported_license_index_provenance,
103        mut active_license_engine,
104    ) = if cli.from_json {
105        let loaded = load_and_merge_json_inputs(&cli.dir_path, cli.strip_root, cli.full_root)?;
106        let directories_count = loaded.directory_count();
107        let files_count = loaded.file_count();
108        let size_count = loaded.file_size_count();
109        progress.finish_discovery(
110            files_count,
111            directories_count,
112            size_count,
113            loaded.excluded_count,
114        );
115        let (
116            process_result,
117            assembly_result,
118            license_detections,
119            license_references,
120            license_rule_references,
121            extra_errors,
122            imported_spdx_license_list_version,
123            imported_license_index_provenance,
124        ) = loaded.into_parts()?;
125        (
126            process_result,
127            directories_count,
128            assembly_result,
129            license_detections,
130            license_references,
131            license_rule_references,
132            extra_errors,
133            Vec::new(),
134            imported_spdx_license_list_version,
135            imported_license_index_provenance,
136            None,
137        )
138    } else {
139        let NativeScanSelection {
140            scan_path,
141            selected_paths,
142            collection_frontier,
143            missing_entries: missing_paths_file_entries,
144        } = resolve_native_scan_selection(&cli)?;
145        let paths_file_warnings = build_paths_file_warning_messages(&missing_paths_file_entries);
146        for warning in &paths_file_warnings {
147            progress.output_written(warning);
148        }
149
150        let cache_config = prepare_cache_config(Some(Path::new(&scan_path)), &cli)?;
151        shared_license_cache_config = Some(build_license_cache_config(&cache_config, &cli));
152        shared_cache_config = Some(cache_config.clone());
153        let collection_exclude_patterns =
154            build_collection_exclude_patterns(Path::new(&scan_path), cache_config.root_dir());
155
156        let mut collected = if cli.paths_file.is_empty() {
157            collect_paths(&scan_path, cli.max_depth, &collection_exclude_patterns)
158        } else {
159            collect_selected_paths(
160                Path::new(&scan_path),
161                &collection_frontier,
162                cli.max_depth,
163                &collection_exclude_patterns,
164            )
165        };
166        let user_excluded_count = apply_user_path_filters_to_collected(
167            &mut collected,
168            Path::new(&scan_path),
169            &selected_paths,
170            &cli.include,
171            &cli.exclude,
172        );
173        let total_files = collected.file_count();
174        let total_dirs = collected.directory_count();
175        let total_size = collected.total_file_bytes;
176        let excluded_count = collected.excluded_count + user_excluded_count;
177        let all_collected_files = collected.files.clone();
178        let ordered_file_paths: Vec<PathBuf> = collected
179            .files
180            .iter()
181            .map(|(path, _)| path.clone())
182            .collect();
183        let runtime_errors = collected
184            .collection_errors
185            .iter()
186            .map(|(path, err)| format_default_scan_error(path, err))
187            .collect();
188        for (path, err) in &collected.collection_errors {
189            progress.record_runtime_error(path, err);
190        }
191        progress.finish_discovery(total_files, total_dirs, total_size, excluded_count);
192        if !cli.quiet {
193            progress.output_written(&format!(
194                "Found {} files in {} directories ({} items excluded)",
195                total_files, total_dirs, excluded_count
196            ));
197        }
198
199        let license_engine = if cli.license {
200            progress.start_setup();
201            progress.start_license_detection_engine_creation();
202            let engine = init_license_engine(
203                shared_cache_config
204                    .as_ref()
205                    .expect("cache config should be prepared before license engine init"),
206                &cli,
207            )?;
208            progress.finish_license_detection_engine_creation("setup_scan:licenses");
209            progress.finish_setup();
210            progress.output_written(&describe_license_engine_source(
211                &engine,
212                cli.license_dataset_path.as_deref(),
213            ));
214            Some(engine)
215        } else {
216            None
217        };
218
219        let enable_application_packages = cli.package || cli.package_only;
220        let enable_system_packages = cli.system_package || cli.package_only;
221        let enable_packages =
222            enable_application_packages || enable_system_packages || cli.package_in_compiled;
223        let (detect_copyrights, detect_emails, detect_urls, detect_generated) = if cli.package_only
224        {
225            (false, cli.email, cli.url, cli.generated)
226        } else {
227            (cli.copyright, cli.email, cli.url, cli.generated)
228        };
229        let process_mode = cli.processes;
230
231        let text_options = TextDetectionOptions {
232            collect_info: cli.info,
233            detect_packages: enable_packages,
234            detect_application_packages: enable_application_packages,
235            detect_system_packages: enable_system_packages,
236            detect_packages_in_compiled: cli.package_in_compiled,
237            detect_copyrights,
238            detect_generated,
239            detect_emails,
240            detect_urls,
241            max_emails: cli.max_email,
242            max_urls: cli.max_url,
243            timeout_seconds: effective_timeout_seconds(process_mode, cli.timeout),
244        };
245
246        let license_options = LicenseScanOptions {
247            include_text: cli.license_text,
248            include_text_diagnostics: cli.license_text_diagnostics,
249            include_diagnostics: cli.license_diagnostics,
250            unknown_licenses: cli.unknown_licenses,
251            min_score: cli.license_score,
252        };
253        let options_fingerprint =
254            scan_options_fingerprint(&text_options, license_options, license_engine.as_deref());
255
256        if cli.incremental {
257            let manifest_path = incremental_manifest_path(
258                cache_config.root_dir(),
259                &incremental_manifest_key(Path::new(&scan_path), &options_fingerprint),
260            );
261            let previous_manifest =
262                load_incremental_manifest(&manifest_path, &options_fingerprint)?;
263            let reused_files = partition_incremental_files(
264                &mut collected.files,
265                Path::new(&scan_path),
266                previous_manifest.as_ref(),
267            );
268            progress.record_incremental_reused(reused_files.len());
269        }
270
271        if let Some(message) = process_mode_message(process_mode) {
272            progress.output_written(message);
273        }
274        progress.start_scan(collected.file_count());
275        let mut result = match process_mode {
276            ProcessMode::Parallel(thread_count) => run_with_thread_pool(thread_count, || {
277                Ok(process_collected_with_memory_limit(
278                    &collected,
279                    Arc::clone(&progress),
280                    license_engine.clone(),
281                    license_options,
282                    &text_options,
283                    cli.max_in_memory,
284                ))
285            })?,
286            ProcessMode::SequentialWithTimeouts | ProcessMode::SequentialWithoutTimeouts => {
287                process_collected_with_memory_limit_sequential(
288                    &collected,
289                    Arc::clone(&progress),
290                    license_engine.clone(),
291                    license_options,
292                    &text_options,
293                    cli.max_in_memory,
294                )
295            }
296        };
297
298        if cli.incremental {
299            let manifest_path = incremental_manifest_path(
300                cache_config.root_dir(),
301                &incremental_manifest_key(Path::new(&scan_path), &options_fingerprint),
302            );
303            let reused_files = partition_incremental_files(
304                &mut all_collected_files.clone(),
305                Path::new(&scan_path),
306                load_incremental_manifest(&manifest_path, &options_fingerprint)?.as_ref(),
307            );
308            result.files =
309                merge_incremental_file_results(result.files, reused_files, &ordered_file_paths);
310
311            let manifest = build_incremental_manifest(
312                Path::new(&scan_path),
313                &all_collected_files,
314                &result.files,
315                &options_fingerprint,
316            );
317            write_incremental_manifest(cache_config.root_dir(), &manifest_path, &manifest)?;
318        }
319
320        result.excluded_count = excluded_count;
321        progress.finish_scan();
322
323        (
324            result,
325            total_dirs,
326            assembly::AssemblyResult {
327                packages: Vec::new(),
328                dependencies: Vec::new(),
329            },
330            Vec::new(),
331            Vec::new(),
332            Vec::new(),
333            runtime_errors,
334            paths_file_warnings,
335            None,
336            None,
337            license_engine,
338        )
339    };
340
341    progress.start_post_scan();
342
343    if cli.filter_clues {
344        progress.post_scan_step("Filtering redundant clues...");
345        let clue_rule_lookup = record_detail_timing(&progress, "post-scan:filter-clues", || {
346            prepare_filter_clue_rule_lookup(
347                &scan_result.files,
348                active_license_engine.as_deref(),
349                cli.license_dataset_path.as_deref(),
350                shared_license_cache_config.as_ref(),
351            )
352        })?;
353        if let Some(clue_rule_lookup) = clue_rule_lookup.as_ref() {
354            filter_redundant_clues_with_rules(&mut scan_result.files, Some(clue_rule_lookup));
355        } else {
356            filter_redundant_clues(&mut scan_result.files);
357        }
358    }
359
360    if !ignore_author_patterns.is_empty() || !ignore_copyright_holder_patterns.is_empty() {
361        progress.post_scan_step("Applying ignore-resource filters...");
362        record_detail_timing(&progress, "post-scan:ignore-resource", || {
363            apply_ignore_resource_filter(
364                &mut scan_result.files,
365                &ignore_copyright_holder_patterns,
366                &ignore_author_patterns,
367            );
368        });
369    }
370
371    if cli.from_json && (!cli.include.is_empty() || !cli.exclude.is_empty()) {
372        progress.post_scan_step("Applying path selection filters...");
373        record_detail_timing(&progress, "output-filter:path-selection", || {
374            apply_cli_path_selection_filter(&mut scan_result.files, &cli.include, &cli.exclude);
375        });
376    }
377
378    if cli.only_findings {
379        progress.post_scan_step("Filtering to files with findings...");
380        record_detail_timing(&progress, "output-filter:only-findings", || {
381            apply_only_findings_for_mode(&mut scan_result.files, cli.from_json);
382        });
383    }
384
385    if cli.info && cli.mark_source {
386        progress.post_scan_step("Marking source files...");
387        record_detail_timing(&progress, "post-scan:mark-source", || {
388            apply_mark_source(&mut scan_result.files);
389        });
390    }
391
392    if should_include_info_surface(&scan_result.files, &cli) {
393        progress.post_scan_step("Populating info resource counts...");
394        record_detail_timing(&progress, "post-scan:info-resource-counts", || {
395            populate_info_resource_counts(&mut scan_result.files);
396        });
397    }
398
399    progress.post_scan_step("Backfilling license provenance...");
400    record_detail_timing(&progress, "post-scan:license-provenance", || {
401        for file in &mut scan_result.files {
402            file.backfill_license_provenance();
403        }
404    });
405
406    if cli.from_json {
407        for err in &preloaded_extra_errors {
408            progress.record_additional_error(err);
409        }
410    }
411
412    let mut extra_errors = preloaded_extra_errors;
413    if let Some(policy_path) = cli.license_policy.as_deref() {
414        progress.post_scan_step("Applying license policy...");
415        let license_policy_errors =
416            record_detail_timing(&progress, "post-scan:license-policy", || {
417                apply_license_policy_from_file(&mut scan_result.files, Path::new(policy_path))
418            })?;
419        for err in &license_policy_errors {
420            progress.record_additional_error(err);
421        }
422        extra_errors.extend(license_policy_errors);
423    }
424
425    if cli.from_json {
426        progress.post_scan_step("Trimming preloaded assembly to filtered files...");
427        record_detail_timing(&progress, "post-scan:trim-preloaded-assembly", || {
428            trim_preloaded_assembly_to_files(
429                &scan_result.files,
430                &mut preloaded_assembly.packages,
431                &mut preloaded_assembly.dependencies,
432            );
433        });
434    }
435
436    progress.finish_post_scan();
437
438    let manifests_seen = scan_result
439        .files
440        .iter()
441        .map(|file| file.package_data.len())
442        .sum();
443    let skip_assembly = cli.no_assemble || cli.package_only;
444
445    let mut assembly_result = if skip_assembly {
446        assembly::AssemblyResult {
447            packages: Vec::new(),
448            dependencies: Vec::new(),
449        }
450    } else {
451        progress.start_assembly();
452
453        let mut result = if cli.from_json
454            && (!preloaded_assembly.packages.is_empty()
455                || !preloaded_assembly.dependencies.is_empty())
456        {
457            progress.assembly_step("Using preloaded assembly...");
458            preloaded_assembly
459        } else {
460            assembly::assemble(&mut scan_result.files)
461        };
462
463        progress.assembly_step("Backfilling package license provenance...");
464        record_detail_timing(&progress, "assembly:package-license-provenance", || {
465            for package in &mut result.packages {
466                package.backfill_license_provenance();
467            }
468        });
469
470        progress.assembly_step("Applying package reference following...");
471        record_detail_timing(&progress, "assembly:package-reference-following", || {
472            apply_package_reference_following(&mut scan_result.files, &mut result.packages);
473        });
474
475        progress.finish_assembly(result.packages.len(), manifests_seen);
476        result
477    };
478
479    progress.start_finalize();
480
481    if !cli.from_json && (cli.strip_root || cli.full_root) {
482        let root_path = cli
483            .dir_path
484            .first()
485            .ok_or_else(|| anyhow!("No input path available for path normalization"))?;
486        progress.finalize_step("Normalizing paths...");
487        record_detail_timing(&progress, "finalize:path-normalization", || {
488            normalize_paths(
489                &mut scan_result.files,
490                root_path,
491                cli.strip_root,
492                cli.full_root,
493            );
494            normalize_top_level_output_paths(
495                &mut assembly_result.packages,
496                &mut assembly_result.dependencies,
497                root_path,
498                cli.strip_root,
499            );
500        });
501    }
502
503    progress.finalize_step("Collecting license detections...");
504    let license_detections = record_detail_timing(&progress, "finalize:license-detections", || {
505        let preserve_preloaded_top_level_detections = cli.from_json
506            && (cli.only_findings || !cli.include.is_empty() || !cli.exclude.is_empty());
507        collect_top_level_license_detections_for_mode(
508            &scan_result.files,
509            preloaded_license_detections,
510            preserve_preloaded_top_level_detections,
511            cli.from_json && cli.dir_path.len() > 1,
512        )
513    });
514
515    let should_recompute_license_references = cli.from_json
516        && (!preloaded_license_references.is_empty()
517            || !preloaded_license_rule_references.is_empty()
518            || cli.license_references
519            || (cli.license_url_template != DEFAULT_LICENSEDB_URL_TEMPLATE
520                && !preloaded_license_references.is_empty()));
521
522    if should_recompute_license_references && active_license_engine.is_none() {
523        progress.start_license_detection_engine_creation();
524        active_license_engine = Some(init_license_engine(
525            shared_cache_config
526                .as_ref()
527                .expect("cache config should be prepared before license engine init"),
528            &cli,
529        )?);
530        progress.finish_license_detection_engine_creation("finalize:license-engine-creation");
531    }
532
533    progress.finalize_step("Collecting license references...");
534    let (license_references, license_rule_references) =
535        record_detail_timing(&progress, "finalize:license-references", || {
536            if cli.from_json && !should_recompute_license_references {
537                (
538                    preloaded_license_references,
539                    preloaded_license_rule_references,
540                )
541            } else if cli.license_references || should_recompute_license_references {
542                if let Some(engine) = active_license_engine.as_deref() {
543                    collect_top_level_license_references(
544                        &scan_result.files,
545                        &assembly_result.packages,
546                        engine.index(),
547                        &cli.license_url_template,
548                    )
549                } else {
550                    (Vec::new(), Vec::new())
551                }
552            } else {
553                (Vec::new(), Vec::new())
554            }
555        });
556
557    let end_time = Utc::now();
558    let spdx_license_list_version = active_license_engine
559        .as_ref()
560        .and_then(|engine| engine.spdx_license_list_version().map(ToOwned::to_owned))
561        .or(imported_spdx_license_list_version)
562        .unwrap_or(LicenseDetectionEngine::embedded_spdx_license_list_version()?);
563    let license_index_provenance = active_license_engine
564        .as_ref()
565        .and_then(|engine| engine.license_index_provenance().cloned())
566        .or(imported_license_index_provenance);
567
568    progress.finalize_step("Preparing output...");
569    let output = record_detail_timing(&progress, "finalize:output-prepare", || {
570        create_output(
571            start_time,
572            end_time,
573            scan_result,
574            CreateOutputContext {
575                total_dirs,
576                assembly_result,
577                license_detections,
578                license_references,
579                license_rule_references,
580                spdx_license_list_version,
581                license_index_provenance,
582                extra_errors,
583                extra_warnings,
584                header_options: cli.output_header_options(),
585                options: CreateOutputOptions {
586                    facet_rules: &facet_rules,
587                    include_classify: cli.classify,
588                    include_summary: cli.summary,
589                    include_license_clarity_score: cli.license_clarity_score,
590                    include_tallies: cli.tallies,
591                    include_tallies_of_key_files: cli.tallies_key_files,
592                    include_tallies_with_details: cli.tallies_with_details,
593                    include_tallies_by_facet: cli.tallies_by_facet,
594                    include_generated: cli.generated,
595                    verbose: cli.verbose,
596                },
597            },
598        )
599    });
600    progress.finish_finalize();
601
602    let output_schema_output = crate::output_schema::Output::from(&output);
603    progress.start_output();
604    for target in cli.output_targets() {
605        let output_config = OutputWriteConfig {
606            format: target.format,
607            custom_template: target.custom_template.clone(),
608            scanned_path: if cli.dir_path.len() == 1 {
609                cli.dir_path.first().cloned()
610            } else {
611                None
612            },
613        };
614
615        let timing_name = format!("output:{:?}", target.format).to_lowercase();
616        record_detail_timing(&progress, timing_name, || {
617            write_output_file(&target.file, &output_schema_output, &output_config)
618        })?;
619        progress.output_written(&format!(
620            "{:?} output written to {}",
621            target.format, target.file
622        ));
623    }
624    progress.record_final_counts(&output.files);
625    progress.record_final_header_counts(&output.headers);
626    progress.finish_output();
627
628    let summary_end = Utc::now();
629    progress.display_summary(
630        &format_scancode_timestamp(&start_time),
631        &format_scancode_timestamp(&summary_end),
632    );
633
634    Ok(())
635}
636
637fn apply_only_findings_for_mode(files: &mut Vec<FileInfo>, from_json: bool) {
638    if from_json {
639        files.clear();
640    } else {
641        apply_only_findings_filter(files);
642    }
643}
644
645fn collect_top_level_license_detections_for_mode(
646    files: &[FileInfo],
647    preloaded: Vec<crate::models::TopLevelLicenseDetection>,
648    preserve_preloaded: bool,
649    clear_for_multi_input_replay: bool,
650) -> Vec<crate::models::TopLevelLicenseDetection> {
651    if clear_for_multi_input_replay {
652        Vec::new()
653    } else if preserve_preloaded {
654        preloaded
655    } else {
656        collect_top_level_license_detections(files)
657    }
658}
659
/// Names the golden-test helper functions without calling them.
///
/// Only compiled with the `golden-tests` feature; presumably this keeps the
/// helpers referenced from the binary so they are not reported as unused.
#[cfg(feature = "golden-tests")]
fn touch_license_golden_symbols() {
    let _ = (
        crate::license_detection::golden_utils::read_golden_input_content,
        crate::license_detection::golden_utils::detect_matches_for_golden,
        crate::license_detection::golden_utils::detect_license_expressions_for_golden,
        crate::license_detection::LicenseDetectionEngine::detect_matches_with_kind,
    );
}
667
/// What a native (non `--from-json`) scan should cover, as resolved from the
/// positional inputs and any `--paths-file` entries.
#[derive(Debug)]
struct NativeScanSelection {
    /// Scan root; `run` uses it for cache configuration and path collection.
    scan_path: String,
    /// Selected paths passed to the user path filters after collection.
    selected_paths: Vec<SelectedPath>,
    /// Starting points for `collect_selected_paths`; empty without `--paths-file`.
    collection_frontier: Vec<CollectionFrontier>,
    /// `--paths-file` entries reported as missing; `run` turns them into warnings.
    missing_entries: Vec<String>,
}
675
676fn resolve_native_scan_selection(cli: &Cli) -> Result<NativeScanSelection> {
677    if cli.paths_file.is_empty() {
678        let (scan_path, selected_paths) = resolve_native_scan_inputs(&cli.dir_path)?;
679        return Ok(NativeScanSelection {
680            scan_path,
681            selected_paths,
682            collection_frontier: Vec::new(),
683            missing_entries: Vec::new(),
684        });
685    }
686
687    let scan_path = cli
688        .dir_path
689        .first()
690        .cloned()
691        .ok_or_else(|| anyhow!("--paths-file requires one positional scan root"))?;
692    let path_file_entries = load_paths_file_entries(&cli.paths_file)?;
693    let resolved = resolve_paths_file_entries(Path::new(&scan_path), &path_file_entries)?;
694    if resolved.selections.is_empty() {
695        return Err(anyhow!(
696            "--paths-file did not resolve to any existing files or directories under {:?}",
697            Path::new(&scan_path)
698        ));
699    }
700
701    Ok(NativeScanSelection {
702        scan_path,
703        selected_paths: resolved.selections,
704        collection_frontier: resolved.frontier,
705        missing_entries: resolved.missing_entries,
706    })
707}
708
709fn load_paths_file_entries(paths_files: &[String]) -> Result<Vec<String>> {
710    let mut entries = Vec::new();
711    for paths_file in paths_files {
712        let content = read_paths_file_content(paths_file)?;
713        entries.extend(content.lines().map(ToOwned::to_owned));
714    }
715    Ok(entries)
716}
717
718fn read_paths_file_content(paths_file: &str) -> Result<String> {
719    if paths_file == "-" {
720        let mut content = String::new();
721        std::io::stdin()
722            .read_to_string(&mut content)
723            .map_err(|err| anyhow!("Failed to read --paths-file from stdin: {err}"))?;
724        return Ok(content);
725    }
726
727    fs::read_to_string(paths_file)
728        .map_err(|err| anyhow!("Failed to read --paths-file {:?}: {err}", paths_file))
729}
730
/// Builds one warning line per `--paths-file` entry that was reported missing.
///
/// The output has the same length and order as `missing_entries`.
fn build_paths_file_warning_messages(missing_entries: &[String]) -> Vec<String> {
    let mut warnings = Vec::with_capacity(missing_entries.len());
    for entry in missing_entries {
        warnings.push(format!("Skipping missing --paths-file entry: {entry}"));
    }
    warnings
}
737
738fn validate_scan_option_compatibility(cli: &Cli) -> Result<()> {
739    if cli.show_attribution {
740        return Ok(());
741    }
742
743    if cli.export_license_dataset.is_some() {
744        if !cli.dir_path.is_empty() || !cli.paths_file.is_empty() {
745            return Err(anyhow!(
746                "--export-license-dataset does not accept scan input paths or --paths-file"
747            ));
748        }
749
750        if cli.from_json
751            || cli.license
752            || cli.package
753            || cli.system_package
754            || cli.package_in_compiled
755            || cli.package_only
756            || cli.copyright
757            || cli.email
758            || cli.url
759            || cli.generated
760            || cli.info
761            || cli.incremental
762            || cli.reindex
763            || cli.no_license_index_cache
764            || cli.license_dataset_path.is_some()
765        {
766            return Err(anyhow!(
767                "--export-license-dataset is a standalone mode and cannot be combined with scan or license-index flags"
768            ));
769        }
770
771        return Ok(());
772    }
773
774    if cli.from_json
775        && (cli.package
776            || cli.system_package
777            || cli.package_in_compiled
778            || cli.package_only
779            || cli.copyright
780            || cli.email
781            || cli.url
782            || cli.generated)
783    {
784        return Err(anyhow!(
785            "When using --from-json, file scan options like --package/--copyright/--email/--url/--generated are not allowed"
786        ));
787    }
788
789    if cli.from_json && !cli.paths_file.is_empty() {
790        return Err(anyhow!(
791            "--paths-file is only supported for native scan mode, not --from-json"
792        ));
793    }
794
795    if cli.from_json && cli.incremental {
796        return Err(anyhow!(
797            "--incremental is only supported for directory scan mode, not --from-json"
798        ));
799    }
800
801    if !cli.paths_file.is_empty() && cli.dir_path.len() != 1 {
802        return Err(anyhow!(
803            "--paths-file requires exactly one positional scan root"
804        ));
805    }
806
807    if !cli.from_json && cli.dir_path.is_empty() {
808        return Err(anyhow!("Directory path is required for scan operations"));
809    }
810
811    if cli.tallies_by_facet && cli.facet.is_empty() {
812        return Err(anyhow!(
813            "--tallies-by-facet requires at least one --facet <facet>=<pattern> definition"
814        ));
815    }
816
817    if cli.mark_source && !cli.info {
818        return Err(anyhow!("--mark-source requires --info"));
819    }
820
821    Ok(())
822}
823
824fn prepare_cache_config(scan_root: Option<&Path>, cli: &Cli) -> Result<CacheConfig> {
825    let env_cache_dir = env::var_os(CACHE_DIR_ENV_VAR).map(PathBuf::from);
826    let config = CacheConfig::from_overrides(
827        scan_root,
828        cli.cache_dir.as_deref().map(Path::new),
829        env_cache_dir.as_deref(),
830        cli.incremental,
831    );
832
833    if cli.cache_clear {
834        crate::cache::locking::with_exclusive_cache_lock(config.root_dir(), || {
835            config.clear_contents()
836        })?;
837    }
838
839    if config.incremental_enabled() {
840        config.ensure_dirs()?;
841    }
842
843    Ok(config)
844}
845
846fn build_license_cache_config(cache_root: &CacheConfig, cli: &Cli) -> LicenseCacheConfig {
847    LicenseCacheConfig::new(
848        cache_root.root_dir().to_path_buf(),
849        cli.reindex,
850        !cli.no_license_index_cache,
851    )
852}
853
854fn partition_incremental_files(
855    collected_files: &mut Vec<(PathBuf, fs::Metadata)>,
856    scan_root: &Path,
857    manifest: Option<&IncrementalManifest>,
858) -> Vec<FileInfo> {
859    let Some(manifest) = manifest else {
860        return Vec::new();
861    };
862
863    let mut files_to_scan = Vec::new();
864    let mut reused_files = Vec::new();
865
866    for (path, metadata) in collected_files.drain(..) {
867        let relative_path = normalize_relative_scan_path(&path, scan_root);
868        let Some(entry) = manifest.entry(&relative_path) else {
869            files_to_scan.push((path, metadata));
870            continue;
871        };
872
873        match manifest_entry_matches_path(entry, &path, &metadata) {
874            Ok(true) => reused_files.push(entry.file_info.clone()),
875            Ok(false) | Err(_) => files_to_scan.push((path, metadata)),
876        }
877    }
878
879    *collected_files = files_to_scan;
880    reused_files
881}
882
883fn merge_incremental_file_results(
884    processed_files: Vec<FileInfo>,
885    reused_files: Vec<FileInfo>,
886    ordered_file_paths: &[PathBuf],
887) -> Vec<FileInfo> {
888    let mut processed_file_entries = HashMap::new();
889    let mut directory_entries = Vec::new();
890    for file in processed_files {
891        if file.file_type == FileType::File {
892            processed_file_entries.insert(file.path.clone(), file);
893        } else {
894            directory_entries.push(file);
895        }
896    }
897
898    let mut reused_file_entries: HashMap<_, _> = reused_files
899        .into_iter()
900        .map(|file| (file.path.clone(), file))
901        .collect();
902
903    let mut merged_files = Vec::new();
904    for path in ordered_file_paths {
905        let path_string = path.to_string_lossy().to_string();
906        if let Some(file) = processed_file_entries.remove(&path_string) {
907            merged_files.push(file);
908            continue;
909        }
910
911        if let Some(file) = reused_file_entries.remove(&path_string) {
912            merged_files.push(file);
913        }
914    }
915
916    merged_files.extend(processed_file_entries.into_values());
917    merged_files.extend(reused_file_entries.into_values());
918    merged_files.extend(directory_entries);
919    merged_files
920}
921
922fn build_incremental_manifest(
923    scan_root: &Path,
924    collected_files: &[(PathBuf, fs::Metadata)],
925    files: &[FileInfo],
926    options_fingerprint: &str,
927) -> IncrementalManifest {
928    let files_by_relative_path: HashMap<_, _> = files
929        .iter()
930        .filter(|file| file.file_type == FileType::File)
931        .map(|file| {
932            (
933                normalize_relative_scan_path(Path::new(&file.path), scan_root),
934                file.clone(),
935            )
936        })
937        .collect();
938
939    let entries = collected_files
940        .iter()
941        .filter_map(|(path, metadata)| {
942            let relative_path = normalize_relative_scan_path(path, scan_root);
943            let state = metadata_fingerprint(metadata)?;
944            let file_info = files_by_relative_path.get(&relative_path)?.clone();
945            let content_sha256 = file_info.sha256.unwrap_or_else(|| {
946                fs::read(path)
947                    .map(|bytes| calculate_sha256(&bytes))
948                    .unwrap_or_else(|_| {
949                        Sha256Digest::from_hex(
950                            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
951                        )
952                        .unwrap()
953                    })
954            });
955            Some((
956                relative_path,
957                IncrementalManifestEntry {
958                    state,
959                    content_sha256,
960                    file_info,
961                },
962            ))
963        })
964        .collect::<BTreeMap<_, _>>();
965
966    IncrementalManifest::new(options_fingerprint.to_string(), entries)
967}
968
969fn incremental_manifest_key(scan_root: &Path, options_fingerprint: &str) -> String {
970    let canonical_root = fs::canonicalize(scan_root).unwrap_or_else(|_| scan_root.to_path_buf());
971    calculate_sha256(
972        format!(
973            "{}\n{options_fingerprint}",
974            canonical_root.to_string_lossy()
975        )
976        .as_bytes(),
977    )
978    .as_hex()
979}
980
/// Returns `path` relative to `scan_root` as a lossy UTF-8 string with every
/// backslash replaced by a forward slash.
///
/// When `path` does not start with `scan_root`, the full path is used as is.
fn normalize_relative_scan_path(path: &Path, scan_root: &Path) -> String {
    let relative = match path.strip_prefix(scan_root) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };

    relative.to_string_lossy().replace('\\', "/")
}
987
988fn compile_regex_patterns(option_name: &str, patterns: &[String]) -> Result<Vec<Regex>> {
989    patterns
990        .iter()
991        .map(|pattern| {
992            Regex::new(pattern).map_err(|err| {
993                anyhow!("Invalid regex for {option_name} pattern \"{pattern}\": {err}")
994            })
995        })
996        .collect()
997}
998
999fn effective_timeout_seconds(process_mode: ProcessMode, timeout_seconds: f64) -> f64 {
1000    match process_mode {
1001        ProcessMode::SequentialWithoutTimeouts => 0.0,
1002        ProcessMode::Parallel(_) | ProcessMode::SequentialWithTimeouts => timeout_seconds,
1003    }
1004}
1005
1006fn process_mode_message(process_mode: ProcessMode) -> Option<&'static str> {
1007    match process_mode {
1008        ProcessMode::SequentialWithTimeouts => Some("Disabling multi-processing for debugging."),
1009        ProcessMode::SequentialWithoutTimeouts => {
1010            Some("Disabling multi-processing and multi-threading for debugging.")
1011        }
1012        ProcessMode::Parallel(_) => None,
1013    }
1014}
1015
1016fn progress_mode_from_cli(cli: &Cli) -> ProgressMode {
1017    if cli.quiet {
1018        ProgressMode::Quiet
1019    } else if cli.verbose {
1020        ProgressMode::Verbose
1021    } else {
1022        ProgressMode::Default
1023    }
1024}
1025
1026fn configured_scan_names(cli: &Cli) -> String {
1027    let mut names = Vec::new();
1028    if cli.license {
1029        names.push("licenses");
1030    }
1031    if cli.info {
1032        names.push("info");
1033    }
1034    if cli.package {
1035        names.push("packages");
1036    }
1037    if (cli.system_package || cli.package_in_compiled || cli.package_only)
1038        && !names.contains(&"packages")
1039    {
1040        names.push("packages");
1041    }
1042    if cli.copyright {
1043        names.push("copyrights");
1044    }
1045    if cli.email {
1046        names.push("emails");
1047    }
1048    if cli.url {
1049        names.push("urls");
1050    }
1051    names.join(", ")
1052}
1053
1054fn should_include_info_surface(files: &[crate::models::FileInfo], cli: &Cli) -> bool {
1055    cli.info
1056        || files.iter().any(|file| {
1057            file.date.is_some()
1058                || file.sha1.is_some()
1059                || file.md5.is_some()
1060                || file.sha256.is_some()
1061                || file.sha1_git.is_some()
1062                || file.mime_type.is_some()
1063                || file.file_type_label.is_some()
1064                || file.programming_language.is_some()
1065                || file.is_binary.is_some()
1066                || file.is_text.is_some()
1067                || file.is_archive.is_some()
1068                || file.is_media.is_some()
1069                || file.is_source.is_some()
1070                || file.is_script.is_some()
1071                || file.files_count.is_some()
1072                || file.dirs_count.is_some()
1073                || file.size_count.is_some()
1074        })
1075}
1076
1077fn record_detail_timing<T, F>(progress: &Arc<ScanProgress>, name: impl Into<String>, f: F) -> T
1078where
1079    F: FnOnce() -> T,
1080{
1081    let started = Instant::now();
1082    let result = f();
1083    progress.record_detail_timing(name.into(), started.elapsed().as_secs_f64());
1084    result
1085}
1086
1087fn run_with_thread_pool<T, F>(threads: usize, f: F) -> Result<T>
1088where
1089    F: FnOnce() -> Result<T> + Send,
1090    T: Send,
1091{
1092    let pool = rayon::ThreadPoolBuilder::new()
1093        .num_threads(threads.max(1))
1094        .build()?;
1095    pool.install(f)
1096}
1097
1098fn init_license_engine(cache_root: &CacheConfig, cli: &Cli) -> Result<Arc<LicenseDetectionEngine>> {
1099    let cache_config = build_license_cache_config(cache_root, cli);
1100
1101    match &cli.license_dataset_path {
1102        Some(p) => {
1103            let path = PathBuf::from(p);
1104            if !path.exists() {
1105                return Err(anyhow!("License dataset path does not exist: {:?}", path));
1106            }
1107            let engine = LicenseDetectionEngine::from_directory_with_cache(&path, &cache_config)?;
1108            Ok(Arc::new(engine))
1109        }
1110        None => {
1111            let engine = LicenseDetectionEngine::from_embedded_with_cache(&cache_config)?;
1112            Ok(Arc::new(engine))
1113        }
1114    }
1115}
1116
1117fn describe_license_engine_source(
1118    engine: &LicenseDetectionEngine,
1119    rules_path: Option<&str>,
1120) -> String {
1121    match rules_path {
1122        Some(path) => format!(
1123            "License detection engine initialized with {} rules from custom dataset {}",
1124            engine.index().rules_by_rid.len(),
1125            path
1126        ),
1127        None => format!(
1128            "License detection engine initialized with {} rules from embedded artifact",
1129            engine.index().rules_by_rid.len()
1130        ),
1131    }
1132}
1133
1134#[cfg(test)]
1135mod tests;