Skip to main content

provenant/
workflow.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use serde_json::{Map as JsonMap, Value as JsonValue};
5use std::path::{Path, PathBuf};
6
7use crate::app::request::{InputMode, ScanRequest};
8use crate::app::scan_pipeline::execute_request;
9use crate::license_detection::DEFAULT_LICENSEDB_URL_TEMPLATE;
10use crate::progress::ProgressMode;
11use crate::scanner::MemoryMode;
12use crate::{Output, ProcessMode};
13
14#[derive(Debug, thiserror::Error)]
15pub enum WorkflowError {
16    #[error("{0}")]
17    InvalidOptions(String),
18    #[error(transparent)]
19    Pipeline(#[from] anyhow::Error),
20}
21
/// Selects how the workflow facade sources license rules.
///
/// `Disabled` turns license detection off; the other two variants turn it on and differ only in
/// where the dataset comes from.
///
/// Values can be compared with `==`, which makes it straightforward to assert on or branch over a
/// configured source.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LicenseSource {
    /// Skip license detection entirely.
    Disabled,
    /// Use the embedded Provenant license dataset.
    Embedded,
    /// Load a custom dataset from a directory containing the expected rules and licenses layout.
    Directory(PathBuf),
}
32
33/// High-level configuration for in-process scans through [`scan_path`] and [`scan_paths`].
34///
35/// Defaults stay intentionally conservative: progress is quiet, no scan dimensions are enabled,
36/// input headers are omitted, and ambient `PROVENANT_CACHE` is ignored unless you set
37/// [`ScanOptions::cache_dir`].
38#[derive(Debug, Clone)]
39pub struct ScanOptions {
40    pub progress_mode: ProgressMode,
41    pub process_mode: ProcessMode,
42    pub timeout_seconds: f64,
43    pub max_depth: usize,
44    pub max_in_memory: MemoryMode,
45    pub collect_info: bool,
46    pub detect_license: LicenseSource,
47    pub detect_packages: bool,
48    pub detect_system_packages: bool,
49    pub detect_packages_in_compiled: bool,
50    pub package_only: bool,
51    pub no_assemble: bool,
52    pub detect_copyrights: bool,
53    pub detect_emails: bool,
54    pub detect_urls: bool,
55    pub detect_generated: bool,
56    pub max_emails: usize,
57    pub max_urls: usize,
58    pub include: Vec<String>,
59    pub exclude: Vec<String>,
60    pub include_input_header: bool,
61    pub cache_dir: Option<PathBuf>,
62    pub cache_clear: bool,
63    pub incremental: bool,
64    pub reindex: bool,
65    pub no_license_index_cache: bool,
66    pub license_text: bool,
67    pub license_text_diagnostics: bool,
68    pub license_diagnostics: bool,
69    pub unknown_licenses: bool,
70    pub license_score: u8,
71    pub filter_clues: bool,
72    pub ignore_author_patterns: Vec<String>,
73    pub ignore_copyright_holder_patterns: Vec<String>,
74    pub only_findings: bool,
75    pub mark_source: bool,
76    pub classify: bool,
77    pub summary: bool,
78    pub license_clarity_score: bool,
79    pub license_references: bool,
80    pub license_url_template: String,
81    pub license_policy: Option<PathBuf>,
82    pub tallies: bool,
83    pub tallies_key_files: bool,
84    pub tallies_with_details: bool,
85    pub facets: Vec<String>,
86    pub tallies_by_facet: bool,
87    pub strip_root: bool,
88    pub full_root: bool,
89    pub header_options: JsonMap<String, JsonValue>,
90}
91
92impl Default for ScanOptions {
93    fn default() -> Self {
94        Self {
95            progress_mode: ProgressMode::Quiet,
96            process_mode: ProcessMode::default(),
97            timeout_seconds: 120.0,
98            max_depth: 0,
99            max_in_memory: MemoryMode::Limit(10_000),
100            collect_info: false,
101            detect_license: LicenseSource::Disabled,
102            detect_packages: false,
103            detect_system_packages: false,
104            detect_packages_in_compiled: false,
105            package_only: false,
106            no_assemble: false,
107            detect_copyrights: false,
108            detect_emails: false,
109            detect_urls: false,
110            detect_generated: false,
111            max_emails: 50,
112            max_urls: 50,
113            include: Vec::new(),
114            exclude: Vec::new(),
115            include_input_header: false,
116            cache_dir: None,
117            cache_clear: false,
118            incremental: false,
119            reindex: false,
120            no_license_index_cache: false,
121            license_text: false,
122            license_text_diagnostics: false,
123            license_diagnostics: false,
124            unknown_licenses: false,
125            license_score: 0,
126            filter_clues: false,
127            ignore_author_patterns: Vec::new(),
128            ignore_copyright_holder_patterns: Vec::new(),
129            only_findings: false,
130            mark_source: false,
131            classify: false,
132            summary: false,
133            license_clarity_score: false,
134            license_references: false,
135            license_url_template: DEFAULT_LICENSEDB_URL_TEMPLATE.to_string(),
136            license_policy: None,
137            tallies: false,
138            tallies_key_files: false,
139            tallies_with_details: false,
140            facets: Vec::new(),
141            tallies_by_facet: false,
142            strip_root: false,
143            full_root: false,
144            header_options: JsonMap::new(),
145        }
146    }
147}
148
149/// Scan a single native filesystem input through the supported high-level workflow facade.
150///
151/// ```
152/// use provenant::workflow::{scan_path, ScanOptions};
153/// use std::fs;
154/// use tempfile::tempdir;
155///
156/// let root = tempdir()?;
157/// let root = root.path();
158/// fs::write(root.join("README.txt"), "hello from doctest\n")?;
159///
160/// let output = scan_path(&root, &ScanOptions::default())?;
161/// assert!(output.files.iter().any(|file| file.path.ends_with("README.txt")));
162/// assert_eq!(output.headers.len(), 1);
163/// assert!(!output.headers[0].options.contains_key("input"));
164/// # Ok::<(), Box<dyn std::error::Error>>(())
165/// ```
166pub fn scan_path(path: impl AsRef<Path>, options: &ScanOptions) -> Result<Output, WorkflowError> {
167    scan_paths([path.as_ref()], options)
168}
169
170/// Scan multiple native filesystem inputs in one in-process workflow run.
171///
172/// Absolute paths are supported as long as they can be resolved through a shared scan root by the
173/// internal pipeline.
174///
175/// ```
176/// use provenant::workflow::{scan_paths, ScanOptions};
177/// use std::fs;
178/// use tempfile::tempdir;
179///
180/// let root = tempdir()?;
181/// let root = root.path();
182/// let left = root.join("left");
183/// let right = root.join("right");
184/// fs::create_dir_all(&left)?;
185/// fs::create_dir_all(&right)?;
186/// fs::write(left.join("one.txt"), "left\n")?;
187/// fs::write(right.join("two.txt"), "right\n")?;
188///
189/// let output = scan_paths([left.as_path(), right.as_path()], &ScanOptions::default())?;
190/// let paths: Vec<_> = output.files.iter().map(|file| file.path.as_str()).collect();
191/// assert!(paths.iter().any(|path| path.ends_with("one.txt")));
192/// assert!(paths.iter().any(|path| path.ends_with("two.txt")));
193/// # Ok::<(), Box<dyn std::error::Error>>(())
194/// ```
195pub fn scan_paths<'a>(
196    paths: impl IntoIterator<Item = &'a Path>,
197    options: &ScanOptions,
198) -> Result<Output, WorkflowError> {
199    let input_paths: Vec<String> = paths
200        .into_iter()
201        .map(|path| path.to_string_lossy().to_string())
202        .collect();
203
204    if input_paths.is_empty() {
205        return Err(WorkflowError::InvalidOptions(
206            "At least one input path is required".to_string(),
207        ));
208    }
209
210    let request = request_for_native_paths(input_paths, options);
211    validate_workflow_request(&request)?;
212
213    execute_request(&request)
214        .map(|executed| executed.output)
215        .map_err(WorkflowError::Pipeline)
216}
217
218fn request_for_native_paths(input_paths: Vec<String>, options: &ScanOptions) -> ScanRequest {
219    let mut header_options = options.header_options.clone();
220    if options.include_input_header {
221        header_options.insert(
222            "input".to_string(),
223            JsonValue::Array(input_paths.iter().cloned().map(JsonValue::String).collect()),
224        );
225    }
226
227    let (license, license_dataset_path) = match &options.detect_license {
228        LicenseSource::Disabled => (false, None),
229        LicenseSource::Embedded => (true, None),
230        LicenseSource::Directory(path) => (true, Some(path.to_string_lossy().to_string())),
231    };
232
233    ScanRequest {
234        input_paths,
235        input_mode: InputMode::Native,
236        output_targets: Vec::new(),
237        output_header_options: header_options,
238        progress_mode: options.progress_mode,
239        process_mode: options.process_mode,
240        timeout_seconds: options.timeout_seconds,
241        quiet: matches!(options.progress_mode, ProgressMode::Quiet),
242        verbose: matches!(options.progress_mode, ProgressMode::Verbose),
243        strip_root: options.strip_root,
244        full_root: options.full_root,
245        include: options.include.clone(),
246        exclude: options.exclude.clone(),
247        paths_files: Vec::new(),
248        respect_process_cache_env: false,
249        cache_dir: options
250            .cache_dir
251            .as_ref()
252            .map(|path| path.to_string_lossy().to_string()),
253        cache_clear: options.cache_clear,
254        incremental: options.incremental,
255        max_depth: options.max_depth,
256        max_in_memory: options.max_in_memory,
257        info: options.collect_info,
258        package: options.detect_packages,
259        system_package: options.detect_system_packages,
260        package_in_compiled: options.detect_packages_in_compiled,
261        package_only: options.package_only,
262        no_assemble: options.no_assemble,
263        license_dataset_path,
264        reindex: options.reindex,
265        no_license_index_cache: options.no_license_index_cache,
266        license_text: options.license_text,
267        license_text_diagnostics: options.license_text_diagnostics,
268        license_diagnostics: options.license_diagnostics,
269        unknown_licenses: options.unknown_licenses,
270        license_score: options.license_score,
271        license_url_template: options.license_url_template.clone(),
272        filter_clues: options.filter_clues,
273        ignore_author: options.ignore_author_patterns.clone(),
274        ignore_copyright_holder: options.ignore_copyright_holder_patterns.clone(),
275        only_findings: options.only_findings,
276        mark_source: options.mark_source,
277        classify: options.classify,
278        summary: options.summary,
279        license_clarity_score: options.license_clarity_score,
280        license_references: options.license_references,
281        license_policy: options
282            .license_policy
283            .as_ref()
284            .map(|path| path.to_string_lossy().to_string()),
285        tallies: options.tallies,
286        tallies_key_files: options.tallies_key_files,
287        tallies_with_details: options.tallies_with_details,
288        facet: options.facets.clone(),
289        tallies_by_facet: options.tallies_by_facet,
290        generated: options.detect_generated,
291        license,
292        copyright: options.detect_copyrights,
293        email: options.detect_emails,
294        max_email: options.max_emails,
295        url: options.detect_urls,
296        max_url: options.max_urls,
297    }
298}
299
300fn validate_workflow_request(request: &ScanRequest) -> Result<(), WorkflowError> {
301    let license_enabled = request.license;
302
303    if request.strip_root && request.full_root {
304        return Err(WorkflowError::InvalidOptions(
305            "strip_root and full_root are mutually exclusive".to_string(),
306        ));
307    }
308
309    if request.license_text && !license_enabled {
310        return Err(WorkflowError::InvalidOptions(
311            "license_text requires detect_license".to_string(),
312        ));
313    }
314
315    if request.license_text_diagnostics && !request.license_text {
316        return Err(WorkflowError::InvalidOptions(
317            "license_text_diagnostics requires license_text".to_string(),
318        ));
319    }
320
321    if request.license_diagnostics && !license_enabled {
322        return Err(WorkflowError::InvalidOptions(
323            "license_diagnostics requires detect_license".to_string(),
324        ));
325    }
326
327    if request.unknown_licenses && !license_enabled {
328        return Err(WorkflowError::InvalidOptions(
329            "unknown_licenses requires detect_license".to_string(),
330        ));
331    }
332
333    if request.license_references && !license_enabled {
334        return Err(WorkflowError::InvalidOptions(
335            "license_references requires detect_license".to_string(),
336        ));
337    }
338
339    if request.license_url_template != DEFAULT_LICENSEDB_URL_TEMPLATE && !license_enabled {
340        return Err(WorkflowError::InvalidOptions(
341            "license_url_template requires detect_license".to_string(),
342        ));
343    }
344
345    if request.package_only && license_enabled {
346        return Err(WorkflowError::InvalidOptions(
347            "package_only cannot be combined with detect_license".to_string(),
348        ));
349    }
350
351    if request.package_only && request.summary {
352        return Err(WorkflowError::InvalidOptions(
353            "package_only cannot be combined with summary".to_string(),
354        ));
355    }
356
357    if request.package_only && request.package {
358        return Err(WorkflowError::InvalidOptions(
359            "package_only cannot be combined with detect_packages".to_string(),
360        ));
361    }
362
363    if request.package_only && request.system_package {
364        return Err(WorkflowError::InvalidOptions(
365            "package_only cannot be combined with detect_system_packages".to_string(),
366        ));
367    }
368
369    if request.summary && !request.classify {
370        return Err(WorkflowError::InvalidOptions(
371            "summary requires classify".to_string(),
372        ));
373    }
374
375    if request.license_clarity_score && !request.classify {
376        return Err(WorkflowError::InvalidOptions(
377            "license_clarity_score requires classify".to_string(),
378        ));
379    }
380
381    if request.tallies_key_files && !(request.tallies && request.classify) {
382        return Err(WorkflowError::InvalidOptions(
383            "tallies_key_files requires tallies and classify".to_string(),
384        ));
385    }
386
387    if request.tallies_by_facet && request.facet.is_empty() {
388        return Err(WorkflowError::InvalidOptions(
389            "tallies_by_facet requires at least one facet definition".to_string(),
390        ));
391    }
392
393    if request.tallies_by_facet && !request.tallies {
394        return Err(WorkflowError::InvalidOptions(
395            "tallies_by_facet requires tallies".to_string(),
396        ));
397    }
398
399    if request.mark_source && !request.info {
400        return Err(WorkflowError::InvalidOptions(
401            "mark_source requires collect_info".to_string(),
402        ));
403    }
404
405    if request.license_score > 100 {
406        return Err(WorkflowError::InvalidOptions(
407            "license_score must be between 0 and 100".to_string(),
408        ));
409    }
410
411    Ok(())
412}
413
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Builds a request for a single placeholder input from `options` and returns the error
    /// that validation is expected to produce for it.
    fn expect_validation_error(options: &ScanOptions) -> WorkflowError {
        let request = request_for_native_paths(vec!["src".to_string()], options);
        validate_workflow_request(&request).expect_err("validation should fail")
    }

    #[test]
    fn scan_path_requires_at_least_one_input() {
        let result = scan_paths(std::iter::empty::<&Path>(), &ScanOptions::default());

        // Pin the variant and message so an unrelated pipeline failure cannot satisfy the test.
        assert!(matches!(
            result,
            Err(WorkflowError::InvalidOptions(ref message))
                if message.contains("At least one input path is required")
        ));
    }

    #[test]
    fn workflow_request_populates_input_header() {
        let options = ScanOptions {
            include_input_header: true,
            ..ScanOptions::default()
        };
        let request = request_for_native_paths(vec!["src".to_string()], &options);

        // Check the value, not just the key, so a wrong or empty list is caught.
        let expected = JsonValue::Array(vec![JsonValue::String("src".to_string())]);
        assert_eq!(request.output_header_options.get("input"), Some(&expected));
    }

    #[test]
    fn workflow_request_omits_input_header_by_default() {
        let request = request_for_native_paths(vec!["src".to_string()], &ScanOptions::default());
        assert!(!request.output_header_options.contains_key("input"));
    }

    #[test]
    fn workflow_validation_rejects_license_dependent_flags_without_license() {
        let options = ScanOptions {
            license_references: true,
            ..ScanOptions::default()
        };

        let error = expect_validation_error(&options);
        assert!(matches!(error, WorkflowError::InvalidOptions(_)));
        assert!(
            error
                .to_string()
                .contains("license_references requires detect_license")
        );
    }

    #[test]
    fn workflow_validation_rejects_package_only_with_regular_package_modes() {
        let options = ScanOptions {
            package_only: true,
            detect_packages: true,
            ..ScanOptions::default()
        };

        let error = expect_validation_error(&options);
        assert!(matches!(error, WorkflowError::InvalidOptions(_)));
        assert!(
            error
                .to_string()
                .contains("package_only cannot be combined with detect_packages")
        );
    }

    #[test]
    fn workflow_validation_rejects_classify_dependent_flags_without_classify() {
        let options = ScanOptions {
            summary: true,
            ..ScanOptions::default()
        };

        let error = expect_validation_error(&options);
        assert!(matches!(error, WorkflowError::InvalidOptions(_)));
        assert!(error.to_string().contains("summary requires classify"));
    }

    #[test]
    fn scan_path_runs_a_basic_in_process_scan() {
        let temp_dir = tempfile::TempDir::new().expect("create temp dir");
        fs::write(
            temp_dir.path().join("README.txt"),
            "hello from workflow facade\n",
        )
        .expect("write fixture file");

        let options = ScanOptions {
            collect_info: true,
            include_input_header: true,
            ..ScanOptions::default()
        };

        let output = scan_path(temp_dir.path(), &options).expect("workflow scan should succeed");

        assert_eq!(output.headers.len(), 1);
        assert!(!output.files.is_empty());
        assert!(output.headers[0].options.contains_key("input"));
    }

    #[test]
    fn scan_paths_supports_multiple_absolute_inputs() {
        let temp_dir = tempfile::TempDir::new().expect("create temp dir");
        let left = temp_dir.path().join("left");
        let right = temp_dir.path().join("right");
        fs::create_dir_all(&left).expect("create left dir");
        fs::create_dir_all(&right).expect("create right dir");
        fs::write(left.join("one.txt"), "left\n").expect("write left fixture");
        fs::write(right.join("two.txt"), "right\n").expect("write right fixture");

        let output = scan_paths([left.as_path(), right.as_path()], &ScanOptions::default())
            .expect("workflow scan should succeed for multiple absolute inputs");

        assert!(
            output
                .files
                .iter()
                .any(|file| file.path.ends_with("one.txt"))
        );
        assert!(
            output
                .files
                .iter()
                .any(|file| file.path.ends_with("two.txt"))
        );
    }
}