Skip to main content

provenant/
workflow.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use anyhow::{Result, anyhow};
5use serde_json::{Map as JsonMap, Value as JsonValue};
6use std::path::{Path, PathBuf};
7
8use crate::app::request::{InputMode, ScanRequest};
9use crate::app::scan_pipeline::execute_request;
10use crate::license_detection::DEFAULT_LICENSEDB_URL_TEMPLATE;
11use crate::progress::ProgressMode;
12use crate::scanner::MemoryMode;
13use crate::{Output, ProcessMode};
14
/// Chooses where the workflow facade gets its license rules from.
///
/// The variants can be compared with `==`, which lets callers and tests check which source an
/// options value carries without pattern matching.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LicenseSource {
    /// License detection is switched off.
    Disabled,
    /// License detection runs against the embedded Provenant license dataset.
    Embedded,
    /// License detection runs against a custom dataset loaded from a directory containing the
    /// expected rules and licenses layout.
    Directory(PathBuf),
}
25
26/// High-level configuration for in-process scans through [`scan_path`] and [`scan_paths`].
27///
28/// Defaults stay intentionally conservative: progress is quiet, no scan dimensions are enabled,
29/// input headers are omitted, and ambient `PROVENANT_CACHE` is ignored unless you set
30/// [`ScanOptions::cache_dir`].
31#[derive(Debug, Clone)]
32pub struct ScanOptions {
33    pub progress_mode: ProgressMode,
34    pub process_mode: ProcessMode,
35    pub timeout_seconds: f64,
36    pub max_depth: usize,
37    pub max_in_memory: MemoryMode,
38    pub collect_info: bool,
39    pub detect_license: LicenseSource,
40    pub detect_packages: bool,
41    pub detect_system_packages: bool,
42    pub detect_packages_in_compiled: bool,
43    pub package_only: bool,
44    pub no_assemble: bool,
45    pub detect_copyrights: bool,
46    pub detect_emails: bool,
47    pub detect_urls: bool,
48    pub detect_generated: bool,
49    pub max_emails: usize,
50    pub max_urls: usize,
51    pub include: Vec<String>,
52    pub exclude: Vec<String>,
53    pub include_input_header: bool,
54    pub cache_dir: Option<PathBuf>,
55    pub cache_clear: bool,
56    pub incremental: bool,
57    pub reindex: bool,
58    pub no_license_index_cache: bool,
59    pub license_text: bool,
60    pub license_text_diagnostics: bool,
61    pub license_diagnostics: bool,
62    pub unknown_licenses: bool,
63    pub license_score: u8,
64    pub filter_clues: bool,
65    pub ignore_author_patterns: Vec<String>,
66    pub ignore_copyright_holder_patterns: Vec<String>,
67    pub only_findings: bool,
68    pub mark_source: bool,
69    pub classify: bool,
70    pub summary: bool,
71    pub license_clarity_score: bool,
72    pub license_references: bool,
73    pub license_url_template: String,
74    pub license_policy: Option<PathBuf>,
75    pub tallies: bool,
76    pub tallies_key_files: bool,
77    pub tallies_with_details: bool,
78    pub facets: Vec<String>,
79    pub tallies_by_facet: bool,
80    pub strip_root: bool,
81    pub full_root: bool,
82    pub header_options: JsonMap<String, JsonValue>,
83}
84
85impl Default for ScanOptions {
86    fn default() -> Self {
87        Self {
88            progress_mode: ProgressMode::Quiet,
89            process_mode: ProcessMode::default(),
90            timeout_seconds: 120.0,
91            max_depth: 0,
92            max_in_memory: MemoryMode::Limit(10_000),
93            collect_info: false,
94            detect_license: LicenseSource::Disabled,
95            detect_packages: false,
96            detect_system_packages: false,
97            detect_packages_in_compiled: false,
98            package_only: false,
99            no_assemble: false,
100            detect_copyrights: false,
101            detect_emails: false,
102            detect_urls: false,
103            detect_generated: false,
104            max_emails: 50,
105            max_urls: 50,
106            include: Vec::new(),
107            exclude: Vec::new(),
108            include_input_header: false,
109            cache_dir: None,
110            cache_clear: false,
111            incremental: false,
112            reindex: false,
113            no_license_index_cache: false,
114            license_text: false,
115            license_text_diagnostics: false,
116            license_diagnostics: false,
117            unknown_licenses: false,
118            license_score: 0,
119            filter_clues: false,
120            ignore_author_patterns: Vec::new(),
121            ignore_copyright_holder_patterns: Vec::new(),
122            only_findings: false,
123            mark_source: false,
124            classify: false,
125            summary: false,
126            license_clarity_score: false,
127            license_references: false,
128            license_url_template: DEFAULT_LICENSEDB_URL_TEMPLATE.to_string(),
129            license_policy: None,
130            tallies: false,
131            tallies_key_files: false,
132            tallies_with_details: false,
133            facets: Vec::new(),
134            tallies_by_facet: false,
135            strip_root: false,
136            full_root: false,
137            header_options: JsonMap::new(),
138        }
139    }
140}
141
142/// Scan a single native filesystem input through the supported high-level workflow facade.
143///
144/// ```
145/// use provenant::workflow::{scan_path, ScanOptions};
146/// use std::fs;
147/// use tempfile::tempdir;
148///
149/// let root = tempdir()?;
150/// let root = root.path();
151/// fs::write(root.join("README.txt"), "hello from doctest\n")?;
152///
153/// let output = scan_path(&root, &ScanOptions::default())?;
154/// assert!(output.files.iter().any(|file| file.path.ends_with("README.txt")));
155/// assert_eq!(output.headers.len(), 1);
156/// assert!(!output.headers[0].options.contains_key("input"));
157/// # Ok::<(), Box<dyn std::error::Error>>(())
158/// ```
159pub fn scan_path(path: impl AsRef<Path>, options: &ScanOptions) -> Result<Output> {
160    scan_paths([path.as_ref()], options)
161}
162
163/// Scan multiple native filesystem inputs in one in-process workflow run.
164///
165/// Absolute paths are supported as long as they can be resolved through a shared scan root by the
166/// internal pipeline.
167///
168/// ```
169/// use provenant::workflow::{scan_paths, ScanOptions};
170/// use std::fs;
171/// use tempfile::tempdir;
172///
173/// let root = tempdir()?;
174/// let root = root.path();
175/// let left = root.join("left");
176/// let right = root.join("right");
177/// fs::create_dir_all(&left)?;
178/// fs::create_dir_all(&right)?;
179/// fs::write(left.join("one.txt"), "left\n")?;
180/// fs::write(right.join("two.txt"), "right\n")?;
181///
182/// let output = scan_paths([left.as_path(), right.as_path()], &ScanOptions::default())?;
183/// let paths: Vec<_> = output.files.iter().map(|file| file.path.as_str()).collect();
184/// assert!(paths.iter().any(|path| path.ends_with("one.txt")));
185/// assert!(paths.iter().any(|path| path.ends_with("two.txt")));
186/// # Ok::<(), Box<dyn std::error::Error>>(())
187/// ```
188pub fn scan_paths<'a>(
189    paths: impl IntoIterator<Item = &'a Path>,
190    options: &ScanOptions,
191) -> Result<Output> {
192    let input_paths: Vec<String> = paths
193        .into_iter()
194        .map(|path| path.to_string_lossy().to_string())
195        .collect();
196
197    if input_paths.is_empty() {
198        return Err(anyhow!("At least one input path is required"));
199    }
200
201    let request = request_for_native_paths(input_paths, options);
202    validate_workflow_request(&request)?;
203
204    execute_request(&request).map(|executed| executed.output)
205}
206
207fn request_for_native_paths(input_paths: Vec<String>, options: &ScanOptions) -> ScanRequest {
208    let mut header_options = options.header_options.clone();
209    if options.include_input_header {
210        header_options.insert(
211            "input".to_string(),
212            JsonValue::Array(input_paths.iter().cloned().map(JsonValue::String).collect()),
213        );
214    }
215
216    let (license, license_dataset_path) = match &options.detect_license {
217        LicenseSource::Disabled => (false, None),
218        LicenseSource::Embedded => (true, None),
219        LicenseSource::Directory(path) => (true, Some(path.to_string_lossy().to_string())),
220    };
221
222    ScanRequest {
223        input_paths,
224        input_mode: InputMode::Native,
225        output_targets: Vec::new(),
226        output_header_options: header_options,
227        progress_mode: options.progress_mode,
228        process_mode: options.process_mode,
229        timeout_seconds: options.timeout_seconds,
230        quiet: matches!(options.progress_mode, ProgressMode::Quiet),
231        verbose: matches!(options.progress_mode, ProgressMode::Verbose),
232        strip_root: options.strip_root,
233        full_root: options.full_root,
234        include: options.include.clone(),
235        exclude: options.exclude.clone(),
236        paths_files: Vec::new(),
237        respect_process_cache_env: false,
238        cache_dir: options
239            .cache_dir
240            .as_ref()
241            .map(|path| path.to_string_lossy().to_string()),
242        cache_clear: options.cache_clear,
243        incremental: options.incremental,
244        max_depth: options.max_depth,
245        max_in_memory: options.max_in_memory,
246        info: options.collect_info,
247        package: options.detect_packages,
248        system_package: options.detect_system_packages,
249        package_in_compiled: options.detect_packages_in_compiled,
250        package_only: options.package_only,
251        no_assemble: options.no_assemble,
252        license_dataset_path,
253        reindex: options.reindex,
254        no_license_index_cache: options.no_license_index_cache,
255        license_text: options.license_text,
256        license_text_diagnostics: options.license_text_diagnostics,
257        license_diagnostics: options.license_diagnostics,
258        unknown_licenses: options.unknown_licenses,
259        license_score: options.license_score,
260        license_url_template: options.license_url_template.clone(),
261        filter_clues: options.filter_clues,
262        ignore_author: options.ignore_author_patterns.clone(),
263        ignore_copyright_holder: options.ignore_copyright_holder_patterns.clone(),
264        only_findings: options.only_findings,
265        mark_source: options.mark_source,
266        classify: options.classify,
267        summary: options.summary,
268        license_clarity_score: options.license_clarity_score,
269        license_references: options.license_references,
270        license_policy: options
271            .license_policy
272            .as_ref()
273            .map(|path| path.to_string_lossy().to_string()),
274        tallies: options.tallies,
275        tallies_key_files: options.tallies_key_files,
276        tallies_with_details: options.tallies_with_details,
277        facet: options.facets.clone(),
278        tallies_by_facet: options.tallies_by_facet,
279        generated: options.detect_generated,
280        license,
281        copyright: options.detect_copyrights,
282        email: options.detect_emails,
283        max_email: options.max_emails,
284        url: options.detect_urls,
285        max_url: options.max_urls,
286    }
287}
288
289fn validate_workflow_request(request: &ScanRequest) -> Result<()> {
290    let license_enabled = request.license;
291
292    if request.strip_root && request.full_root {
293        return Err(anyhow!("strip_root and full_root are mutually exclusive"));
294    }
295
296    if request.license_text && !license_enabled {
297        return Err(anyhow!("license_text requires detect_license"));
298    }
299
300    if request.license_text_diagnostics && !request.license_text {
301        return Err(anyhow!("license_text_diagnostics requires license_text"));
302    }
303
304    if request.license_diagnostics && !license_enabled {
305        return Err(anyhow!("license_diagnostics requires detect_license"));
306    }
307
308    if request.unknown_licenses && !license_enabled {
309        return Err(anyhow!("unknown_licenses requires detect_license"));
310    }
311
312    if request.license_references && !license_enabled {
313        return Err(anyhow!("license_references requires detect_license"));
314    }
315
316    if request.license_url_template != DEFAULT_LICENSEDB_URL_TEMPLATE && !license_enabled {
317        return Err(anyhow!("license_url_template requires detect_license"));
318    }
319
320    if request.package_only && license_enabled {
321        return Err(anyhow!(
322            "package_only cannot be combined with detect_license"
323        ));
324    }
325
326    if request.package_only && request.summary {
327        return Err(anyhow!("package_only cannot be combined with summary"));
328    }
329
330    if request.package_only && request.package {
331        return Err(anyhow!(
332            "package_only cannot be combined with detect_packages"
333        ));
334    }
335
336    if request.package_only && request.system_package {
337        return Err(anyhow!(
338            "package_only cannot be combined with detect_system_packages"
339        ));
340    }
341
342    if request.summary && !request.classify {
343        return Err(anyhow!("summary requires classify"));
344    }
345
346    if request.license_clarity_score && !request.classify {
347        return Err(anyhow!("license_clarity_score requires classify"));
348    }
349
350    if request.tallies_key_files && !(request.tallies && request.classify) {
351        return Err(anyhow!("tallies_key_files requires tallies and classify"));
352    }
353
354    if request.tallies_by_facet && request.facet.is_empty() {
355        return Err(anyhow!(
356            "tallies_by_facet requires at least one facet definition"
357        ));
358    }
359
360    if request.tallies_by_facet && !request.tallies {
361        return Err(anyhow!("tallies_by_facet requires tallies"));
362    }
363
364    if request.mark_source && !request.info {
365        return Err(anyhow!("mark_source requires collect_info"));
366    }
367
368    if request.license_score > 100 {
369        return Err(anyhow!("license_score must be between 0 and 100"));
370    }
371
372    Ok(())
373}
374
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Builds a request for a single placeholder input and returns the validation error text.
    fn validation_failure(options: &ScanOptions) -> String {
        let request = request_for_native_paths(vec![String::from("src")], options);
        validate_workflow_request(&request)
            .expect_err("validation should fail")
            .to_string()
    }

    /// Reports whether any scanned file path ends with `suffix`.
    fn has_file_ending_with(output: &Output, suffix: &str) -> bool {
        output.files.iter().any(|file| file.path.ends_with(suffix))
    }

    #[test]
    fn scan_path_requires_at_least_one_input() {
        let no_inputs = std::iter::empty::<&Path>();
        assert!(scan_paths(no_inputs, &ScanOptions::default()).is_err());
    }

    #[test]
    fn workflow_request_populates_input_header() {
        let mut options = ScanOptions::default();
        options.include_input_header = true;

        let request = request_for_native_paths(vec![String::from("src")], &options);

        assert!(request.output_header_options.contains_key("input"));
    }

    #[test]
    fn workflow_validation_rejects_license_dependent_flags_without_license() {
        let mut options = ScanOptions::default();
        options.license_references = true;

        let message = validation_failure(&options);

        assert!(message.contains("license_references requires detect_license"));
    }

    #[test]
    fn workflow_validation_rejects_package_only_with_regular_package_modes() {
        let mut options = ScanOptions::default();
        options.package_only = true;
        options.detect_packages = true;

        let message = validation_failure(&options);

        assert!(message.contains("package_only cannot be combined with detect_packages"));
    }

    #[test]
    fn workflow_validation_rejects_classify_dependent_flags_without_classify() {
        let mut options = ScanOptions::default();
        options.summary = true;

        let message = validation_failure(&options);

        assert!(message.contains("summary requires classify"));
    }

    #[test]
    fn scan_path_runs_a_basic_in_process_scan() {
        let scratch = tempfile::TempDir::new().expect("create temp dir");
        let fixture = scratch.path().join("README.txt");
        fs::write(fixture, "hello from workflow facade\n").expect("write fixture file");

        let mut options = ScanOptions::default();
        options.collect_info = true;
        options.include_input_header = true;

        let output = scan_path(scratch.path(), &options).expect("workflow scan should succeed");

        assert_eq!(output.headers.len(), 1);
        assert!(!output.files.is_empty());
        assert!(output.headers[0].options.contains_key("input"));
    }

    #[test]
    fn scan_paths_supports_multiple_absolute_inputs() {
        let scratch = tempfile::TempDir::new().expect("create temp dir");
        let left = scratch.path().join("left");
        let right = scratch.path().join("right");
        fs::create_dir_all(&left).expect("create left dir");
        fs::create_dir_all(&right).expect("create right dir");
        fs::write(left.join("one.txt"), "left\n").expect("write left fixture");
        fs::write(right.join("two.txt"), "right\n").expect("write right fixture");

        let inputs = [left.as_path(), right.as_path()];
        let output = scan_paths(inputs, &ScanOptions::default())
            .expect("workflow scan should succeed for multiple absolute inputs");

        assert!(has_file_ending_with(&output, "one.txt"));
        assert!(has_file_ending_with(&output, "two.txt"));
    }
}