Skip to main content

provenant/
workflow.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use anyhow::{Result, anyhow};
5use serde_json::{Map as JsonMap, Value as JsonValue};
6use std::path::{Path, PathBuf};
7
8use crate::app::request::{InputMode, ScanRequest};
9use crate::app::scan_pipeline::execute_request;
10use crate::license_detection::DEFAULT_LICENSEDB_URL_TEMPLATE;
11use crate::progress::ProgressMode;
12use crate::scanner::MemoryMode;
13use crate::{Output, ProcessMode};
14
/// Selects how the workflow facade sources license rules.
///
/// Values compare by variant and, for [`LicenseSource::Directory`], by the contained path, so a
/// configured source can be checked with `==` in caller code and tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LicenseSource {
    /// Skip license detection entirely.
    Disabled,
    /// Use the embedded Provenant license dataset.
    Embedded,
    /// Load a custom dataset from a directory containing the expected rules and licenses layout.
    Directory(PathBuf),
}
25
26/// High-level configuration for in-process scans through [`scan_path`] and [`scan_paths`].
27///
28/// Defaults stay intentionally conservative: progress is quiet, no scan dimensions are enabled,
29/// input headers are omitted, and ambient `PROVENANT_CACHE` is ignored unless you set
30/// [`ScanOptions::cache_dir`].
31#[derive(Debug, Clone)]
32pub struct ScanOptions {
33    pub progress_mode: ProgressMode,
34    pub process_mode: ProcessMode,
35    pub timeout_seconds: f64,
36    pub max_depth: usize,
37    pub max_in_memory: MemoryMode,
38    pub collect_info: bool,
39    pub detect_license: LicenseSource,
40    pub detect_packages: bool,
41    pub detect_system_packages: bool,
42    pub detect_packages_in_compiled: bool,
43    pub package_only: bool,
44    pub no_assemble: bool,
45    pub detect_copyrights: bool,
46    pub detect_emails: bool,
47    pub detect_urls: bool,
48    pub detect_generated: bool,
49    pub max_emails: usize,
50    pub max_urls: usize,
51    pub include: Vec<String>,
52    pub exclude: Vec<String>,
53    pub include_input_header: bool,
54    pub cache_dir: Option<PathBuf>,
55    pub cache_clear: bool,
56    pub incremental: bool,
57    pub reindex: bool,
58    pub no_license_index_cache: bool,
59    pub license_text: bool,
60    pub license_text_diagnostics: bool,
61    pub license_diagnostics: bool,
62    pub unknown_licenses: bool,
63    pub license_score: u8,
64    pub filter_clues: bool,
65    pub ignore_author_patterns: Vec<String>,
66    pub ignore_copyright_holder_patterns: Vec<String>,
67    pub only_findings: bool,
68    pub mark_source: bool,
69    pub classify: bool,
70    pub summary: bool,
71    pub license_clarity_score: bool,
72    pub license_references: bool,
73    pub license_url_template: String,
74    pub license_policy: Option<PathBuf>,
75    pub tallies: bool,
76    pub tallies_key_files: bool,
77    pub tallies_with_details: bool,
78    pub facets: Vec<String>,
79    pub tallies_by_facet: bool,
80    pub strip_root: bool,
81    pub full_root: bool,
82    pub header_options: JsonMap<String, JsonValue>,
83}
84
85impl Default for ScanOptions {
86    fn default() -> Self {
87        Self {
88            progress_mode: ProgressMode::Quiet,
89            process_mode: ProcessMode::default(),
90            timeout_seconds: 120.0,
91            max_depth: 0,
92            max_in_memory: MemoryMode::Limit(10_000),
93            collect_info: false,
94            detect_license: LicenseSource::Disabled,
95            detect_packages: false,
96            detect_system_packages: false,
97            detect_packages_in_compiled: false,
98            package_only: false,
99            no_assemble: false,
100            detect_copyrights: false,
101            detect_emails: false,
102            detect_urls: false,
103            detect_generated: false,
104            max_emails: 50,
105            max_urls: 50,
106            include: Vec::new(),
107            exclude: Vec::new(),
108            include_input_header: false,
109            cache_dir: None,
110            cache_clear: false,
111            incremental: false,
112            reindex: false,
113            no_license_index_cache: false,
114            license_text: false,
115            license_text_diagnostics: false,
116            license_diagnostics: false,
117            unknown_licenses: false,
118            license_score: 0,
119            filter_clues: false,
120            ignore_author_patterns: Vec::new(),
121            ignore_copyright_holder_patterns: Vec::new(),
122            only_findings: false,
123            mark_source: false,
124            classify: false,
125            summary: false,
126            license_clarity_score: false,
127            license_references: false,
128            license_url_template: DEFAULT_LICENSEDB_URL_TEMPLATE.to_string(),
129            license_policy: None,
130            tallies: false,
131            tallies_key_files: false,
132            tallies_with_details: false,
133            facets: Vec::new(),
134            tallies_by_facet: false,
135            strip_root: false,
136            full_root: false,
137            header_options: JsonMap::new(),
138        }
139    }
140}
141
142/// Scan a single native filesystem input through the supported high-level workflow facade.
143///
144/// ```
145/// use provenant::workflow::{scan_path, ScanOptions};
146/// use std::fs;
147/// use std::time::{SystemTime, UNIX_EPOCH};
148///
149/// let unique = SystemTime::now().duration_since(UNIX_EPOCH)?.as_nanos();
150/// let root = std::env::temp_dir().join(format!("provenant-workflow-docs-{unique}"));
151/// fs::create_dir_all(&root)?;
152/// fs::write(root.join("README.txt"), "hello from doctest\n")?;
153///
154/// let output = scan_path(&root, &ScanOptions::default())?;
155/// assert!(output.files.iter().any(|file| file.path.ends_with("README.txt")));
156/// assert_eq!(output.headers.len(), 1);
157/// assert!(!output.headers[0].options.contains_key("input"));
158///
159/// fs::remove_dir_all(&root)?;
160/// # Ok::<(), Box<dyn std::error::Error>>(())
161/// ```
162pub fn scan_path(path: impl AsRef<Path>, options: &ScanOptions) -> Result<Output> {
163    scan_paths([path.as_ref()], options)
164}
165
166/// Scan multiple native filesystem inputs in one in-process workflow run.
167///
168/// Absolute paths are supported as long as they can be resolved through a shared scan root by the
169/// internal pipeline.
170///
171/// ```
172/// use provenant::workflow::{scan_paths, ScanOptions};
173/// use std::fs;
174/// use std::time::{SystemTime, UNIX_EPOCH};
175///
176/// let unique = SystemTime::now().duration_since(UNIX_EPOCH)?.as_nanos();
177/// let root = std::env::temp_dir().join(format!("provenant-workflow-docs-{unique}"));
178/// let left = root.join("left");
179/// let right = root.join("right");
180/// fs::create_dir_all(&left)?;
181/// fs::create_dir_all(&right)?;
182/// fs::write(left.join("one.txt"), "left\n")?;
183/// fs::write(right.join("two.txt"), "right\n")?;
184///
185/// let output = scan_paths([left.as_path(), right.as_path()], &ScanOptions::default())?;
186/// let paths: Vec<_> = output.files.iter().map(|file| file.path.as_str()).collect();
187/// assert!(paths.iter().any(|path| path.ends_with("one.txt")));
188/// assert!(paths.iter().any(|path| path.ends_with("two.txt")));
189///
190/// fs::remove_dir_all(&root)?;
191/// # Ok::<(), Box<dyn std::error::Error>>(())
192/// ```
193pub fn scan_paths<'a>(
194    paths: impl IntoIterator<Item = &'a Path>,
195    options: &ScanOptions,
196) -> Result<Output> {
197    let input_paths: Vec<String> = paths
198        .into_iter()
199        .map(|path| path.to_string_lossy().to_string())
200        .collect();
201
202    if input_paths.is_empty() {
203        return Err(anyhow!("At least one input path is required"));
204    }
205
206    let request = request_for_native_paths(input_paths, options);
207    validate_workflow_request(&request)?;
208
209    execute_request(&request).map(|executed| executed.output)
210}
211
212fn request_for_native_paths(input_paths: Vec<String>, options: &ScanOptions) -> ScanRequest {
213    let mut header_options = options.header_options.clone();
214    if options.include_input_header {
215        header_options.insert(
216            "input".to_string(),
217            JsonValue::Array(input_paths.iter().cloned().map(JsonValue::String).collect()),
218        );
219    }
220
221    let (license, license_dataset_path) = match &options.detect_license {
222        LicenseSource::Disabled => (false, None),
223        LicenseSource::Embedded => (true, None),
224        LicenseSource::Directory(path) => (true, Some(path.to_string_lossy().to_string())),
225    };
226
227    ScanRequest {
228        input_paths,
229        input_mode: InputMode::Native,
230        output_targets: Vec::new(),
231        output_header_options: header_options,
232        progress_mode: options.progress_mode,
233        process_mode: options.process_mode,
234        timeout_seconds: options.timeout_seconds,
235        quiet: matches!(options.progress_mode, ProgressMode::Quiet),
236        verbose: matches!(options.progress_mode, ProgressMode::Verbose),
237        strip_root: options.strip_root,
238        full_root: options.full_root,
239        include: options.include.clone(),
240        exclude: options.exclude.clone(),
241        paths_files: Vec::new(),
242        respect_process_cache_env: false,
243        cache_dir: options
244            .cache_dir
245            .as_ref()
246            .map(|path| path.to_string_lossy().to_string()),
247        cache_clear: options.cache_clear,
248        incremental: options.incremental,
249        max_depth: options.max_depth,
250        max_in_memory: options.max_in_memory,
251        info: options.collect_info,
252        package: options.detect_packages,
253        system_package: options.detect_system_packages,
254        package_in_compiled: options.detect_packages_in_compiled,
255        package_only: options.package_only,
256        no_assemble: options.no_assemble,
257        license_dataset_path,
258        reindex: options.reindex,
259        no_license_index_cache: options.no_license_index_cache,
260        license_text: options.license_text,
261        license_text_diagnostics: options.license_text_diagnostics,
262        license_diagnostics: options.license_diagnostics,
263        unknown_licenses: options.unknown_licenses,
264        license_score: options.license_score,
265        license_url_template: options.license_url_template.clone(),
266        filter_clues: options.filter_clues,
267        ignore_author: options.ignore_author_patterns.clone(),
268        ignore_copyright_holder: options.ignore_copyright_holder_patterns.clone(),
269        only_findings: options.only_findings,
270        mark_source: options.mark_source,
271        classify: options.classify,
272        summary: options.summary,
273        license_clarity_score: options.license_clarity_score,
274        license_references: options.license_references,
275        license_policy: options
276            .license_policy
277            .as_ref()
278            .map(|path| path.to_string_lossy().to_string()),
279        tallies: options.tallies,
280        tallies_key_files: options.tallies_key_files,
281        tallies_with_details: options.tallies_with_details,
282        facet: options.facets.clone(),
283        tallies_by_facet: options.tallies_by_facet,
284        generated: options.detect_generated,
285        license,
286        copyright: options.detect_copyrights,
287        email: options.detect_emails,
288        max_email: options.max_emails,
289        url: options.detect_urls,
290        max_url: options.max_urls,
291    }
292}
293
294fn validate_workflow_request(request: &ScanRequest) -> Result<()> {
295    let license_enabled = request.license;
296
297    if request.strip_root && request.full_root {
298        return Err(anyhow!("strip_root and full_root are mutually exclusive"));
299    }
300
301    if request.license_text && !license_enabled {
302        return Err(anyhow!("license_text requires detect_license"));
303    }
304
305    if request.license_text_diagnostics && !request.license_text {
306        return Err(anyhow!("license_text_diagnostics requires license_text"));
307    }
308
309    if request.license_diagnostics && !license_enabled {
310        return Err(anyhow!("license_diagnostics requires detect_license"));
311    }
312
313    if request.unknown_licenses && !license_enabled {
314        return Err(anyhow!("unknown_licenses requires detect_license"));
315    }
316
317    if request.license_references && !license_enabled {
318        return Err(anyhow!("license_references requires detect_license"));
319    }
320
321    if request.license_url_template != DEFAULT_LICENSEDB_URL_TEMPLATE && !license_enabled {
322        return Err(anyhow!("license_url_template requires detect_license"));
323    }
324
325    if request.package_only && license_enabled {
326        return Err(anyhow!(
327            "package_only cannot be combined with detect_license"
328        ));
329    }
330
331    if request.package_only && request.summary {
332        return Err(anyhow!("package_only cannot be combined with summary"));
333    }
334
335    if request.package_only && request.package {
336        return Err(anyhow!(
337            "package_only cannot be combined with detect_packages"
338        ));
339    }
340
341    if request.package_only && request.system_package {
342        return Err(anyhow!(
343            "package_only cannot be combined with detect_system_packages"
344        ));
345    }
346
347    if request.summary && !request.classify {
348        return Err(anyhow!("summary requires classify"));
349    }
350
351    if request.license_clarity_score && !request.classify {
352        return Err(anyhow!("license_clarity_score requires classify"));
353    }
354
355    if request.tallies_key_files && !(request.tallies && request.classify) {
356        return Err(anyhow!("tallies_key_files requires tallies and classify"));
357    }
358
359    if request.tallies_by_facet && request.facet.is_empty() {
360        return Err(anyhow!(
361            "tallies_by_facet requires at least one facet definition"
362        ));
363    }
364
365    if request.tallies_by_facet && !request.tallies {
366        return Err(anyhow!("tallies_by_facet requires tallies"));
367    }
368
369    if request.mark_source && !request.info {
370        return Err(anyhow!("mark_source requires collect_info"));
371    }
372
373    if request.license_score > 100 {
374        return Err(anyhow!("license_score must be between 0 and 100"));
375    }
376
377    Ok(())
378}
379
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Builds a request for the fixed `src` input and returns the validation error text.
    fn validation_error_for(options: &ScanOptions) -> String {
        let request = request_for_native_paths(vec!["src".to_string()], options);
        validate_workflow_request(&request)
            .expect_err("validation should fail")
            .to_string()
    }

    #[test]
    fn scan_path_requires_at_least_one_input() {
        let no_inputs: [&Path; 0] = [];
        let outcome = scan_paths(no_inputs, &ScanOptions::default());
        assert!(outcome.is_err());
    }

    #[test]
    fn workflow_request_populates_input_header() {
        let mut options = ScanOptions::default();
        options.include_input_header = true;

        let request = request_for_native_paths(vec!["src".to_string()], &options);
        assert!(request.output_header_options.contains_key("input"));
    }

    #[test]
    fn workflow_validation_rejects_license_dependent_flags_without_license() {
        let mut options = ScanOptions::default();
        options.license_references = true;

        let message = validation_error_for(&options);
        assert!(message.contains("license_references requires detect_license"));
    }

    #[test]
    fn workflow_validation_rejects_package_only_with_regular_package_modes() {
        let mut options = ScanOptions::default();
        options.package_only = true;
        options.detect_packages = true;

        let message = validation_error_for(&options);
        assert!(message.contains("package_only cannot be combined with detect_packages"));
    }

    #[test]
    fn workflow_validation_rejects_classify_dependent_flags_without_classify() {
        let mut options = ScanOptions::default();
        options.summary = true;

        let message = validation_error_for(&options);
        assert!(message.contains("summary requires classify"));
    }

    #[test]
    fn scan_path_runs_a_basic_in_process_scan() {
        let workspace = tempfile::TempDir::new().expect("create temp dir");
        let fixture = workspace.path().join("README.txt");
        fs::write(fixture, "hello from workflow facade\n").expect("write fixture file");

        let mut options = ScanOptions::default();
        options.collect_info = true;
        options.include_input_header = true;

        let output = scan_path(workspace.path(), &options).expect("workflow scan should succeed");

        assert_eq!(output.headers.len(), 1);
        assert!(!output.files.is_empty());
        assert!(output.headers[0].options.contains_key("input"));
    }

    #[test]
    fn scan_paths_supports_multiple_absolute_inputs() {
        let workspace = tempfile::TempDir::new().expect("create temp dir");
        let left = workspace.path().join("left");
        let right = workspace.path().join("right");
        fs::create_dir_all(&left).expect("create left dir");
        fs::create_dir_all(&right).expect("create right dir");
        fs::write(left.join("one.txt"), "left\n").expect("write left fixture");
        fs::write(right.join("two.txt"), "right\n").expect("write right fixture");

        let output = scan_paths([left.as_path(), right.as_path()], &ScanOptions::default())
            .expect("workflow scan should succeed for multiple absolute inputs");

        // Both inputs must contribute their fixture file to the combined output.
        for expected in ["one.txt", "two.txt"] {
            assert!(
                output
                    .files
                    .iter()
                    .any(|file| file.path.ends_with(expected))
            );
        }
    }
}