Skip to main content

provenant/scanner/
mod.rs

1mod count;
2mod process;
3
4use std::path::PathBuf;
5
6use crate::models::FileInfo;
7
8/// Aggregated result of scanning a directory tree.
9///
10/// Includes discovered file entries and the count of paths skipped by
11/// exclusion patterns.
12pub struct ProcessResult {
13    /// File and directory entries produced by the scan.
14    pub files: Vec<FileInfo>,
15    /// Number of excluded paths encountered during traversal.
16    pub excluded_count: usize,
17}
18
/// Switches and limits for the text-based detectors run during a scan.
///
/// NOTE(review): how each field is consumed lives in the `process`
/// submodule, which is not visible from here; the per-field notes below
/// are hedged accordingly.
#[derive(Clone, Debug)]
pub struct TextDetectionOptions {
    /// Enables copyright detection.
    pub detect_copyrights: bool,
    /// Enables email detection.
    pub detect_emails: bool,
    /// Enables URL detection.
    pub detect_urls: bool,
    /// Limit applied to detected emails — presumably a cap on how many are
    /// reported; confirm the scope in `process`.
    pub max_emails: usize,
    /// Limit applied to detected URLs — presumably a cap on how many are
    /// reported; confirm the scope in `process`.
    pub max_urls: usize,
    /// Timeout expressed in seconds; what exactly it bounds is decided by
    /// the consumer of these options.
    pub timeout_seconds: f64,
    /// Optional directory for a scan cache; `None` presumably means no
    /// cache is used — verify against the caller.
    pub scan_cache_dir: Option<PathBuf>,
}
29
30impl Default for TextDetectionOptions {
31    fn default() -> Self {
32        Self {
33            detect_copyrights: true,
34            detect_emails: false,
35            detect_urls: false,
36            max_emails: 50,
37            max_urls: 50,
38            timeout_seconds: 120.0,
39            scan_cache_dir: None,
40        }
41    }
42}
43
44pub use self::count::count_with_size;
45pub use self::process::{process, process_with_options};
46
#[cfg(test)]
mod tests {
    use std::fs;
    use std::sync::Arc;

    use tempfile::TempDir;

    use crate::askalono::{ScanStrategy, Store};
    use crate::models::{FileInfo, FileType};
    use crate::progress::{ProgressMode, ScanProgress};

    use super::{TextDetectionOptions, process_with_options};

    /// Builds options with the three detectors toggled as requested.
    ///
    /// Every remaining field is spelled out on purpose so these tests do not
    /// silently change meaning if `TextDetectionOptions::default()` changes.
    fn options_with(
        detect_copyrights: bool,
        detect_emails: bool,
        detect_urls: bool,
    ) -> TextDetectionOptions {
        TextDetectionOptions {
            detect_copyrights,
            detect_emails,
            detect_urls,
            max_emails: 50,
            max_urls: 50,
            timeout_seconds: 120.0,
            scan_cache_dir: None,
        }
    }

    /// Creates a scan strategy backed by a freshly constructed `Store`.
    fn scan_strategy_without_licenses() -> ScanStrategy<'static> {
        // Leaking the store is what yields the `'static` borrow the return
        // type asks for; acceptable in short-lived test processes.
        let empty_store = Box::leak(Box::new(Store::new()));
        ScanStrategy::new(empty_store)
    }

    /// Writes `content` to `file_name` inside a fresh temporary directory,
    /// scans that directory, and returns the entry for the written file.
    ///
    /// Panics if the directory or file cannot be created, if the scan fails,
    /// or if no matching file entry is present in the scan result.
    fn scan_single_file(
        file_name: &str,
        content: &str,
        options: &TextDetectionOptions,
    ) -> FileInfo {
        let scratch = TempDir::new().expect("create temp dir");
        let root = scratch.path();
        let target = root.join(file_name);
        fs::write(&target, content).expect("write test file");

        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
        let strategy = scan_strategy_without_licenses();
        let outcome = process_with_options(root, 0, progress, &[], &strategy, options)
            .expect("scan should succeed");

        // The scan also yields other entries; pick the file we wrote.
        let expected_path = target.to_string_lossy();
        for entry in outcome.files {
            if entry.file_type == FileType::File && entry.path == expected_path {
                return entry;
            }
        }
        panic!("scanned file entry");
    }

    #[test]
    fn default_options_keep_copyright_detection_enabled() {
        assert!(TextDetectionOptions::default().detect_copyrights);
    }

    #[test]
    fn scanner_reports_repeated_email_occurrences() {
        let options = options_with(false, true, false);
        let content = "linux@3ware.com\nlinux@3ware.com\nandre@suse.com\nlinux@3ware.com\n";
        let scanned = scan_single_file("contacts.txt", content, &options);

        let emails: Vec<(&str, usize)> = scanned
            .emails
            .iter()
            .map(|hit| (hit.email.as_str(), hit.start_line))
            .collect();
        // Duplicates must be kept, one per line on which they occur.
        let expected: Vec<(&str, usize)> = vec![
            ("linux@3ware.com", 1),
            ("linux@3ware.com", 2),
            ("andre@suse.com", 3),
            ("linux@3ware.com", 4),
        ];

        assert_eq!(emails.len(), 4, "emails: {emails:#?}");
        assert_eq!(emails, expected);
    }

    #[test]
    fn scanner_skips_pem_certificate_text_detection() {
        let options = options_with(true, true, true);
        // A PEM block followed by text that would otherwise look like
        // copyright statements and an email address.
        let pem_fixture = concat!(
            "-----BEGIN CERTIFICATE-----\n",
            "MIID8TCCAtmgAwIBAgIQQT1yx/RrH4FDffHSKFTfmjANBgkqhkiG9w0BAQUFADCB\n",
            "ijELMAkGA1UEBhMCQ0gxEDAOBgNVBAoTB1dJU2VLZXkxGzAZBgNVBAsTEkNvcHly\n",
            "-----END CERTIFICATE-----\n",
            "Certificate:\n",
            "    Data:\n",
            "        Signature Algorithm: sha1WithRSAEncryption\n",
            "        Issuer: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
            "        Subject: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
            "        Contact: cert-owner@example.com\n",
        );
        let scanned = scan_single_file("cert.pem", pem_fixture, &options);

        let copyrights = &scanned.copyrights;
        assert!(copyrights.is_empty(), "copyrights: {copyrights:#?}");

        let holders = &scanned.holders;
        assert!(holders.is_empty(), "holders: {holders:#?}");

        let authors = &scanned.authors;
        assert!(authors.is_empty(), "authors: {authors:#?}");

        let emails = &scanned.emails;
        assert!(emails.is_empty(), "emails: {emails:#?}");

        let urls = &scanned.urls;
        assert!(urls.is_empty(), "urls: {urls:#?}");

        let licenses = &scanned.license_detections;
        assert!(licenses.is_empty(), "licenses: {licenses:#?}");
    }

    #[test]
    fn scanner_detects_structured_credits_authors() {
        let options = options_with(true, false, false);
        let credits_fixture = concat!(
            "N: Jack Lloyd\n",
            "E: lloyd@randombit.net\n",
            "W: http://www.randombit.net/\n",
        );
        let scanned = scan_single_file("CREDITS", credits_fixture, &options);

        let authors: Vec<(&str, usize, usize)> = scanned
            .authors
            .iter()
            .map(|found| (found.author.as_str(), found.start_line, found.end_line))
            .collect();
        // The three N:/E:/W: lines are expected to fold into one author
        // spanning lines 1 through 3.
        let expected: Vec<(&str, usize, usize)> = vec![(
            "Jack Lloyd lloyd@randombit.net http://www.randombit.net/",
            1,
            3,
        )];

        assert_eq!(authors, expected);
        assert!(scanned.copyrights.is_empty());
        assert!(scanned.holders.is_empty());
    }
}
214}