Skip to main content

provenant/scanner/
mod.rs

1mod collect;
2mod process;
3
4use std::path::PathBuf;
5
6use crate::models::FileInfo;
7
8pub struct ProcessResult {
9    pub files: Vec<FileInfo>,
10    pub excluded_count: usize,
11}
12
/// Switches controlling what license scanning includes in its output.
///
/// The type is `Copy` and is handed to `process_collected` by value. The
/// derived [`Default`] leaves every flag `false`.
#[derive(Debug, Clone, Copy, Default)]
pub struct LicenseScanOptions {
    /// Presumably: attach the matched license text to detections — confirm with the consumer.
    pub include_text: bool,
    /// Presumably: attach diagnostics about the matched text — confirm with the consumer.
    pub include_text_diagnostics: bool,
    /// Presumably: attach general license-match diagnostics — confirm with the consumer.
    pub include_diagnostics: bool,
    /// Presumably: also report licenses that could not be identified — confirm with the consumer.
    pub unknown_licenses: bool,
}
20
/// Per-scan configuration for the text-based detectors.
///
/// Passed by reference to `process_collected`. Use [`Default`] together with
/// struct-update syntax to enable only the detectors you need:
/// `TextDetectionOptions { detect_emails: true, ..Default::default() }`.
#[derive(Debug, Clone)]
pub struct TextDetectionOptions {
    /// When `true`, file-level info is collected; the tests in this module
    /// show `is_source` is only populated when this is set.
    pub collect_info: bool,
    /// When `true`, package manifests (e.g. `package.json`) are parsed into `package_data`.
    pub detect_packages: bool,
    /// When `true`, copyright/holder/author detection runs.
    pub detect_copyrights: bool,
    /// When `true`, the `is_generated` flag is computed; otherwise it stays `None`.
    pub detect_generated: bool,
    /// When `true`, email addresses are reported.
    pub detect_emails: bool,
    /// When `true`, URLs are reported.
    pub detect_urls: bool,
    /// Presumably the upper bound on emails reported per file — confirm in `process.rs`.
    pub max_emails: usize,
    /// Presumably the upper bound on URLs reported per file — confirm in `process.rs`.
    pub max_urls: usize,
    /// Scan timeout in seconds.
    /// NOTE(review): whether this applies per file or per run is not visible here — confirm.
    pub timeout_seconds: f64,
    /// Optional directory for a scan cache; `None` means no cache directory is configured.
    pub scan_cache_dir: Option<PathBuf>,
}

impl Default for TextDetectionOptions {
    /// Returns the baseline configuration: copyright detection is the only
    /// detector enabled, email/URL limits are 50 each, the timeout is 120
    /// seconds, and no cache directory is set.
    fn default() -> Self {
        Self {
            collect_info: false,
            detect_packages: false,
            // The one detector that is on by default; a unit test in this
            // module pins this behavior.
            detect_copyrights: true,
            detect_generated: false,
            detect_emails: false,
            detect_urls: false,
            max_emails: 50,
            max_urls: 50,
            timeout_seconds: 120.0,
            scan_cache_dir: None,
        }
    }
}
51
52#[allow(unused_imports)]
53pub use self::collect::{CollectedPaths, collect_paths};
54pub use self::process::process_collected;
55
#[cfg(test)]
mod tests {
    use std::fs;
    use std::sync::Arc;

    use tempfile::TempDir;

    use crate::models::FileType;
    use crate::progress::{ProgressMode, ScanProgress};

    use super::{LicenseScanOptions, TextDetectionOptions, collect_paths, process_collected};

    /// Baseline options with every detector switched off.
    ///
    /// Spelled out field by field (rather than derived from `Default`) so the
    /// tests keep their meaning even if the production defaults change. Each
    /// test enables only what it exercises via struct-update syntax.
    fn detection_disabled() -> TextDetectionOptions {
        TextDetectionOptions {
            collect_info: false,
            detect_packages: false,
            detect_copyrights: false,
            detect_generated: false,
            detect_emails: false,
            detect_urls: false,
            max_emails: 50,
            max_urls: 50,
            timeout_seconds: 120.0,
            scan_cache_dir: None,
        }
    }

    /// Writes `content` to `file_name` inside a fresh temp directory, scans
    /// that directory with `options`, and returns the entry for the file.
    ///
    /// Panics if the fixture cannot be created or the scan result contains no
    /// matching file entry.
    fn scan_single_file(
        file_name: &str,
        content: &str,
        options: &TextDetectionOptions,
    ) -> crate::models::FileInfo {
        // `temp_dir` must outlive the scan: dropping it removes the directory.
        let temp_dir = TempDir::new().expect("create temp dir");
        let file_path = temp_dir.path().join(file_name);
        fs::write(&file_path, content).expect("write test file");

        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
        let collected = collect_paths(temp_dir.path(), 0, &[]);
        let result = process_collected(
            &collected,
            progress,
            None,
            LicenseScanOptions::default(),
            options,
        );

        // Convert once instead of once per candidate entry.
        let expected_path = file_path.to_string_lossy();
        result
            .files
            .into_iter()
            .find(|entry| entry.file_type == FileType::File && entry.path == expected_path)
            .expect("scanned file entry")
    }

    #[test]
    fn default_options_keep_copyright_detection_enabled() {
        let options = TextDetectionOptions::default();
        assert!(!options.detect_packages);
        assert!(options.detect_copyrights);
    }

    /// Every occurrence of an address is reported with its own line number,
    /// i.e. repeated emails are not de-duplicated.
    #[test]
    fn scanner_reports_repeated_email_occurrences() {
        let options = TextDetectionOptions {
            detect_emails: true,
            ..detection_disabled()
        };
        let scanned = scan_single_file(
            "contacts.txt",
            "linux@3ware.com\nlinux@3ware.com\nandre@suse.com\nlinux@3ware.com\n",
            &options,
        );

        let emails: Vec<(&str, usize)> = scanned
            .emails
            .iter()
            .map(|email| (email.email.as_str(), email.start_line))
            .collect();

        assert_eq!(emails.len(), 4, "emails: {emails:#?}");
        assert_eq!(
            emails,
            vec![
                ("linux@3ware.com", 1),
                ("linux@3ware.com", 2),
                ("andre@suse.com", 3),
                ("linux@3ware.com", 4),
            ]
        );
    }

    /// A PEM certificate yields no text detections at all, even though its
    /// decoded section contains copyright-like text and an email address.
    #[test]
    fn scanner_skips_pem_certificate_text_detection() {
        let options = TextDetectionOptions {
            detect_copyrights: true,
            detect_emails: true,
            detect_urls: true,
            ..detection_disabled()
        };
        let pem_fixture = concat!(
            "-----BEGIN CERTIFICATE-----\n",
            "MIID8TCCAtmgAwIBAgIQQT1yx/RrH4FDffHSKFTfmjANBgkqhkiG9w0BAQUFADCB\n",
            "ijELMAkGA1UEBhMCQ0gxEDAOBgNVBAoTB1dJU2VLZXkxGzAZBgNVBAsTEkNvcHly\n",
            "-----END CERTIFICATE-----\n",
            "Certificate:\n",
            "    Data:\n",
            "        Signature Algorithm: sha1WithRSAEncryption\n",
            "        Issuer: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
            "        Subject: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
            "        Contact: cert-owner@example.com\n",
        );
        let scanned = scan_single_file("cert.pem", pem_fixture, &options);

        assert!(
            scanned.copyrights.is_empty(),
            "copyrights: {:#?}",
            scanned.copyrights
        );
        assert!(
            scanned.holders.is_empty(),
            "holders: {:#?}",
            scanned.holders
        );
        assert!(
            scanned.authors.is_empty(),
            "authors: {:#?}",
            scanned.authors
        );
        assert!(scanned.emails.is_empty(), "emails: {:#?}", scanned.emails);
        assert!(scanned.urls.is_empty(), "urls: {:#?}", scanned.urls);
        assert!(
            scanned.license_detections.is_empty(),
            "licenses: {:#?}",
            scanned.license_detections
        );
        assert!(
            scanned.license_clues.is_empty(),
            "license clues: {:#?}",
            scanned.license_clues
        );
    }

    /// A structured `N:`/`E:`/`W:` CREDITS record is merged into one author
    /// spanning lines 1-3, with no copyright or holder detections.
    #[test]
    fn scanner_detects_structured_credits_authors() {
        let options = TextDetectionOptions {
            detect_copyrights: true,
            ..detection_disabled()
        };
        let credits_fixture = concat!(
            "N: Jack Lloyd\n",
            "E: lloyd@randombit.net\n",
            "W: http://www.randombit.net/\n",
        );
        let scanned = scan_single_file("CREDITS", credits_fixture, &options);

        let authors: Vec<(&str, usize, usize)> = scanned
            .authors
            .iter()
            .map(|author| (author.author.as_str(), author.start_line, author.end_line))
            .collect();

        assert_eq!(
            authors,
            vec![(
                "Jack Lloyd lloyd@randombit.net http://www.randombit.net/",
                1,
                3,
            )]
        );
        assert!(scanned.copyrights.is_empty());
        assert!(scanned.holders.is_empty());
    }

    #[test]
    fn scanner_sets_generated_flag_when_enabled() {
        let options = TextDetectionOptions {
            detect_generated: true,
            ..detection_disabled()
        };
        let scanned = scan_single_file(
            "generated.c",
            "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
            &options,
        );

        assert_eq!(scanned.is_generated, Some(true));
    }

    #[test]
    fn scanner_leaves_generated_flag_unset_when_disabled() {
        let options = detection_disabled();
        let scanned = scan_single_file(
            "generated.c",
            "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
            &options,
        );

        assert_eq!(scanned.is_generated, None);
    }

    #[test]
    fn scanner_skips_package_parsing_when_disabled() {
        let options = detection_disabled();
        let scanned = scan_single_file(
            "package.json",
            r#"{"name":"demo","version":"1.0.0"}"#,
            &options,
        );

        assert!(
            scanned.package_data.is_empty(),
            "package_data: {:#?}",
            scanned.package_data
        );
    }

    #[test]
    fn scanner_parses_package_manifests_when_enabled() {
        let options = TextDetectionOptions {
            detect_packages: true,
            ..detection_disabled()
        };
        let scanned = scan_single_file(
            "package.json",
            r#"{"name":"demo","version":"1.0.0"}"#,
            &options,
        );

        assert_eq!(
            scanned.package_data.len(),
            1,
            "package_data: {:#?}",
            scanned.package_data
        );
    }

    /// `is_source` is populated only when info collection is requested.
    #[test]
    fn scanner_sets_is_source_only_when_info_enabled() {
        let without_info = detection_disabled();
        let with_info = TextDetectionOptions {
            collect_info: true,
            ..detection_disabled()
        };

        let scanned_without_info = scan_single_file("main.rs", "fn main() {}\n", &without_info);
        let scanned_with_info = scan_single_file("main.rs", "fn main() {}\n", &with_info);

        assert_eq!(scanned_without_info.is_source, None);
        assert_eq!(scanned_with_info.is_source, Some(true));
    }

    /// The scan root itself appears among the collected directories.
    #[test]
    fn collect_paths_includes_root_directory_entry() {
        let temp_dir = TempDir::new().expect("create temp dir");
        fs::create_dir_all(temp_dir.path().join("src")).expect("create nested dir");
        fs::write(temp_dir.path().join("src").join("main.rs"), "fn main() {}")
            .expect("write nested file");

        let collected = collect_paths(temp_dir.path(), 0, &[]);

        assert!(
            collected
                .directories
                .iter()
                .any(|(path, _)| path == temp_dir.path())
        );
    }

    /// Pointing `collect_paths` at a file yields exactly that file and no directories.
    #[test]
    fn collect_paths_supports_single_file_input() {
        let temp_dir = TempDir::new().expect("create temp dir");
        let file_path = temp_dir.path().join("main.rs");
        fs::write(&file_path, "fn main() {}\n").expect("write file");

        let collected = collect_paths(&file_path, 0, &[]);

        assert_eq!(collected.files.len(), 1);
        assert!(collected.directories.is_empty());
        assert_eq!(collected.files[0].0, file_path);
    }
}
393}