// provenant/scanner/mod.rs

1mod collect;
2mod process;
3
4use std::path::PathBuf;
5
6use crate::models::FileInfo;
7
8pub struct ProcessResult {
9    pub files: Vec<FileInfo>,
10    pub excluded_count: usize,
11}
12
/// Switches and limits that control which text-based detectors run on each
/// scanned file.
#[derive(Clone, Debug)]
pub struct TextDetectionOptions {
    /// When `true`, package manifests (e.g. `package.json`) are parsed into
    /// `package_data`; when `false`, `package_data` stays empty.
    pub detect_packages: bool,
    /// Enables copyright detection. The tests in this module also obtain
    /// author entries with only this flag turned on.
    pub detect_copyrights: bool,
    /// When `true`, the scanner fills in the `is_generated` flag of a file.
    pub detect_generated: bool,
    /// When `true`, email addresses found in the file are reported, one entry
    /// per occurrence.
    pub detect_emails: bool,
    /// Enables URL detection.
    pub detect_urls: bool,
    /// Limit applied to reported emails — presumably a per-file cap
    /// (TODO confirm against the processing code).
    pub max_emails: usize,
    /// Limit applied to reported URLs — presumably a per-file cap
    /// (TODO confirm against the processing code).
    pub max_urls: usize,
    /// Time budget in seconds — presumably per file (TODO confirm).
    pub timeout_seconds: f64,
    /// Optional directory for a scan cache; `None` presumably disables
    /// caching (TODO confirm).
    pub scan_cache_dir: Option<PathBuf>,
}
25
26impl Default for TextDetectionOptions {
27    fn default() -> Self {
28        Self {
29            detect_packages: false,
30            detect_copyrights: true,
31            detect_generated: false,
32            detect_emails: false,
33            detect_urls: false,
34            max_emails: 50,
35            max_urls: 50,
36            timeout_seconds: 120.0,
37            scan_cache_dir: None,
38        }
39    }
40}
41
42#[allow(unused_imports)]
43pub use self::collect::{CollectedPaths, collect_paths};
44pub use self::process::process_collected;
45
46#[cfg(test)]
47mod tests {
48    use std::fs;
49    use std::sync::Arc;
50
51    use tempfile::TempDir;
52
53    use crate::models::FileType;
54    use crate::progress::{ProgressMode, ScanProgress};
55
56    use super::{TextDetectionOptions, collect_paths, process_collected};
57
58    #[test]
59    fn default_options_keep_copyright_detection_enabled() {
60        let options = TextDetectionOptions::default();
61        assert!(!options.detect_packages);
62        assert!(options.detect_copyrights);
63    }
64
65    fn scan_single_file(
66        file_name: &str,
67        content: &str,
68        options: &TextDetectionOptions,
69    ) -> crate::models::FileInfo {
70        let temp_dir = TempDir::new().expect("create temp dir");
71        let file_path = temp_dir.path().join(file_name);
72        fs::write(&file_path, content).expect("write test file");
73
74        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
75        let collected = collect_paths(temp_dir.path(), 0, &[]);
76        let result = process_collected(&collected, progress, None, false, options);
77
78        result
79            .files
80            .into_iter()
81            .find(|entry| {
82                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
83            })
84            .expect("scanned file entry")
85    }
86
87    #[test]
88    fn scanner_reports_repeated_email_occurrences() {
89        let options = TextDetectionOptions {
90            detect_packages: false,
91            detect_copyrights: false,
92            detect_generated: false,
93            detect_emails: true,
94            detect_urls: false,
95            max_emails: 50,
96            max_urls: 50,
97            timeout_seconds: 120.0,
98            scan_cache_dir: None,
99        };
100        let scanned = scan_single_file(
101            "contacts.txt",
102            "linux@3ware.com\nlinux@3ware.com\nandre@suse.com\nlinux@3ware.com\n",
103            &options,
104        );
105
106        let emails: Vec<(&str, usize)> = scanned
107            .emails
108            .iter()
109            .map(|email| (email.email.as_str(), email.start_line))
110            .collect();
111
112        assert_eq!(emails.len(), 4, "emails: {emails:#?}");
113        assert_eq!(
114            emails,
115            vec![
116                ("linux@3ware.com", 1),
117                ("linux@3ware.com", 2),
118                ("andre@suse.com", 3),
119                ("linux@3ware.com", 4),
120            ]
121        );
122    }
123
124    #[test]
125    fn scanner_skips_pem_certificate_text_detection() {
126        let options = TextDetectionOptions {
127            detect_packages: false,
128            detect_copyrights: true,
129            detect_generated: false,
130            detect_emails: true,
131            detect_urls: true,
132            max_emails: 50,
133            max_urls: 50,
134            timeout_seconds: 120.0,
135            scan_cache_dir: None,
136        };
137        let pem_fixture = concat!(
138            "-----BEGIN CERTIFICATE-----\n",
139            "MIID8TCCAtmgAwIBAgIQQT1yx/RrH4FDffHSKFTfmjANBgkqhkiG9w0BAQUFADCB\n",
140            "ijELMAkGA1UEBhMCQ0gxEDAOBgNVBAoTB1dJU2VLZXkxGzAZBgNVBAsTEkNvcHly\n",
141            "-----END CERTIFICATE-----\n",
142            "Certificate:\n",
143            "    Data:\n",
144            "        Signature Algorithm: sha1WithRSAEncryption\n",
145            "        Issuer: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
146            "        Subject: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
147            "        Contact: cert-owner@example.com\n",
148        );
149        let scanned = scan_single_file("cert.pem", pem_fixture, &options);
150
151        assert!(
152            scanned.copyrights.is_empty(),
153            "copyrights: {:#?}",
154            scanned.copyrights
155        );
156        assert!(
157            scanned.holders.is_empty(),
158            "holders: {:#?}",
159            scanned.holders
160        );
161        assert!(
162            scanned.authors.is_empty(),
163            "authors: {:#?}",
164            scanned.authors
165        );
166        assert!(scanned.emails.is_empty(), "emails: {:#?}", scanned.emails);
167        assert!(scanned.urls.is_empty(), "urls: {:#?}", scanned.urls);
168        assert!(
169            scanned.license_detections.is_empty(),
170            "licenses: {:#?}",
171            scanned.license_detections
172        );
173    }
174
175    #[test]
176    fn scanner_detects_structured_credits_authors() {
177        let options = TextDetectionOptions {
178            detect_packages: false,
179            detect_copyrights: true,
180            detect_generated: false,
181            detect_emails: false,
182            detect_urls: false,
183            max_emails: 50,
184            max_urls: 50,
185            timeout_seconds: 120.0,
186            scan_cache_dir: None,
187        };
188        let credits_fixture = concat!(
189            "N: Jack Lloyd\n",
190            "E: lloyd@randombit.net\n",
191            "W: http://www.randombit.net/\n",
192        );
193        let scanned = scan_single_file("CREDITS", credits_fixture, &options);
194
195        let authors: Vec<(&str, usize, usize)> = scanned
196            .authors
197            .iter()
198            .map(|author| (author.author.as_str(), author.start_line, author.end_line))
199            .collect();
200
201        assert_eq!(
202            authors,
203            vec![(
204                "Jack Lloyd lloyd@randombit.net http://www.randombit.net/",
205                1,
206                3,
207            )]
208        );
209        assert!(scanned.copyrights.is_empty());
210        assert!(scanned.holders.is_empty());
211    }
212
213    #[test]
214    fn scanner_sets_generated_flag_when_enabled() {
215        let options = TextDetectionOptions {
216            detect_packages: false,
217            detect_copyrights: false,
218            detect_generated: true,
219            detect_emails: false,
220            detect_urls: false,
221            max_emails: 50,
222            max_urls: 50,
223            timeout_seconds: 120.0,
224            scan_cache_dir: None,
225        };
226        let scanned = scan_single_file(
227            "generated.c",
228            "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
229            &options,
230        );
231
232        assert_eq!(scanned.is_generated, Some(true));
233    }
234
235    #[test]
236    fn scanner_skips_package_parsing_when_disabled() {
237        let options = TextDetectionOptions {
238            detect_packages: false,
239            detect_copyrights: false,
240            detect_generated: false,
241            detect_emails: false,
242            detect_urls: false,
243            max_emails: 50,
244            max_urls: 50,
245            timeout_seconds: 120.0,
246            scan_cache_dir: None,
247        };
248        let scanned = scan_single_file(
249            "package.json",
250            r#"{"name":"demo","version":"1.0.0"}"#,
251            &options,
252        );
253
254        assert!(
255            scanned.package_data.is_empty(),
256            "package_data: {:#?}",
257            scanned.package_data
258        );
259    }
260
261    #[test]
262    fn scanner_parses_package_manifests_when_enabled() {
263        let options = TextDetectionOptions {
264            detect_packages: true,
265            detect_copyrights: false,
266            detect_generated: false,
267            detect_emails: false,
268            detect_urls: false,
269            max_emails: 50,
270            max_urls: 50,
271            timeout_seconds: 120.0,
272            scan_cache_dir: None,
273        };
274        let scanned = scan_single_file(
275            "package.json",
276            r#"{"name":"demo","version":"1.0.0"}"#,
277            &options,
278        );
279
280        assert_eq!(
281            scanned.package_data.len(),
282            1,
283            "package_data: {:#?}",
284            scanned.package_data
285        );
286    }
287}