Skip to main content

provenant/scanner/
mod.rs

1mod collect;
2mod process;
3
4use std::path::PathBuf;
5
6use crate::models::FileInfo;
7
8pub struct ProcessResult {
9    pub files: Vec<FileInfo>,
10    pub excluded_count: usize,
11}
12
/// Flags and threshold that configure license scanning.
///
/// The derived [`Default`] turns every flag off and sets `min_score` to `0`.
#[derive(Clone, Copy, Debug, Default)]
pub struct LicenseScanOptions {
    /// Include text in the license output (presumably the matched text; confirm in `process`).
    pub include_text: bool,
    /// Include text diagnostics in the license output.
    pub include_text_diagnostics: bool,
    /// Include diagnostics in the license output.
    pub include_diagnostics: bool,
    /// Enable handling of unknown licenses.
    pub unknown_licenses: bool,
    /// Score threshold (presumably the minimum score a match needs to be kept; confirm in
    /// `process`).
    pub min_score: u8,
}
21
/// Switches and limits for the non-license detection passes of a scan.
///
/// The [`Default`] implementation enables only copyright detection, caps email and URL
/// findings at 50 each, uses a 120-second timeout and leaves the cache directory unset.
#[derive(Clone, Debug)]
pub struct TextDetectionOptions {
    /// Collect the file "info" surface (the tests in this module expect hashes, MIME type,
    /// language and classification flags to be populated only when this is set).
    pub collect_info: bool,
    /// Enable package detection.
    pub detect_packages: bool,
    /// Enable detection of application packages.
    pub detect_application_packages: bool,
    /// Enable detection of system packages.
    pub detect_system_packages: bool,
    /// Enable package detection inside compiled files.
    pub detect_packages_in_compiled: bool,
    /// Enable copyright detection.
    pub detect_copyrights: bool,
    /// Enable detection of generated files.
    pub detect_generated: bool,
    /// Enable email detection.
    pub detect_emails: bool,
    /// Enable URL detection.
    pub detect_urls: bool,
    /// Limit on reported emails (NOTE(review): presumably per file; confirm in `process`).
    pub max_emails: usize,
    /// Limit on reported URLs (NOTE(review): presumably per file; confirm in `process`).
    pub max_urls: usize,
    /// Timeout in seconds (NOTE(review): what it bounds is decided in `process`).
    pub timeout_seconds: f64,
    /// Optional directory for the scan cache.
    pub scan_cache_dir: Option<PathBuf>,
}

impl Default for TextDetectionOptions {
    /// Builds the baseline configuration: copyright detection on, everything else off.
    fn default() -> Self {
        let finding_limit: usize = 50;

        Self {
            // The only detector that is on by default.
            detect_copyrights: true,

            // Everything else is opt-in.
            collect_info: false,
            detect_packages: false,
            detect_application_packages: false,
            detect_system_packages: false,
            detect_packages_in_compiled: false,
            detect_generated: false,
            detect_emails: false,
            detect_urls: false,

            // Limits.
            max_emails: finding_limit,
            max_urls: finding_limit,
            timeout_seconds: 120.0,
            scan_cache_dir: None,
        }
    }
}
58
59pub use self::collect::{CollectedPaths, collect_paths};
60#[allow(unused_imports)]
61pub use self::process::{process_collected, process_collected_with_memory_limit};
62
63#[cfg(test)]
64mod tests {
65    use std::fs;
66    use std::sync::Arc;
67
68    use tempfile::TempDir;
69
70    use crate::models::FileType;
71    use crate::progress::{ProgressMode, ScanProgress};
72
73    use super::{
74        LicenseScanOptions, TextDetectionOptions, collect_paths, process_collected,
75        process_collected_with_memory_limit,
76    };
77
78    #[test]
79    fn default_options_keep_copyright_detection_enabled() {
80        let options = TextDetectionOptions::default();
81        assert!(!options.detect_packages);
82        assert!(options.detect_copyrights);
83    }
84
85    fn scan_single_file(
86        file_name: &str,
87        content: &str,
88        options: &TextDetectionOptions,
89    ) -> crate::models::FileInfo {
90        let temp_dir = TempDir::new().expect("create temp dir");
91        let file_path = temp_dir.path().join(file_name);
92        fs::write(&file_path, content).expect("write test file");
93
94        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
95        let collected = collect_paths(temp_dir.path(), 0, &[]);
96        let result = process_collected(
97            &collected,
98            progress,
99            None,
100            LicenseScanOptions::default(),
101            options,
102        );
103
104        result
105            .files
106            .into_iter()
107            .find(|entry| {
108                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
109            })
110            .expect("scanned file entry")
111    }
112
113    fn scan_file_at_relative_path(
114        relative_path: &str,
115        content: &[u8],
116        options: &TextDetectionOptions,
117    ) -> crate::models::FileInfo {
118        let temp_dir = TempDir::new().expect("create temp dir");
119        let file_path = temp_dir.path().join(relative_path);
120        if let Some(parent) = file_path.parent() {
121            fs::create_dir_all(parent).expect("create parent dirs");
122        }
123        fs::write(&file_path, content).expect("write test file");
124
125        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
126        let collected = collect_paths(temp_dir.path(), 0, &[]);
127        let result = process_collected(
128            &collected,
129            progress,
130            None,
131            LicenseScanOptions::default(),
132            options,
133        );
134
135        result
136            .files
137            .into_iter()
138            .find(|entry| {
139                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
140            })
141            .expect("scanned file entry")
142    }
143
144    #[test]
145    fn scanner_reports_repeated_email_occurrences() {
146        let options = TextDetectionOptions {
147            collect_info: false,
148            detect_packages: false,
149            detect_application_packages: false,
150            detect_system_packages: false,
151            detect_packages_in_compiled: false,
152            detect_copyrights: false,
153            detect_generated: false,
154            detect_emails: true,
155            detect_urls: false,
156            max_emails: 50,
157            max_urls: 50,
158            timeout_seconds: 120.0,
159            scan_cache_dir: None,
160        };
161        let scanned = scan_single_file(
162            "contacts.txt",
163            "linux@3ware.com\nlinux@3ware.com\nandre@suse.com\nlinux@3ware.com\n",
164            &options,
165        );
166
167        let emails: Vec<(&str, usize)> = scanned
168            .emails
169            .iter()
170            .map(|email| (email.email.as_str(), email.start_line))
171            .collect();
172
173        assert_eq!(emails.len(), 4, "emails: {emails:#?}");
174        assert_eq!(
175            emails,
176            vec![
177                ("linux@3ware.com", 1),
178                ("linux@3ware.com", 2),
179                ("andre@suse.com", 3),
180                ("linux@3ware.com", 4),
181            ]
182        );
183    }
184
185    #[test]
186    fn scanner_skips_pem_certificate_text_detection() {
187        let options = TextDetectionOptions {
188            collect_info: false,
189            detect_packages: false,
190            detect_application_packages: false,
191            detect_system_packages: false,
192            detect_packages_in_compiled: false,
193            detect_copyrights: true,
194            detect_generated: false,
195            detect_emails: true,
196            detect_urls: true,
197            max_emails: 50,
198            max_urls: 50,
199            timeout_seconds: 120.0,
200            scan_cache_dir: None,
201        };
202        let pem_fixture = concat!(
203            "-----BEGIN CERTIFICATE-----\n",
204            "MIID8TCCAtmgAwIBAgIQQT1yx/RrH4FDffHSKFTfmjANBgkqhkiG9w0BAQUFADCB\n",
205            "ijELMAkGA1UEBhMCQ0gxEDAOBgNVBAoTB1dJU2VLZXkxGzAZBgNVBAsTEkNvcHly\n",
206            "-----END CERTIFICATE-----\n",
207            "Certificate:\n",
208            "    Data:\n",
209            "        Signature Algorithm: sha1WithRSAEncryption\n",
210            "        Issuer: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
211            "        Subject: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
212            "        Contact: cert-owner@example.com\n",
213        );
214        let scanned = scan_single_file("cert.pem", pem_fixture, &options);
215
216        assert!(
217            scanned.copyrights.is_empty(),
218            "copyrights: {:#?}",
219            scanned.copyrights
220        );
221        assert!(
222            scanned.holders.is_empty(),
223            "holders: {:#?}",
224            scanned.holders
225        );
226        assert!(
227            scanned.authors.is_empty(),
228            "authors: {:#?}",
229            scanned.authors
230        );
231        assert!(scanned.emails.is_empty(), "emails: {:#?}", scanned.emails);
232        assert!(scanned.urls.is_empty(), "urls: {:#?}", scanned.urls);
233        assert!(
234            scanned.license_detections.is_empty(),
235            "licenses: {:#?}",
236            scanned.license_detections
237        );
238        assert!(
239            scanned.license_clues.is_empty(),
240            "license clues: {:#?}",
241            scanned.license_clues
242        );
243    }
244
245    #[test]
246    fn scanner_detects_structured_credits_authors() {
247        let options = TextDetectionOptions {
248            collect_info: false,
249            detect_packages: false,
250            detect_application_packages: false,
251            detect_system_packages: false,
252            detect_packages_in_compiled: false,
253            detect_copyrights: true,
254            detect_generated: false,
255            detect_emails: false,
256            detect_urls: false,
257            max_emails: 50,
258            max_urls: 50,
259            timeout_seconds: 120.0,
260            scan_cache_dir: None,
261        };
262        let credits_fixture = concat!(
263            "N: Jack Lloyd\n",
264            "E: lloyd@randombit.net\n",
265            "W: http://www.randombit.net/\n",
266        );
267        let scanned = scan_single_file("CREDITS", credits_fixture, &options);
268
269        let authors: Vec<(&str, usize, usize)> = scanned
270            .authors
271            .iter()
272            .map(|author| (author.author.as_str(), author.start_line, author.end_line))
273            .collect();
274
275        assert_eq!(
276            authors,
277            vec![(
278                "Jack Lloyd lloyd@randombit.net http://www.randombit.net/",
279                1,
280                3,
281            )]
282        );
283        assert!(scanned.copyrights.is_empty());
284        assert!(scanned.holders.is_empty());
285    }
286
287    #[test]
288    fn scanner_sets_generated_flag_when_enabled() {
289        let options = TextDetectionOptions {
290            collect_info: false,
291            detect_packages: false,
292            detect_application_packages: false,
293            detect_system_packages: false,
294            detect_packages_in_compiled: false,
295            detect_copyrights: false,
296            detect_generated: true,
297            detect_emails: false,
298            detect_urls: false,
299            max_emails: 50,
300            max_urls: 50,
301            timeout_seconds: 120.0,
302            scan_cache_dir: None,
303        };
304        let scanned = scan_single_file(
305            "generated.c",
306            "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
307            &options,
308        );
309
310        assert_eq!(scanned.is_generated, Some(true));
311    }
312
313    #[test]
314    fn scanner_leaves_generated_flag_unset_when_disabled() {
315        let options = TextDetectionOptions {
316            collect_info: false,
317            detect_packages: false,
318            detect_application_packages: false,
319            detect_system_packages: false,
320            detect_packages_in_compiled: false,
321            detect_copyrights: false,
322            detect_generated: false,
323            detect_emails: false,
324            detect_urls: false,
325            max_emails: 50,
326            max_urls: 50,
327            timeout_seconds: 120.0,
328            scan_cache_dir: None,
329        };
330        let scanned = scan_single_file(
331            "generated.c",
332            "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
333            &options,
334        );
335
336        assert_eq!(scanned.is_generated, None);
337    }
338
339    #[test]
340    fn scanner_populates_info_surface_when_enabled() {
341        let options = TextDetectionOptions {
342            collect_info: true,
343            detect_packages: false,
344            detect_application_packages: false,
345            detect_system_packages: false,
346            detect_packages_in_compiled: false,
347            detect_copyrights: false,
348            detect_generated: false,
349            detect_emails: false,
350            detect_urls: false,
351            max_emails: 50,
352            max_urls: 50,
353            timeout_seconds: 120.0,
354            scan_cache_dir: None,
355        };
356        let scanned = scan_single_file(
357            "script.py",
358            "#!/usr/bin/env python3\nprint(\"hello\")\n",
359            &options,
360        );
361
362        assert!(scanned.sha1.is_some());
363        assert!(scanned.md5.is_some());
364        assert!(scanned.sha256.is_some());
365        assert!(scanned.sha1_git.is_some());
366        assert!(scanned.mime_type.is_some());
367        assert!(scanned.date.is_some());
368        assert_eq!(scanned.programming_language.as_deref(), Some("Python"));
369        assert_eq!(scanned.is_text, Some(true));
370        assert_eq!(scanned.is_script, Some(true));
371        assert_eq!(scanned.is_source, Some(true));
372    }
373
374    #[test]
375    fn scanner_treats_latin1_python_sources_as_textual_scripts() {
376        let options = TextDetectionOptions {
377            collect_info: true,
378            detect_packages: false,
379            detect_application_packages: false,
380            detect_system_packages: false,
381            detect_packages_in_compiled: false,
382            detect_copyrights: false,
383            detect_generated: false,
384            detect_emails: false,
385            detect_urls: false,
386            max_emails: 50,
387            max_urls: 50,
388            timeout_seconds: 120.0,
389            scan_cache_dir: None,
390        };
391        let latin1_python = b"# coding: latin-1\nprint(\"caf\xe9\")\n# comment padding\n";
392        let scanned = scan_file_at_relative_path("script.py", latin1_python, &options);
393
394        assert_eq!(scanned.programming_language.as_deref(), Some("Python"));
395        assert_eq!(
396            scanned.file_type_label.as_deref(),
397            Some("python script, text executable")
398        );
399        assert_eq!(scanned.is_binary, Some(false));
400        assert_eq!(scanned.is_text, Some(true));
401        assert_eq!(scanned.is_script, Some(true));
402        assert_eq!(scanned.is_source, Some(true));
403    }
404
405    #[test]
406    fn scanner_skips_findings_for_zip_like_archives() {
407        let options = TextDetectionOptions {
408            collect_info: true,
409            detect_packages: false,
410            detect_application_packages: false,
411            detect_system_packages: false,
412            detect_packages_in_compiled: false,
413            detect_copyrights: true,
414            detect_generated: false,
415            detect_emails: true,
416            detect_urls: true,
417            max_emails: 50,
418            max_urls: 50,
419            timeout_seconds: 120.0,
420            scan_cache_dir: None,
421        };
422        let archive_like = b"PK\x03\x04\x14\x00\x00\x00\x08\x00MIT License\ncontact@example.com\nhttps://example.com\n";
423        let scanned = scan_file_at_relative_path("demo.whl", archive_like, &options);
424
425        assert_eq!(scanned.mime_type.as_deref(), Some("application/zip"));
426        assert_eq!(scanned.is_archive, Some(true));
427        assert!(scanned.license_detections.is_empty());
428        assert!(scanned.copyrights.is_empty());
429        assert!(scanned.emails.is_empty());
430        assert!(scanned.urls.is_empty());
431    }
432
433    #[test]
434    fn scanner_treats_typescript_sources_as_text_not_video_media() {
435        let options = TextDetectionOptions {
436            collect_info: true,
437            detect_packages: false,
438            detect_application_packages: false,
439            detect_system_packages: false,
440            detect_packages_in_compiled: false,
441            detect_copyrights: false,
442            detect_generated: false,
443            detect_emails: false,
444            detect_urls: false,
445            max_emails: 50,
446            max_urls: 50,
447            timeout_seconds: 120.0,
448            scan_cache_dir: None,
449        };
450        let scanned = scan_single_file("main.ts", "export const answer: number = 42;\n", &options);
451
452        assert_eq!(scanned.programming_language.as_deref(), Some("TypeScript"));
453        assert_eq!(scanned.mime_type.as_deref(), Some("text/plain"));
454        assert_eq!(
455            scanned.file_type_label.as_deref(),
456            Some("UTF-8 Unicode text")
457        );
458        assert_eq!(scanned.is_text, Some(true));
459        assert_eq!(scanned.is_media, Some(false));
460        assert_eq!(scanned.is_script, Some(false));
461        assert_eq!(scanned.is_source, Some(true));
462    }
463
464    #[test]
465    fn scanner_normalizes_sparse_ts_files_away_from_video_mime() {
466        let options = TextDetectionOptions {
467            collect_info: true,
468            detect_packages: false,
469            detect_application_packages: false,
470            detect_system_packages: false,
471            detect_packages_in_compiled: false,
472            detect_copyrights: false,
473            detect_generated: false,
474            detect_emails: false,
475            detect_urls: false,
476            max_emails: 50,
477            max_urls: 50,
478            timeout_seconds: 120.0,
479            scan_cache_dir: None,
480        };
481        let scanned = scan_single_file("main.ts", "// comment-only TypeScript fixture\n", &options);
482
483        assert_eq!(scanned.mime_type.as_deref(), Some("text/plain"));
484        assert_eq!(
485            scanned.file_type_label.as_deref(),
486            Some("UTF-8 Unicode text")
487        );
488        assert_eq!(scanned.is_text, Some(true));
489        assert_eq!(scanned.is_media, Some(false));
490        assert_eq!(scanned.is_script, Some(false));
491        assert_eq!(scanned.is_source, Some(true));
492    }
493
494    #[test]
495    fn scanner_treats_empty_files_like_scancode_info_surface() {
496        let options = TextDetectionOptions {
497            collect_info: true,
498            detect_packages: false,
499            detect_application_packages: false,
500            detect_system_packages: false,
501            detect_packages_in_compiled: false,
502            detect_copyrights: false,
503            detect_generated: false,
504            detect_emails: false,
505            detect_urls: false,
506            max_emails: 50,
507            max_urls: 50,
508            timeout_seconds: 120.0,
509            scan_cache_dir: None,
510        };
511        let scanned = scan_single_file("test.txt", "", &options);
512
513        assert_eq!(scanned.mime_type.as_deref(), Some("inode/x-empty"));
514        assert_eq!(scanned.file_type_label.as_deref(), Some("empty"));
515        assert_eq!(scanned.programming_language, None);
516        assert_eq!(scanned.is_binary, Some(false));
517        assert_eq!(scanned.is_text, Some(true));
518        assert_eq!(scanned.is_archive, Some(false));
519        assert_eq!(scanned.is_media, Some(false));
520        assert_eq!(scanned.is_source, Some(false));
521        assert_eq!(scanned.is_script, Some(false));
522    }
523
524    #[test]
525    fn scanner_treats_package_json_as_text_not_source() {
526        let options = TextDetectionOptions {
527            collect_info: true,
528            detect_packages: false,
529            detect_application_packages: false,
530            detect_system_packages: false,
531            detect_packages_in_compiled: false,
532            detect_copyrights: false,
533            detect_generated: false,
534            detect_emails: false,
535            detect_urls: false,
536            max_emails: 50,
537            max_urls: 50,
538            timeout_seconds: 120.0,
539            scan_cache_dir: None,
540        };
541        let scanned = scan_single_file("package.json", r#"{"name":"demo"}"#, &options);
542
543        assert_eq!(scanned.mime_type.as_deref(), Some("application/json"));
544        assert_eq!(scanned.file_type_label.as_deref(), Some("JSON text data"));
545        assert_eq!(scanned.programming_language, None);
546        assert_eq!(scanned.is_text, Some(true));
547        assert_eq!(scanned.is_source, Some(false));
548        assert_eq!(scanned.is_script, Some(false));
549    }
550
551    #[test]
552    fn scanner_classifies_gradle_and_nix_manifests_as_source() {
553        let options = TextDetectionOptions {
554            collect_info: true,
555            detect_packages: false,
556            detect_application_packages: false,
557            detect_system_packages: false,
558            detect_packages_in_compiled: false,
559            detect_copyrights: false,
560            detect_generated: false,
561            detect_emails: false,
562            detect_urls: false,
563            max_emails: 50,
564            max_urls: 50,
565            timeout_seconds: 120.0,
566            scan_cache_dir: None,
567        };
568
569        let gradle = scan_single_file("build.gradle", "plugins { id 'java' }\n", &options);
570        let nix = scan_single_file("flake.nix", "{ inputs, ... }: {}\n", &options);
571
572        assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
573        assert_eq!(gradle.mime_type.as_deref(), Some("text/plain"));
574        assert_eq!(gradle.is_source, Some(true));
575        assert_eq!(gradle.is_script, Some(false));
576
577        assert_eq!(nix.programming_language.as_deref(), Some("Nix"));
578        assert_eq!(nix.mime_type.as_deref(), Some("text/plain"));
579        assert_eq!(nix.is_source, Some(true));
580        assert_eq!(nix.is_script, Some(false));
581    }
582
583    #[test]
584    fn scanner_treats_gitmodules_as_text_not_source() {
585        let options = TextDetectionOptions {
586            collect_info: true,
587            detect_packages: false,
588            detect_application_packages: false,
589            detect_system_packages: false,
590            detect_packages_in_compiled: false,
591            detect_copyrights: false,
592            detect_generated: false,
593            detect_emails: false,
594            detect_urls: false,
595            max_emails: 50,
596            max_urls: 50,
597            timeout_seconds: 120.0,
598            scan_cache_dir: None,
599        };
600        let scanned = scan_file_at_relative_path(
601            ".gitmodules",
602            b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
603            &options,
604        );
605
606        assert_eq!(scanned.programming_language, None);
607        assert_eq!(
608            scanned.file_type_label.as_deref(),
609            Some("Git configuration text")
610        );
611        assert_eq!(scanned.is_text, Some(true));
612        assert_eq!(scanned.is_source, Some(false));
613        assert_eq!(scanned.is_script, Some(false));
614    }
615
616    #[test]
617    fn scanner_treats_javascript_shebang_files_as_scripts() {
618        let options = TextDetectionOptions {
619            collect_info: true,
620            detect_packages: false,
621            detect_application_packages: false,
622            detect_system_packages: false,
623            detect_packages_in_compiled: false,
624            detect_copyrights: false,
625            detect_generated: false,
626            detect_emails: false,
627            detect_urls: false,
628            max_emails: 50,
629            max_urls: 50,
630            timeout_seconds: 120.0,
631            scan_cache_dir: None,
632        };
633        let scanned = scan_file_at_relative_path(
634            "bin/run",
635            b"#!/usr/bin/env node\nconsole.log('hello');\n",
636            &options,
637        );
638
639        assert_eq!(scanned.programming_language.as_deref(), Some("JavaScript"));
640        assert_eq!(
641            scanned.file_type_label.as_deref(),
642            Some("javascript script, UTF-8 Unicode text executable")
643        );
644        assert_eq!(scanned.is_script, Some(true));
645        assert_eq!(scanned.is_source, Some(true));
646    }
647
648    #[test]
649    fn scanner_treats_dockerfile_as_source() {
650        let options = TextDetectionOptions {
651            collect_info: true,
652            detect_packages: false,
653            detect_application_packages: false,
654            detect_system_packages: false,
655            detect_packages_in_compiled: false,
656            detect_copyrights: false,
657            detect_generated: false,
658            detect_emails: false,
659            detect_urls: false,
660            max_emails: 50,
661            max_urls: 50,
662            timeout_seconds: 120.0,
663            scan_cache_dir: None,
664        };
665        let scanned = scan_single_file("Dockerfile", "FROM scratch\n", &options);
666
667        assert_eq!(scanned.programming_language.as_deref(), Some("Dockerfile"));
668        assert_eq!(
669            scanned.file_type_label.as_deref(),
670            Some("UTF-8 Unicode text")
671        );
672        assert_eq!(scanned.is_source, Some(true));
673        assert_eq!(scanned.is_script, Some(false));
674    }
675
676    #[test]
677    fn scanner_treats_makefile_as_text_not_source() {
678        let options = TextDetectionOptions {
679            collect_info: true,
680            detect_packages: false,
681            detect_application_packages: false,
682            detect_system_packages: false,
683            detect_packages_in_compiled: false,
684            detect_copyrights: false,
685            detect_generated: false,
686            detect_emails: false,
687            detect_urls: false,
688            max_emails: 50,
689            max_urls: 50,
690            timeout_seconds: 120.0,
691            scan_cache_dir: None,
692        };
693        let scanned = scan_single_file("Makefile", "all:\n\techo hi\n", &options);
694
695        assert_eq!(scanned.programming_language, None);
696        assert_eq!(
697            scanned.file_type_label.as_deref(),
698            Some("UTF-8 Unicode text")
699        );
700        assert_eq!(scanned.is_text, Some(true));
701        assert_eq!(scanned.is_source, Some(false));
702        assert_eq!(scanned.is_script, Some(false));
703    }
704
705    #[test]
706    fn scanner_omits_info_surface_when_disabled() {
707        let options = TextDetectionOptions {
708            collect_info: false,
709            detect_packages: false,
710            detect_application_packages: false,
711            detect_system_packages: false,
712            detect_packages_in_compiled: false,
713            detect_copyrights: false,
714            detect_generated: false,
715            detect_emails: false,
716            detect_urls: false,
717            max_emails: 50,
718            max_urls: 50,
719            timeout_seconds: 120.0,
720            scan_cache_dir: None,
721        };
722        let scanned = scan_single_file(
723            "script.py",
724            "#!/usr/bin/env python3\nprint(\"hello\")\n",
725            &options,
726        );
727
728        assert!(scanned.sha1.is_none());
729        assert!(scanned.md5.is_none());
730        assert!(scanned.sha256.is_none());
731        assert!(scanned.sha1_git.is_none());
732        assert!(scanned.mime_type.is_none());
733        assert!(scanned.date.is_none());
734        assert!(scanned.programming_language.is_none());
735        assert!(scanned.is_binary.is_none());
736        assert!(scanned.is_text.is_none());
737        assert!(scanned.is_archive.is_none());
738        assert!(scanned.is_media.is_none());
739        assert!(scanned.is_script.is_none());
740        assert!(scanned.is_source.is_none());
741    }
742
743    #[test]
744    fn scanner_skips_package_parsing_when_disabled() {
745        let options = TextDetectionOptions {
746            collect_info: false,
747            detect_packages: false,
748            detect_application_packages: false,
749            detect_system_packages: false,
750            detect_packages_in_compiled: false,
751            detect_copyrights: false,
752            detect_generated: false,
753            detect_emails: false,
754            detect_urls: false,
755            max_emails: 50,
756            max_urls: 50,
757            timeout_seconds: 120.0,
758            scan_cache_dir: None,
759        };
760        let scanned = scan_single_file(
761            "package.json",
762            r#"{"name":"demo","version":"1.0.0"}"#,
763            &options,
764        );
765
766        assert!(
767            scanned.package_data.is_empty(),
768            "package_data: {:#?}",
769            scanned.package_data
770        );
771    }
772
773    #[test]
774    fn scanner_parses_package_manifests_when_enabled() {
775        let options = TextDetectionOptions {
776            collect_info: false,
777            detect_packages: true,
778            detect_application_packages: true,
779            detect_system_packages: false,
780            detect_packages_in_compiled: false,
781            detect_copyrights: false,
782            detect_generated: false,
783            detect_emails: false,
784            detect_urls: false,
785            max_emails: 50,
786            max_urls: 50,
787            timeout_seconds: 120.0,
788            scan_cache_dir: None,
789        };
790        let scanned = scan_single_file(
791            "package.json",
792            r#"{"name":"demo","version":"1.0.0"}"#,
793            &options,
794        );
795
796        assert_eq!(
797            scanned.package_data.len(),
798            1,
799            "package_data: {:#?}",
800            scanned.package_data
801        );
802    }
803
804    #[test]
805    fn scanner_skips_application_packages_when_only_system_packages_enabled() {
806        let options = TextDetectionOptions {
807            collect_info: false,
808            detect_packages: true,
809            detect_application_packages: false,
810            detect_system_packages: true,
811            detect_packages_in_compiled: false,
812            detect_copyrights: false,
813            detect_generated: false,
814            detect_emails: false,
815            detect_urls: false,
816            max_emails: 50,
817            max_urls: 50,
818            timeout_seconds: 120.0,
819            scan_cache_dir: None,
820        };
821        let scanned = scan_single_file(
822            "package.json",
823            r#"{"name":"demo","version":"1.0.0"}"#,
824            &options,
825        );
826
827        assert!(
828            scanned.package_data.is_empty(),
829            "package_data: {:#?}",
830            scanned.package_data
831        );
832    }
833
834    #[test]
835    fn scanner_parses_system_package_files_when_enabled() {
836        let options = TextDetectionOptions {
837            collect_info: false,
838            detect_packages: true,
839            detect_application_packages: false,
840            detect_system_packages: true,
841            detect_packages_in_compiled: false,
842            detect_copyrights: false,
843            detect_generated: false,
844            detect_emails: false,
845            detect_urls: false,
846            max_emails: 50,
847            max_urls: 50,
848            timeout_seconds: 120.0,
849            scan_cache_dir: None,
850        };
851        let scanned = scan_file_at_relative_path(
852            "var/lib/dpkg/status",
853            b"Package: demo\nVersion: 1.0\nArchitecture: all\nDescription: demo package\n\n",
854            &options,
855        );
856
857        assert!(
858            !scanned.package_data.is_empty(),
859            "package_data: {:#?}",
860            scanned.package_data
861        );
862    }
863
864    #[test]
865    fn scanner_only_parses_compiled_packages_when_package_in_compiled_is_enabled() {
866        let temp_dir = TempDir::new().expect("create temp dir");
867        fs::write(
868            temp_dir.path().join("go.mod"),
869            "module example.com/demo\n\ngo 1.23.0\n",
870        )
871        .expect("write go.mod");
872        fs::write(
873            temp_dir.path().join("main.go"),
874            "package main\nfunc main() {}\n",
875        )
876        .expect("write main.go");
877        let file_path = temp_dir.path().join("demo");
878        let status = std::process::Command::new("go")
879            .current_dir(temp_dir.path())
880            .args(["build", "-o"])
881            .arg(&file_path)
882            .status()
883            .expect("run go build");
884        assert!(status.success());
885
886        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
887        let collected = collect_paths(temp_dir.path(), 0, &[]);
888
889        let without_compiled = process_collected(
890            &collected,
891            Arc::clone(&progress),
892            None,
893            LicenseScanOptions::default(),
894            &TextDetectionOptions {
895                collect_info: false,
896                detect_packages: true,
897                detect_application_packages: true,
898                detect_system_packages: false,
899                detect_packages_in_compiled: false,
900                detect_copyrights: false,
901                detect_generated: false,
902                detect_emails: false,
903                detect_urls: false,
904                max_emails: 50,
905                max_urls: 50,
906                timeout_seconds: 120.0,
907                scan_cache_dir: None,
908            },
909        );
910        let with_compiled = process_collected(
911            &collected,
912            progress,
913            None,
914            LicenseScanOptions::default(),
915            &TextDetectionOptions {
916                collect_info: false,
917                detect_packages: true,
918                detect_application_packages: true,
919                detect_system_packages: false,
920                detect_packages_in_compiled: true,
921                detect_copyrights: false,
922                detect_generated: false,
923                detect_emails: false,
924                detect_urls: false,
925                max_emails: 50,
926                max_urls: 50,
927                timeout_seconds: 120.0,
928                scan_cache_dir: None,
929            },
930        );
931
932        let without_compiled = without_compiled
933            .files
934            .into_iter()
935            .find(|entry| entry.file_type == FileType::File && entry.path.ends_with("/demo"))
936            .expect("compiled artifact present");
937        let with_compiled = with_compiled
938            .files
939            .into_iter()
940            .find(|entry| entry.file_type == FileType::File && entry.path.ends_with("/demo"))
941            .expect("compiled artifact present");
942
943        assert!(
944            without_compiled.package_data.is_empty(),
945            "package_data: {:#?}",
946            without_compiled.package_data
947        );
948        assert!(!with_compiled.package_data.is_empty());
949    }
950
951    #[test]
952    fn scanner_sets_is_source_only_when_info_enabled() {
953        let without_info = TextDetectionOptions {
954            collect_info: false,
955            detect_packages: false,
956            detect_application_packages: false,
957            detect_system_packages: false,
958            detect_packages_in_compiled: false,
959            detect_copyrights: false,
960            detect_generated: false,
961            detect_emails: false,
962            detect_urls: false,
963            max_emails: 50,
964            max_urls: 50,
965            timeout_seconds: 120.0,
966            scan_cache_dir: None,
967        };
968        let with_info = TextDetectionOptions {
969            collect_info: true,
970            ..without_info.clone()
971        };
972
973        let scanned_without_info = scan_single_file("main.rs", "fn main() {}\n", &without_info);
974        let scanned_with_info = scan_single_file("main.rs", "fn main() {}\n", &with_info);
975
976        assert_eq!(scanned_without_info.is_source, None);
977        assert_eq!(scanned_with_info.is_source, Some(true));
978    }
979
980    #[test]
981    fn directory_omits_info_fields_when_info_disabled() {
982        let temp_dir = TempDir::new().expect("create temp dir");
983        fs::create_dir_all(temp_dir.path().join("nested")).expect("create nested dir");
984
985        let collected = collect_paths(temp_dir.path(), 0, &[]);
986        let result = process_collected(
987            &collected,
988            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
989            None,
990            LicenseScanOptions::default(),
991            &TextDetectionOptions {
992                collect_info: false,
993                detect_packages: false,
994                detect_application_packages: false,
995                detect_system_packages: false,
996                detect_packages_in_compiled: false,
997                detect_copyrights: false,
998                detect_generated: false,
999                detect_emails: false,
1000                detect_urls: false,
1001                max_emails: 50,
1002                max_urls: 50,
1003                timeout_seconds: 120.0,
1004                scan_cache_dir: None,
1005            },
1006        );
1007
1008        let directory = result
1009            .files
1010            .into_iter()
1011            .find(|entry| entry.file_type == FileType::Directory && entry.path.ends_with("nested"))
1012            .expect("directory entry");
1013
1014        assert!(directory.date.is_none());
1015        assert!(directory.file_type_label.is_none());
1016        assert!(directory.is_binary.is_none());
1017        assert!(directory.is_text.is_none());
1018        assert!(directory.is_archive.is_none());
1019        assert!(directory.is_media.is_none());
1020        assert!(directory.is_source.is_none());
1021        assert!(directory.is_script.is_none());
1022    }
1023
1024    #[test]
1025    fn directory_includes_info_fields_when_info_enabled() {
1026        let temp_dir = TempDir::new().expect("create temp dir");
1027        fs::create_dir_all(temp_dir.path().join("nested")).expect("create nested dir");
1028
1029        let collected = collect_paths(temp_dir.path(), 0, &[]);
1030        let result = process_collected(
1031            &collected,
1032            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1033            None,
1034            LicenseScanOptions::default(),
1035            &TextDetectionOptions {
1036                collect_info: true,
1037                detect_packages: false,
1038                detect_application_packages: false,
1039                detect_system_packages: false,
1040                detect_packages_in_compiled: false,
1041                detect_copyrights: false,
1042                detect_generated: false,
1043                detect_emails: false,
1044                detect_urls: false,
1045                max_emails: 50,
1046                max_urls: 50,
1047                timeout_seconds: 120.0,
1048                scan_cache_dir: None,
1049            },
1050        );
1051
1052        let directory = result
1053            .files
1054            .into_iter()
1055            .find(|entry| entry.file_type == FileType::Directory && entry.path.ends_with("nested"))
1056            .expect("directory entry");
1057
1058        assert!(directory.date.is_none());
1059        assert!(directory.file_type_label.is_none());
1060        assert_eq!(directory.is_binary, Some(false));
1061        assert_eq!(directory.is_text, Some(false));
1062        assert_eq!(directory.is_archive, Some(false));
1063        assert_eq!(directory.is_media, Some(false));
1064        assert_eq!(directory.is_source, Some(false));
1065        assert_eq!(directory.is_script, Some(false));
1066        assert_eq!(directory.files_count, Some(0));
1067        assert_eq!(directory.dirs_count, Some(0));
1068        assert_eq!(directory.size_count, Some(0));
1069    }
1070
1071    #[test]
1072    fn collect_paths_includes_root_directory_entry() {
1073        let temp_dir = TempDir::new().expect("create temp dir");
1074        fs::create_dir_all(temp_dir.path().join("src")).expect("create nested dir");
1075        fs::write(temp_dir.path().join("src").join("main.rs"), "fn main() {}")
1076            .expect("write nested file");
1077
1078        let collected = collect_paths(temp_dir.path(), 0, &[]);
1079
1080        assert!(
1081            collected
1082                .directories
1083                .iter()
1084                .any(|(path, _)| path == temp_dir.path())
1085        );
1086    }
1087
1088    #[test]
1089    fn collect_paths_supports_single_file_input() {
1090        let temp_dir = TempDir::new().expect("create temp dir");
1091        let file_path = temp_dir.path().join("main.rs");
1092        fs::write(&file_path, "fn main() {}\n").expect("write file");
1093
1094        let collected = collect_paths(&file_path, 0, &[]);
1095
1096        assert_eq!(collected.files.len(), 1);
1097        assert!(collected.directories.is_empty());
1098        assert_eq!(collected.files[0].0, file_path);
1099    }
1100
1101    #[test]
1102    fn process_collected_with_memory_limit_preserves_results_when_spilling() {
1103        let temp_dir = TempDir::new().expect("create temp dir");
1104        fs::write(temp_dir.path().join("a.txt"), "hello").expect("write first file");
1105        fs::write(temp_dir.path().join("b.txt"), "world").expect("write second file");
1106
1107        let collected = collect_paths(temp_dir.path(), 0, &[]);
1108        let result = process_collected_with_memory_limit(
1109            &collected,
1110            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1111            None,
1112            LicenseScanOptions::default(),
1113            &TextDetectionOptions {
1114                collect_info: false,
1115                detect_packages: false,
1116                detect_application_packages: false,
1117                detect_system_packages: false,
1118                detect_packages_in_compiled: false,
1119                detect_copyrights: false,
1120                detect_generated: false,
1121                detect_emails: false,
1122                detect_urls: false,
1123                max_emails: 50,
1124                max_urls: 50,
1125                timeout_seconds: 120.0,
1126                scan_cache_dir: None,
1127            },
1128            1,
1129        );
1130
1131        assert_eq!(result.files.len(), 3);
1132    }
1133
1134    #[test]
1135    fn process_collected_with_negative_one_uses_disk_only_mode() {
1136        let temp_dir = TempDir::new().expect("create temp dir");
1137        fs::write(temp_dir.path().join("a.txt"), "hello").expect("write first file");
1138
1139        let collected = collect_paths(temp_dir.path(), 0, &[]);
1140        let result = process_collected_with_memory_limit(
1141            &collected,
1142            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1143            None,
1144            LicenseScanOptions::default(),
1145            &TextDetectionOptions {
1146                collect_info: false,
1147                detect_packages: false,
1148                detect_application_packages: false,
1149                detect_system_packages: false,
1150                detect_packages_in_compiled: false,
1151                detect_copyrights: false,
1152                detect_generated: false,
1153                detect_emails: false,
1154                detect_urls: false,
1155                max_emails: 50,
1156                max_urls: 50,
1157                timeout_seconds: 120.0,
1158                scan_cache_dir: None,
1159            },
1160            -1,
1161        );
1162
1163        assert_eq!(result.files.len(), 2);
1164    }
1165}