Skip to main content

provenant/scanner/
mod.rs

1mod collect;
2mod process;
3
4use crate::license_detection::LicenseDetectionEngine;
5use crate::models::FileInfo;
6
7pub struct ProcessResult {
8    pub files: Vec<FileInfo>,
9    pub excluded_count: usize,
10}
11
/// License-scan switches.
///
/// The type is `Copy`, so it is passed by value. Every field is written into
/// the string built by `scan_options_fingerprint`. The derived `Default`
/// yields `false` for every flag and `0` for `min_score`.
#[derive(Debug, Clone, Copy, Default)]
pub struct LicenseScanOptions {
    /// Recorded as `license_text` in the fingerprint; presumably asks for the
    /// matched text to be included in results — confirm against the engine.
    pub include_text: bool,
    /// Recorded as `license_text_diagnostics` in the fingerprint.
    pub include_text_diagnostics: bool,
    /// Recorded as `license_diagnostics` in the fingerprint.
    pub include_diagnostics: bool,
    /// Recorded as `unknown_licenses` in the fingerprint.
    pub unknown_licenses: bool,
    /// Recorded as `license_score` in the fingerprint; presumably the minimum
    /// match score to keep — confirm against the detection code.
    pub min_score: u8,
}
20
/// Switches and limits for the non-license parts of a scan.
///
/// Every field is written into the string built by
/// `scan_options_fingerprint`.
#[derive(Debug, Clone)]
pub struct TextDetectionOptions {
    /// Recorded as `info` in the fingerprint. The tests in this module expect
    /// hashes, MIME type and language fields to be filled in when it is set.
    pub collect_info: bool,
    /// Recorded as `packages` in the fingerprint.
    pub detect_packages: bool,
    /// Recorded as `app_packages` in the fingerprint.
    pub detect_application_packages: bool,
    /// Recorded as `system_packages` in the fingerprint.
    pub detect_system_packages: bool,
    /// Recorded as `compiled_packages` in the fingerprint.
    pub detect_packages_in_compiled: bool,
    /// Copyright detection switch; the only flag that defaults to `true`.
    pub detect_copyrights: bool,
    /// Recorded as `generated` in the fingerprint.
    pub detect_generated: bool,
    /// Recorded as `emails` in the fingerprint.
    pub detect_emails: bool,
    /// Recorded as `urls` in the fingerprint.
    pub detect_urls: bool,
    /// Presumably the cap on reported e-mail findings per file — confirm
    /// against the code that consumes it.
    pub max_emails: usize,
    /// Presumably the cap on reported URL findings per file — confirm.
    pub max_urls: usize,
    /// Timeout in seconds; its scope (per file or per scan) is not visible
    /// here.
    pub timeout_seconds: f64,
}

impl Default for TextDetectionOptions {
    /// Everything off except copyright detection, with both finding limits
    /// at 50 and a 120-second timeout.
    fn default() -> Self {
        const DEFAULT_MAX_FINDINGS: usize = 50;
        const DEFAULT_TIMEOUT_SECONDS: f64 = 120.0;

        Self {
            // The single detector that is on out of the box.
            detect_copyrights: true,

            collect_info: false,
            detect_packages: false,
            detect_application_packages: false,
            detect_system_packages: false,
            detect_packages_in_compiled: false,
            detect_generated: false,
            detect_emails: false,
            detect_urls: false,

            max_emails: DEFAULT_MAX_FINDINGS,
            max_urls: DEFAULT_MAX_FINDINGS,
            timeout_seconds: DEFAULT_TIMEOUT_SECONDS,
        }
    }
}
55
56pub fn scan_options_fingerprint(
57    text_options: &TextDetectionOptions,
58    license_options: LicenseScanOptions,
59    license_engine: Option<&LicenseDetectionEngine>,
60) -> String {
61    let (license_enabled, rules_count, first_rule_id, last_rule_id) = match license_engine {
62        Some(engine) => {
63            let rules = &engine.index().rules_by_rid;
64            (
65                true,
66                rules.len(),
67                rules
68                    .first()
69                    .map(|rule| rule.identifier.as_str())
70                    .unwrap_or(""),
71                rules
72                    .last()
73                    .map(|rule| rule.identifier.as_str())
74                    .unwrap_or(""),
75            )
76        }
77        None => (false, 0, "", ""),
78    };
79
80    format!(
81        "tool_version={};info={};packages={};app_packages={};system_packages={};compiled_packages={};copyrights={};generated={};emails={};urls={};max_emails={};max_urls={};timeout={:.6};license_enabled={};rules_count={};first_rule_id={};last_rule_id={};license_text={};license_text_diagnostics={};license_diagnostics={};unknown_licenses={};license_score={}",
82        crate::version::BUILD_VERSION,
83        text_options.collect_info,
84        text_options.detect_packages,
85        text_options.detect_application_packages,
86        text_options.detect_system_packages,
87        text_options.detect_packages_in_compiled,
88        text_options.detect_copyrights,
89        text_options.detect_generated,
90        text_options.detect_emails,
91        text_options.detect_urls,
92        text_options.max_emails,
93        text_options.max_urls,
94        text_options.timeout_seconds,
95        license_enabled,
96        rules_count,
97        first_rule_id,
98        last_rule_id,
99        license_options.include_text,
100        license_options.include_text_diagnostics,
101        license_options.include_diagnostics,
102        license_options.unknown_licenses,
103        license_options.min_score,
104    )
105}
106
107pub use self::collect::{CollectedPaths, collect_paths};
108#[allow(unused_imports)]
109pub use self::process::{
110    MemoryMode, process_collected, process_collected_sequential,
111    process_collected_with_memory_limit, process_collected_with_memory_limit_sequential,
112};
113
114#[cfg(test)]
115mod tests {
116    use std::fs;
117    use std::sync::Arc;
118
119    use tempfile::TempDir;
120
121    use crate::license_detection::LicenseDetectionEngine;
122    use crate::models::{DatasourceId, FileType, PackageType as FilePackageType};
123    use crate::progress::{ProgressMode, ScanProgress};
124
125    use super::{
126        LicenseScanOptions, MemoryMode, TextDetectionOptions, collect_paths, process_collected,
127        process_collected_with_memory_limit, scan_options_fingerprint,
128    };
129
130    #[test]
131    fn default_options_keep_copyright_detection_enabled() {
132        let options = TextDetectionOptions::default();
133        assert!(!options.detect_packages);
134        assert!(options.detect_copyrights);
135    }
136
137    #[test]
138    fn test_scan_options_fingerprint_changes_with_license_score() {
139        let text_options = TextDetectionOptions::default();
140        let default_fingerprint = scan_options_fingerprint(
141            &text_options,
142            LicenseScanOptions {
143                min_score: 0,
144                ..LicenseScanOptions::default()
145            },
146            None,
147        );
148        let filtered_fingerprint = scan_options_fingerprint(
149            &text_options,
150            LicenseScanOptions {
151                min_score: 70,
152                ..LicenseScanOptions::default()
153            },
154            None,
155        );
156
157        assert_ne!(default_fingerprint, filtered_fingerprint);
158    }
159
160    fn scan_single_file(
161        file_name: &str,
162        content: &str,
163        options: &TextDetectionOptions,
164    ) -> crate::models::FileInfo {
165        let temp_dir = TempDir::new().expect("create temp dir");
166        let file_path = temp_dir.path().join(file_name);
167        fs::write(&file_path, content).expect("write test file");
168
169        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
170        let collected = collect_paths(temp_dir.path(), 0, &[]);
171        let result = process_collected(
172            &collected,
173            progress,
174            None,
175            LicenseScanOptions::default(),
176            options,
177        );
178
179        result
180            .files
181            .into_iter()
182            .find(|entry| {
183                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
184            })
185            .expect("scanned file entry")
186    }
187
188    fn scan_file_at_relative_path(
189        relative_path: &str,
190        content: &[u8],
191        options: &TextDetectionOptions,
192    ) -> crate::models::FileInfo {
193        let temp_dir = TempDir::new().expect("create temp dir");
194        let file_path = temp_dir.path().join(relative_path);
195        if let Some(parent) = file_path.parent() {
196            fs::create_dir_all(parent).expect("create parent dirs");
197        }
198        fs::write(&file_path, content).expect("write test file");
199
200        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
201        let collected = collect_paths(temp_dir.path(), 0, &[]);
202        let result = process_collected(
203            &collected,
204            progress,
205            None,
206            LicenseScanOptions::default(),
207            options,
208        );
209
210        result
211            .files
212            .into_iter()
213            .find(|entry| {
214                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
215            })
216            .expect("scanned file entry")
217    }
218
219    fn scan_single_file_with_license_engine(
220        file_name: &str,
221        content: &str,
222        options: &TextDetectionOptions,
223    ) -> crate::models::FileInfo {
224        let temp_dir = TempDir::new().expect("create temp dir");
225        let file_path = temp_dir.path().join(file_name);
226        fs::write(&file_path, content).expect("write test file");
227
228        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
229        let collected = collect_paths(temp_dir.path(), 0, &[]);
230        let engine =
231            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
232        let result = process_collected(
233            &collected,
234            progress,
235            Some(engine),
236            LicenseScanOptions::default(),
237            options,
238        );
239
240        result
241            .files
242            .into_iter()
243            .find(|entry| {
244                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
245            })
246            .expect("scanned file entry")
247    }
248
249    #[test]
250    fn scanner_reports_repeated_email_occurrences() {
251        let options = TextDetectionOptions {
252            collect_info: false,
253            detect_packages: false,
254            detect_application_packages: false,
255            detect_system_packages: false,
256            detect_packages_in_compiled: false,
257            detect_copyrights: false,
258            detect_generated: false,
259            detect_emails: true,
260            detect_urls: false,
261            max_emails: 50,
262            max_urls: 50,
263            timeout_seconds: 120.0,
264        };
265        let scanned = scan_single_file(
266            "contacts.txt",
267            "linux@3ware.com\nlinux@3ware.com\nandre@suse.com\nlinux@3ware.com\n",
268            &options,
269        );
270
271        let emails: Vec<(&str, usize)> = scanned
272            .emails
273            .iter()
274            .map(|email| (email.email.as_str(), email.start_line.get()))
275            .collect();
276
277        assert_eq!(emails.len(), 4, "emails: {emails:#?}");
278        assert_eq!(
279            emails,
280            vec![
281                ("linux@3ware.com", 1),
282                ("linux@3ware.com", 2),
283                ("andre@suse.com", 3),
284                ("linux@3ware.com", 4),
285            ]
286        );
287    }
288
289    #[test]
290    fn scanner_skips_pem_certificate_text_detection() {
291        let options = TextDetectionOptions {
292            collect_info: false,
293            detect_packages: false,
294            detect_application_packages: false,
295            detect_system_packages: false,
296            detect_packages_in_compiled: false,
297            detect_copyrights: true,
298            detect_generated: false,
299            detect_emails: true,
300            detect_urls: true,
301            max_emails: 50,
302            max_urls: 50,
303            timeout_seconds: 120.0,
304        };
305        let pem_fixture = concat!(
306            "-----BEGIN CERTIFICATE-----\n",
307            "MIID8TCCAtmgAwIBAgIQQT1yx/RrH4FDffHSKFTfmjANBgkqhkiG9w0BAQUFADCB\n",
308            "ijELMAkGA1UEBhMCQ0gxEDAOBgNVBAoTB1dJU2VLZXkxGzAZBgNVBAsTEkNvcHly\n",
309            "-----END CERTIFICATE-----\n",
310            "Certificate:\n",
311            "    Data:\n",
312            "        Signature Algorithm: sha1WithRSAEncryption\n",
313            "        Issuer: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
314            "        Subject: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
315            "        Contact: cert-owner@example.com\n",
316        );
317        let scanned = scan_single_file("cert.pem", pem_fixture, &options);
318
319        assert!(
320            scanned.copyrights.is_empty(),
321            "copyrights: {:#?}",
322            scanned.copyrights
323        );
324        assert!(
325            scanned.holders.is_empty(),
326            "holders: {:#?}",
327            scanned.holders
328        );
329        assert!(
330            scanned.authors.is_empty(),
331            "authors: {:#?}",
332            scanned.authors
333        );
334        assert!(scanned.emails.is_empty(), "emails: {:#?}", scanned.emails);
335        assert!(scanned.urls.is_empty(), "urls: {:#?}", scanned.urls);
336        assert!(
337            scanned.license_detections.is_empty(),
338            "licenses: {:#?}",
339            scanned.license_detections
340        );
341        assert!(
342            scanned.license_clues.is_empty(),
343            "license clues: {:#?}",
344            scanned.license_clues
345        );
346    }
347
348    #[test]
349    fn scanner_keeps_source_headers_when_pem_blocks_are_embedded() {
350        let options = TextDetectionOptions {
351            collect_info: false,
352            detect_packages: false,
353            detect_application_packages: false,
354            detect_system_packages: false,
355            detect_packages_in_compiled: false,
356            detect_copyrights: true,
357            detect_generated: false,
358            detect_emails: false,
359            detect_urls: true,
360            max_emails: 50,
361            max_urls: 50,
362            timeout_seconds: 120.0,
363        };
364        let fixture = concat!(
365            "/*\n",
366            "Copyright 2022 The Kubernetes Authors.\n\n",
367            "Licensed under the Apache License, Version 2.0 (the \"License\");\n",
368            "you may not use this file except in compliance with the License.\n",
369            "You may obtain a copy of the License at\n\n",
370            "    http://www.apache.org/licenses/LICENSE-2.0\n",
371            "*/\n\n",
372            "package storage\n\n",
373            "const validCert = `\n",
374            "-----BEGIN CERTIFICATE-----\n",
375            "MIIDmTCCAoGgAwIBAgIUWQ==\n",
376            "-----END CERTIFICATE-----\n",
377            "`\n",
378        );
379        let temp_dir = TempDir::new().expect("create temp dir");
380        let file_path = temp_dir.path().join("storage_test.go");
381        fs::write(&file_path, fixture).expect("write fixture");
382
383        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
384        let collected = collect_paths(temp_dir.path(), 0, &[]);
385        let engine =
386            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
387        let result = process_collected(
388            &collected,
389            progress,
390            Some(engine),
391            LicenseScanOptions::default(),
392            &options,
393        );
394        let scanned = result
395            .files
396            .into_iter()
397            .find(|entry| {
398                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
399            })
400            .expect("scanned file entry");
401
402        assert!(
403            scanned
404                .copyrights
405                .iter()
406                .any(|c| c.copyright == "Copyright 2022 The Kubernetes Authors"),
407            "copyrights: {:#?}",
408            scanned.copyrights
409        );
410        assert!(
411            scanned
412                .holders
413                .iter()
414                .any(|h| h.holder == "The Kubernetes Authors"),
415            "holders: {:#?}",
416            scanned.holders
417        );
418        assert!(
419            scanned
420                .urls
421                .iter()
422                .any(|u| u.url == "http://www.apache.org/licenses/LICENSE-2.0"),
423            "urls: {:#?}",
424            scanned.urls
425        );
426        assert_eq!(scanned.license_expression.as_deref(), Some("Apache-2.0"));
427    }
428
429    #[test]
430    fn scanner_detects_structured_credits_authors() {
431        let options = TextDetectionOptions {
432            collect_info: false,
433            detect_packages: false,
434            detect_application_packages: false,
435            detect_system_packages: false,
436            detect_packages_in_compiled: false,
437            detect_copyrights: true,
438            detect_generated: false,
439            detect_emails: false,
440            detect_urls: false,
441            max_emails: 50,
442            max_urls: 50,
443            timeout_seconds: 120.0,
444        };
445        let credits_fixture = concat!(
446            "N: Jack Lloyd\n",
447            "E: lloyd@randombit.net\n",
448            "W: http://www.randombit.net/\n",
449        );
450        let scanned = scan_single_file("CREDITS", credits_fixture, &options);
451
452        let authors: Vec<(&str, usize, usize)> = scanned
453            .authors
454            .iter()
455            .map(|author| {
456                (
457                    author.author.as_str(),
458                    author.start_line.get(),
459                    author.end_line.get(),
460                )
461            })
462            .collect();
463
464        assert_eq!(
465            authors,
466            vec![(
467                "Jack Lloyd lloyd@randombit.net http://www.randombit.net/",
468                1,
469                3,
470            )]
471        );
472        assert!(scanned.copyrights.is_empty());
473        assert!(scanned.holders.is_empty());
474    }
475
476    #[test]
477    fn scanner_uses_or_for_alternative_license_header() {
478        let fixture =
479            include_str!("../../testdata/license-golden/datadriven/external/boost-json-d2s.ipp");
480        let temp_dir = TempDir::new().expect("create temp dir");
481        let file_path = temp_dir.path().join("d2s.ipp");
482        fs::write(&file_path, fixture).expect("write fixture");
483
484        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
485        let collected = collect_paths(temp_dir.path(), 0, &[]);
486        let engine =
487            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
488        let result = process_collected(
489            &collected,
490            progress,
491            Some(engine),
492            LicenseScanOptions::default(),
493            &TextDetectionOptions::default(),
494        );
495        let scanned = result
496            .files
497            .into_iter()
498            .find(|entry| {
499                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
500            })
501            .expect("scanned file entry");
502
503        assert_eq!(
504            scanned.license_expression.as_deref(),
505            Some("Apache-2.0 OR BSL-1.0")
506        );
507        assert!(
508            scanned.license_clues.is_empty(),
509            "license clues: {:#?}",
510            scanned.license_clues
511        );
512        assert_eq!(
513            scanned.license_detections.len(),
514            1,
515            "detections: {:#?}",
516            scanned.license_detections
517        );
518
519        let detection = &scanned.license_detections[0];
520        assert_eq!(detection.license_expression_spdx, "Apache-2.0 OR BSL-1.0");
521
522        let match_expressions: Vec<_> = detection
523            .matches
524            .iter()
525            .map(|m| m.license_expression_spdx.as_str())
526            .collect();
527        assert_eq!(match_expressions, vec!["Apache-2.0", "BSL-1.0"]);
528    }
529
530    #[test]
531    fn scanner_sets_generated_flag_when_enabled() {
532        let options = TextDetectionOptions {
533            collect_info: false,
534            detect_packages: false,
535            detect_application_packages: false,
536            detect_system_packages: false,
537            detect_packages_in_compiled: false,
538            detect_copyrights: false,
539            detect_generated: true,
540            detect_emails: false,
541            detect_urls: false,
542            max_emails: 50,
543            max_urls: 50,
544            timeout_seconds: 120.0,
545        };
546        let scanned = scan_single_file(
547            "generated.c",
548            "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
549            &options,
550        );
551
552        assert_eq!(scanned.is_generated, Some(true));
553    }
554
555    #[test]
556    fn scanner_leaves_generated_flag_unset_when_disabled() {
557        let options = TextDetectionOptions {
558            collect_info: false,
559            detect_packages: false,
560            detect_application_packages: false,
561            detect_system_packages: false,
562            detect_packages_in_compiled: false,
563            detect_copyrights: false,
564            detect_generated: false,
565            detect_emails: false,
566            detect_urls: false,
567            max_emails: 50,
568            max_urls: 50,
569            timeout_seconds: 120.0,
570        };
571        let scanned = scan_single_file(
572            "generated.c",
573            "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
574            &options,
575        );
576
577        assert_eq!(scanned.is_generated, None);
578    }
579
580    #[test]
581    fn scanner_populates_info_surface_when_enabled() {
582        let options = TextDetectionOptions {
583            collect_info: true,
584            detect_packages: false,
585            detect_application_packages: false,
586            detect_system_packages: false,
587            detect_packages_in_compiled: false,
588            detect_copyrights: false,
589            detect_generated: false,
590            detect_emails: false,
591            detect_urls: false,
592            max_emails: 50,
593            max_urls: 50,
594            timeout_seconds: 120.0,
595        };
596        let scanned = scan_single_file(
597            "script.py",
598            "#!/usr/bin/env python3\nprint(\"hello\")\n",
599            &options,
600        );
601
602        assert!(scanned.sha1.is_some());
603        assert!(scanned.md5.is_some());
604        assert!(scanned.sha256.is_some());
605        assert!(scanned.sha1_git.is_some());
606        assert!(scanned.mime_type.is_some());
607        assert!(scanned.date.is_some());
608        assert_eq!(scanned.programming_language.as_deref(), Some("Python"));
609        assert_eq!(scanned.is_text, Some(true));
610        assert_eq!(scanned.is_script, Some(true));
611        assert_eq!(scanned.is_source, Some(true));
612    }
613
614    #[test]
615    fn scanner_treats_latin1_python_sources_as_textual_scripts() {
616        let options = TextDetectionOptions {
617            collect_info: true,
618            detect_packages: false,
619            detect_application_packages: false,
620            detect_system_packages: false,
621            detect_packages_in_compiled: false,
622            detect_copyrights: false,
623            detect_generated: false,
624            detect_emails: false,
625            detect_urls: false,
626            max_emails: 50,
627            max_urls: 50,
628            timeout_seconds: 120.0,
629        };
630        let latin1_python = b"# coding: latin-1\nprint(\"caf\xe9\")\n# comment padding\n";
631        let scanned = scan_file_at_relative_path("script.py", latin1_python, &options);
632
633        assert_eq!(scanned.programming_language.as_deref(), Some("Python"));
634        assert_eq!(
635            scanned.file_type_label.as_deref(),
636            Some("python script, text executable")
637        );
638        assert_eq!(scanned.is_binary, Some(false));
639        assert_eq!(scanned.is_text, Some(true));
640        assert_eq!(scanned.is_script, Some(true));
641        assert_eq!(scanned.is_source, Some(true));
642    }
643
644    #[test]
645    fn scanner_skips_findings_for_zip_like_archives() {
646        let options = TextDetectionOptions {
647            collect_info: true,
648            detect_packages: false,
649            detect_application_packages: false,
650            detect_system_packages: false,
651            detect_packages_in_compiled: false,
652            detect_copyrights: true,
653            detect_generated: false,
654            detect_emails: true,
655            detect_urls: true,
656            max_emails: 50,
657            max_urls: 50,
658            timeout_seconds: 120.0,
659        };
660        let archive_like = b"PK\x03\x04\x14\x00\x00\x00\x08\x00MIT License\ncontact@example.com\nhttps://example.com\n";
661        let scanned = scan_file_at_relative_path("demo.whl", archive_like, &options);
662
663        assert_eq!(scanned.mime_type.as_deref(), Some("application/zip"));
664        assert_eq!(scanned.is_archive, Some(true));
665        assert!(scanned.license_detections.is_empty());
666        assert!(scanned.copyrights.is_empty());
667        assert!(scanned.emails.is_empty());
668        assert!(scanned.urls.is_empty());
669    }
670
671    #[test]
672    fn scanner_treats_typescript_sources_as_text_not_video_media() {
673        let options = TextDetectionOptions {
674            collect_info: true,
675            detect_packages: false,
676            detect_application_packages: false,
677            detect_system_packages: false,
678            detect_packages_in_compiled: false,
679            detect_copyrights: false,
680            detect_generated: false,
681            detect_emails: false,
682            detect_urls: false,
683            max_emails: 50,
684            max_urls: 50,
685            timeout_seconds: 120.0,
686        };
687        let scanned = scan_single_file("main.ts", "export const answer: number = 42;\n", &options);
688
689        assert_eq!(scanned.programming_language.as_deref(), Some("TypeScript"));
690        assert_eq!(scanned.mime_type.as_deref(), Some("text/plain"));
691        assert_eq!(
692            scanned.file_type_label.as_deref(),
693            Some("UTF-8 Unicode text")
694        );
695        assert_eq!(scanned.is_text, Some(true));
696        assert_eq!(scanned.is_media, Some(false));
697        assert_eq!(scanned.is_script, Some(false));
698        assert_eq!(scanned.is_source, Some(true));
699    }
700
701    #[test]
702    fn scanner_normalizes_sparse_ts_files_away_from_video_mime() {
703        let options = TextDetectionOptions {
704            collect_info: true,
705            detect_packages: false,
706            detect_application_packages: false,
707            detect_system_packages: false,
708            detect_packages_in_compiled: false,
709            detect_copyrights: false,
710            detect_generated: false,
711            detect_emails: false,
712            detect_urls: false,
713            max_emails: 50,
714            max_urls: 50,
715            timeout_seconds: 120.0,
716        };
717        let scanned = scan_single_file("main.ts", "// comment-only TypeScript fixture\n", &options);
718
719        assert_eq!(scanned.mime_type.as_deref(), Some("text/plain"));
720        assert_eq!(
721            scanned.file_type_label.as_deref(),
722            Some("UTF-8 Unicode text")
723        );
724        assert_eq!(scanned.is_text, Some(true));
725        assert_eq!(scanned.is_media, Some(false));
726        assert_eq!(scanned.is_script, Some(false));
727        assert_eq!(scanned.is_source, Some(true));
728    }
729
730    #[test]
731    fn scanner_treats_empty_files_like_scancode_info_surface() {
732        let options = TextDetectionOptions {
733            collect_info: true,
734            detect_packages: false,
735            detect_application_packages: false,
736            detect_system_packages: false,
737            detect_packages_in_compiled: false,
738            detect_copyrights: false,
739            detect_generated: false,
740            detect_emails: false,
741            detect_urls: false,
742            max_emails: 50,
743            max_urls: 50,
744            timeout_seconds: 120.0,
745        };
746        let scanned = scan_single_file("test.txt", "", &options);
747
748        assert_eq!(scanned.mime_type.as_deref(), Some("inode/x-empty"));
749        assert_eq!(scanned.file_type_label.as_deref(), Some("empty"));
750        assert_eq!(scanned.programming_language, None);
751        assert_eq!(scanned.is_binary, Some(false));
752        assert_eq!(scanned.is_text, Some(true));
753        assert_eq!(scanned.is_archive, Some(false));
754        assert_eq!(scanned.is_media, Some(false));
755        assert_eq!(scanned.is_source, Some(false));
756        assert_eq!(scanned.is_script, Some(false));
757    }
758
759    #[test]
760    fn scanner_treats_package_json_as_text_not_source() {
761        let options = TextDetectionOptions {
762            collect_info: true,
763            detect_packages: false,
764            detect_application_packages: false,
765            detect_system_packages: false,
766            detect_packages_in_compiled: false,
767            detect_copyrights: false,
768            detect_generated: false,
769            detect_emails: false,
770            detect_urls: false,
771            max_emails: 50,
772            max_urls: 50,
773            timeout_seconds: 120.0,
774        };
775        let scanned = scan_single_file("package.json", r#"{"name":"demo"}"#, &options);
776
777        assert_eq!(scanned.mime_type.as_deref(), Some("application/json"));
778        assert_eq!(scanned.file_type_label.as_deref(), Some("JSON text data"));
779        assert_eq!(scanned.programming_language, None);
780        assert_eq!(scanned.is_text, Some(true));
781        assert_eq!(scanned.is_source, Some(false));
782        assert_eq!(scanned.is_script, Some(false));
783    }
784
785    #[test]
786    fn scanner_classifies_gradle_and_nix_manifests_as_source() {
787        let options = TextDetectionOptions {
788            collect_info: true,
789            detect_packages: false,
790            detect_application_packages: false,
791            detect_system_packages: false,
792            detect_packages_in_compiled: false,
793            detect_copyrights: false,
794            detect_generated: false,
795            detect_emails: false,
796            detect_urls: false,
797            max_emails: 50,
798            max_urls: 50,
799            timeout_seconds: 120.0,
800        };
801
802        let gradle = scan_single_file("build.gradle", "plugins { id 'java' }\n", &options);
803        let nix = scan_single_file("flake.nix", "{ inputs, ... }: {}\n", &options);
804
805        assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
806        assert_eq!(gradle.mime_type.as_deref(), Some("text/plain"));
807        assert_eq!(gradle.is_source, Some(true));
808        assert_eq!(gradle.is_script, Some(false));
809
810        assert_eq!(nix.programming_language.as_deref(), Some("Nix"));
811        assert_eq!(nix.mime_type.as_deref(), Some("text/plain"));
812        assert_eq!(nix.is_source, Some(true));
813        assert_eq!(nix.is_script, Some(false));
814    }
815
816    #[test]
817    fn scanner_treats_gitmodules_as_text_not_source() {
818        let options = TextDetectionOptions {
819            collect_info: true,
820            detect_packages: false,
821            detect_application_packages: false,
822            detect_system_packages: false,
823            detect_packages_in_compiled: false,
824            detect_copyrights: false,
825            detect_generated: false,
826            detect_emails: false,
827            detect_urls: false,
828            max_emails: 50,
829            max_urls: 50,
830            timeout_seconds: 120.0,
831        };
832        let scanned = scan_file_at_relative_path(
833            ".gitmodules",
834            b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
835            &options,
836        );
837
838        assert_eq!(scanned.programming_language, None);
839        assert_eq!(
840            scanned.file_type_label.as_deref(),
841            Some("Git configuration text")
842        );
843        assert_eq!(scanned.is_text, Some(true));
844        assert_eq!(scanned.is_source, Some(false));
845        assert_eq!(scanned.is_script, Some(false));
846    }
847
848    #[test]
849    fn scanner_treats_javascript_shebang_files_as_scripts() {
850        let options = TextDetectionOptions {
851            collect_info: true,
852            detect_packages: false,
853            detect_application_packages: false,
854            detect_system_packages: false,
855            detect_packages_in_compiled: false,
856            detect_copyrights: false,
857            detect_generated: false,
858            detect_emails: false,
859            detect_urls: false,
860            max_emails: 50,
861            max_urls: 50,
862            timeout_seconds: 120.0,
863        };
864        let scanned = scan_file_at_relative_path(
865            "bin/run",
866            b"#!/usr/bin/env node\nconsole.log('hello');\n",
867            &options,
868        );
869
870        assert_eq!(scanned.programming_language.as_deref(), Some("JavaScript"));
871        assert_eq!(
872            scanned.file_type_label.as_deref(),
873            Some("javascript script, UTF-8 Unicode text executable")
874        );
875        assert_eq!(scanned.is_script, Some(true));
876        assert_eq!(scanned.is_source, Some(true));
877    }
878
879    #[test]
880    fn scanner_treats_dockerfile_as_source() {
881        let options = TextDetectionOptions {
882            collect_info: true,
883            detect_packages: false,
884            detect_application_packages: false,
885            detect_system_packages: false,
886            detect_packages_in_compiled: false,
887            detect_copyrights: false,
888            detect_generated: false,
889            detect_emails: false,
890            detect_urls: false,
891            max_emails: 50,
892            max_urls: 50,
893            timeout_seconds: 120.0,
894        };
895        let scanned = scan_single_file("Dockerfile", "FROM scratch\n", &options);
896
897        assert_eq!(scanned.programming_language.as_deref(), Some("Dockerfile"));
898        assert_eq!(
899            scanned.file_type_label.as_deref(),
900            Some("UTF-8 Unicode text")
901        );
902        assert_eq!(scanned.is_source, Some(true));
903        assert_eq!(scanned.is_script, Some(false));
904    }
905
906    #[test]
907    fn scanner_treats_makefile_as_text_not_source() {
908        let options = TextDetectionOptions {
909            collect_info: true,
910            detect_packages: false,
911            detect_application_packages: false,
912            detect_system_packages: false,
913            detect_packages_in_compiled: false,
914            detect_copyrights: false,
915            detect_generated: false,
916            detect_emails: false,
917            detect_urls: false,
918            max_emails: 50,
919            max_urls: 50,
920            timeout_seconds: 120.0,
921        };
922        let scanned = scan_single_file("Makefile", "all:\n\techo hi\n", &options);
923
924        assert_eq!(scanned.programming_language, None);
925        assert_eq!(
926            scanned.file_type_label.as_deref(),
927            Some("UTF-8 Unicode text")
928        );
929        assert_eq!(scanned.is_text, Some(true));
930        assert_eq!(scanned.is_source, Some(false));
931        assert_eq!(scanned.is_script, Some(false));
932    }
933
934    #[test]
935    fn scanner_omits_info_surface_when_disabled() {
936        let options = TextDetectionOptions {
937            collect_info: false,
938            detect_packages: false,
939            detect_application_packages: false,
940            detect_system_packages: false,
941            detect_packages_in_compiled: false,
942            detect_copyrights: false,
943            detect_generated: false,
944            detect_emails: false,
945            detect_urls: false,
946            max_emails: 50,
947            max_urls: 50,
948            timeout_seconds: 120.0,
949        };
950        let scanned = scan_single_file(
951            "script.py",
952            "#!/usr/bin/env python3\nprint(\"hello\")\n",
953            &options,
954        );
955
956        assert!(scanned.sha1.is_none());
957        assert!(scanned.md5.is_none());
958        assert!(scanned.sha256.is_none());
959        assert!(scanned.sha1_git.is_none());
960        assert!(scanned.mime_type.is_none());
961        assert!(scanned.date.is_none());
962        assert!(scanned.programming_language.is_none());
963        assert!(scanned.is_binary.is_none());
964        assert!(scanned.is_text.is_none());
965        assert!(scanned.is_archive.is_none());
966        assert!(scanned.is_media.is_none());
967        assert!(scanned.is_script.is_none());
968        assert!(scanned.is_source.is_none());
969    }
970
971    #[test]
972    fn scanner_skips_package_parsing_when_disabled() {
973        let options = TextDetectionOptions {
974            collect_info: false,
975            detect_packages: false,
976            detect_application_packages: false,
977            detect_system_packages: false,
978            detect_packages_in_compiled: false,
979            detect_copyrights: false,
980            detect_generated: false,
981            detect_emails: false,
982            detect_urls: false,
983            max_emails: 50,
984            max_urls: 50,
985            timeout_seconds: 120.0,
986        };
987        let scanned = scan_single_file(
988            "package.json",
989            r#"{"name":"demo","version":"1.0.0"}"#,
990            &options,
991        );
992
993        assert!(
994            scanned.package_data.is_empty(),
995            "package_data: {:#?}",
996            scanned.package_data
997        );
998    }
999
1000    #[test]
1001    fn scanner_parses_package_manifests_when_enabled() {
1002        let options = TextDetectionOptions {
1003            collect_info: false,
1004            detect_packages: true,
1005            detect_application_packages: true,
1006            detect_system_packages: false,
1007            detect_packages_in_compiled: false,
1008            detect_copyrights: false,
1009            detect_generated: false,
1010            detect_emails: false,
1011            detect_urls: false,
1012            max_emails: 50,
1013            max_urls: 50,
1014            timeout_seconds: 120.0,
1015        };
1016        let scanned = scan_single_file(
1017            "package.json",
1018            r#"{"name":"demo","version":"1.0.0"}"#,
1019            &options,
1020        );
1021
1022        assert_eq!(
1023            scanned.package_data.len(),
1024            1,
1025            "package_data: {:#?}",
1026            scanned.package_data
1027        );
1028    }
1029
1030    #[test]
1031    fn scanner_skips_application_packages_when_only_system_packages_enabled() {
1032        let options = TextDetectionOptions {
1033            collect_info: false,
1034            detect_packages: true,
1035            detect_application_packages: false,
1036            detect_system_packages: true,
1037            detect_packages_in_compiled: false,
1038            detect_copyrights: false,
1039            detect_generated: false,
1040            detect_emails: false,
1041            detect_urls: false,
1042            max_emails: 50,
1043            max_urls: 50,
1044            timeout_seconds: 120.0,
1045        };
1046        let scanned = scan_single_file(
1047            "package.json",
1048            r#"{"name":"demo","version":"1.0.0"}"#,
1049            &options,
1050        );
1051
1052        assert!(
1053            scanned.package_data.is_empty(),
1054            "package_data: {:#?}",
1055            scanned.package_data
1056        );
1057    }
1058
1059    #[test]
1060    fn scanner_parses_system_package_files_when_enabled() {
1061        let options = TextDetectionOptions {
1062            collect_info: false,
1063            detect_packages: true,
1064            detect_application_packages: false,
1065            detect_system_packages: true,
1066            detect_packages_in_compiled: false,
1067            detect_copyrights: false,
1068            detect_generated: false,
1069            detect_emails: false,
1070            detect_urls: false,
1071            max_emails: 50,
1072            max_urls: 50,
1073            timeout_seconds: 120.0,
1074        };
1075        let scanned = scan_file_at_relative_path(
1076            "var/lib/dpkg/status",
1077            b"Package: demo\nVersion: 1.0\nArchitecture: all\nDescription: demo package\n\n",
1078            &options,
1079        );
1080
1081        assert!(
1082            !scanned.package_data.is_empty(),
1083            "package_data: {:#?}",
1084            scanned.package_data
1085        );
1086    }
1087
1088    #[test]
1089    fn scanner_only_parses_compiled_packages_when_package_in_compiled_is_enabled() {
1090        if std::process::Command::new("go")
1091            .arg("version")
1092            .status()
1093            .is_err()
1094        {
1095            return;
1096        }
1097
1098        let temp_dir = TempDir::new().expect("create temp dir");
1099        fs::write(
1100            temp_dir.path().join("go.mod"),
1101            "module example.com/demo\n\ngo 1.23.0\n",
1102        )
1103        .expect("write go.mod");
1104        fs::write(
1105            temp_dir.path().join("main.go"),
1106            "package main\nfunc main() {}\n",
1107        )
1108        .expect("write main.go");
1109        let file_path = temp_dir.path().join("demo");
1110        let status = std::process::Command::new("go")
1111            .current_dir(temp_dir.path())
1112            .args(["build", "-o"])
1113            .arg(&file_path)
1114            .status()
1115            .expect("run go build");
1116        assert!(status.success());
1117
1118        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1119        let collected = collect_paths(temp_dir.path(), 0, &[]);
1120
1121        let without_compiled = process_collected(
1122            &collected,
1123            Arc::clone(&progress),
1124            None,
1125            LicenseScanOptions::default(),
1126            &TextDetectionOptions {
1127                collect_info: false,
1128                detect_packages: true,
1129                detect_application_packages: true,
1130                detect_system_packages: false,
1131                detect_packages_in_compiled: false,
1132                detect_copyrights: false,
1133                detect_generated: false,
1134                detect_emails: false,
1135                detect_urls: false,
1136                max_emails: 50,
1137                max_urls: 50,
1138                timeout_seconds: 120.0,
1139            },
1140        );
1141        let with_compiled = process_collected(
1142            &collected,
1143            progress,
1144            None,
1145            LicenseScanOptions::default(),
1146            &TextDetectionOptions {
1147                collect_info: false,
1148                detect_packages: true,
1149                detect_application_packages: true,
1150                detect_system_packages: false,
1151                detect_packages_in_compiled: true,
1152                detect_copyrights: false,
1153                detect_generated: false,
1154                detect_emails: false,
1155                detect_urls: false,
1156                max_emails: 50,
1157                max_urls: 50,
1158                timeout_seconds: 120.0,
1159            },
1160        );
1161
1162        let without_compiled = without_compiled
1163            .files
1164            .into_iter()
1165            .find(|entry| entry.file_type == FileType::File && entry.path.ends_with("/demo"))
1166            .expect("compiled artifact present");
1167        let with_compiled = with_compiled
1168            .files
1169            .into_iter()
1170            .find(|entry| entry.file_type == FileType::File && entry.path.ends_with("/demo"))
1171            .expect("compiled artifact present");
1172
1173        assert!(
1174            without_compiled.package_data.is_empty(),
1175            "package_data: {:#?}",
1176            without_compiled.package_data
1177        );
1178        assert!(!with_compiled.package_data.is_empty());
1179    }
1180
1181    #[test]
1182    fn scanner_parses_windows_executable_packages_under_normal_package_scan() {
1183        let temp_dir = TempDir::new().expect("create temp dir");
1184        let file_path = temp_dir.path().join("libiconv2.dll");
1185        let fixture = fs::read("testdata/compiled-binary-golden/win_pe/libiconv2.dll")
1186            .expect("read PE fixture");
1187        fs::write(&file_path, fixture).expect("write PE fixture");
1188
1189        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1190        let collected = collect_paths(temp_dir.path(), 0, &[]);
1191
1192        let without_package = process_collected(
1193            &collected,
1194            Arc::clone(&progress),
1195            None,
1196            LicenseScanOptions::default(),
1197            &TextDetectionOptions {
1198                collect_info: false,
1199                detect_packages: false,
1200                detect_application_packages: false,
1201                detect_system_packages: false,
1202                detect_packages_in_compiled: false,
1203                detect_copyrights: false,
1204                detect_generated: false,
1205                detect_emails: false,
1206                detect_urls: false,
1207                max_emails: 50,
1208                max_urls: 50,
1209                timeout_seconds: 120.0,
1210            },
1211        );
1212        let with_package = process_collected(
1213            &collected,
1214            progress,
1215            None,
1216            LicenseScanOptions::default(),
1217            &TextDetectionOptions {
1218                collect_info: false,
1219                detect_packages: true,
1220                detect_application_packages: true,
1221                detect_system_packages: false,
1222                detect_packages_in_compiled: false,
1223                detect_copyrights: false,
1224                detect_generated: false,
1225                detect_emails: false,
1226                detect_urls: false,
1227                max_emails: 50,
1228                max_urls: 50,
1229                timeout_seconds: 120.0,
1230            },
1231        );
1232
1233        let without_package = without_package
1234            .files
1235            .into_iter()
1236            .find(|entry| {
1237                entry.file_type == FileType::File && entry.path.ends_with("/libiconv2.dll")
1238            })
1239            .expect("compiled artifact present");
1240        let with_package = with_package
1241            .files
1242            .into_iter()
1243            .find(|entry| {
1244                entry.file_type == FileType::File && entry.path.ends_with("/libiconv2.dll")
1245            })
1246            .expect("compiled artifact present");
1247
1248        assert!(without_package.package_data.is_empty());
1249        assert_eq!(with_package.package_data.len(), 1);
1250        assert_eq!(
1251            with_package.package_data[0].package_type,
1252            Some(FilePackageType::Winexe)
1253        );
1254        assert_eq!(
1255            with_package.package_data[0].datasource_id,
1256            Some(DatasourceId::WindowsExecutable)
1257        );
1258    }
1259
1260    #[test]
1261    fn scanner_detects_license_from_font_metadata() {
1262        let temp_dir = TempDir::new().expect("create temp dir");
1263        let file_path = temp_dir.path().join("Lato-Bold.ttf");
1264        let fixture = fs::read("testdata/font-fixtures/Lato-Bold.ttf").expect("read font fixture");
1265        fs::write(&file_path, fixture).expect("write font fixture");
1266
1267        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1268        let collected = collect_paths(temp_dir.path(), 0, &[]);
1269        let engine =
1270            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
1271        let result = process_collected(
1272            &collected,
1273            progress,
1274            Some(engine),
1275            LicenseScanOptions::default(),
1276            &TextDetectionOptions::default(),
1277        );
1278        let scanned = result
1279            .files
1280            .into_iter()
1281            .find(|entry| {
1282                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1283            })
1284            .expect("scanned file entry");
1285
1286        assert!(
1287            scanned.license_expression.is_some(),
1288            "license detections: {:#?}",
1289            scanned.license_detections
1290        );
1291        assert!(
1292            scanned
1293                .license_expression
1294                .as_deref()
1295                .is_some_and(
1296                    |expression| expression.contains("OFL-1.1") || expression.contains("ofl-1.1")
1297                ),
1298            "license expression: {:?}",
1299            scanned.license_expression
1300        );
1301    }
1302
1303    #[test]
1304    fn scanner_detects_license_from_windows_executable_metadata() {
1305        let temp_dir = TempDir::new().expect("create temp dir");
1306        let file_path = temp_dir.path().join("libiconv2.dll");
1307        let fixture = fs::read("testdata/compiled-binary-golden/win_pe/libiconv2.dll")
1308            .expect("read PE fixture");
1309        fs::write(&file_path, fixture).expect("write PE fixture");
1310
1311        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1312        let collected = collect_paths(temp_dir.path(), 0, &[]);
1313        let engine =
1314            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
1315        let result = process_collected(
1316            &collected,
1317            progress,
1318            Some(engine),
1319            LicenseScanOptions::default(),
1320            &TextDetectionOptions::default(),
1321        );
1322        let scanned = result
1323            .files
1324            .into_iter()
1325            .find(|entry| {
1326                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1327            })
1328            .expect("scanned file entry");
1329
1330        assert!(
1331            scanned.license_expression.is_some(),
1332            "license detections: {:#?}",
1333            scanned.license_detections
1334        );
1335        assert!(
1336            scanned
1337                .license_expression
1338                .as_deref()
1339                .is_some_and(|expression| {
1340                    expression.contains("lgpl") || expression.contains("LGPL")
1341                }),
1342            "license expression: {:?}",
1343            scanned.license_expression
1344        );
1345    }
1346
1347    #[test]
1348    fn scanner_detects_cc_by_license_from_markdown_comment_banner() {
1349        let scanned = scan_single_file_with_license_engine(
1350            "navbar.md",
1351            "<!-- Documentation licensed under CC BY 4.0 -->\n<!-- License available at https://creativecommons.org/licenses/by/4.0/ -->\n",
1352            &TextDetectionOptions::default(),
1353        );
1354
1355        assert!(
1356            scanned
1357                .license_expression
1358                .as_deref()
1359                .is_some_and(|expression| {
1360                    expression.contains("cc-by-4.0") || expression.contains("CC-BY-4.0")
1361                }),
1362            "license expression: {:?}",
1363            scanned.license_expression
1364        );
1365    }
1366
1367    #[test]
1368    fn scanner_detects_mit_license_from_shields_badge_markdown() {
1369        let scanned = scan_single_file_with_license_engine(
1370            "README.md",
1371            "[![](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)\n",
1372            &TextDetectionOptions::default(),
1373        );
1374
1375        assert!(
1376            scanned
1377                .license_expression
1378                .as_deref()
1379                .is_some_and(|expression| {
1380                    expression.contains("mit") || expression.contains("MIT")
1381                }),
1382            "license expression: {:?}",
1383            scanned.license_expression
1384        );
1385    }
1386
1387    #[test]
1388    fn scanner_detects_apache_license_from_markdown_readme_phrase() {
1389        let scanned = scan_single_file_with_license_engine(
1390            "README.md",
1391            "This crate is distributed under the terms of the Apache License (Version 2.0).\n",
1392            &TextDetectionOptions::default(),
1393        );
1394
1395        assert!(
1396            scanned
1397                .license_expression
1398                .as_deref()
1399                .is_some_and(|expression| {
1400                    expression.contains("apache-2.0") || expression.contains("Apache-2.0")
1401                }),
1402            "license expression: {:?}",
1403            scanned.license_expression
1404        );
1405    }
1406
1407    #[test]
1408    fn scanner_sets_is_source_only_when_info_enabled() {
1409        let without_info = TextDetectionOptions {
1410            collect_info: false,
1411            detect_packages: false,
1412            detect_application_packages: false,
1413            detect_system_packages: false,
1414            detect_packages_in_compiled: false,
1415            detect_copyrights: false,
1416            detect_generated: false,
1417            detect_emails: false,
1418            detect_urls: false,
1419            max_emails: 50,
1420            max_urls: 50,
1421            timeout_seconds: 120.0,
1422        };
1423        let with_info = TextDetectionOptions {
1424            collect_info: true,
1425            ..without_info.clone()
1426        };
1427
1428        let scanned_without_info = scan_single_file("main.rs", "fn main() {}\n", &without_info);
1429        let scanned_with_info = scan_single_file("main.rs", "fn main() {}\n", &with_info);
1430
1431        assert_eq!(scanned_without_info.is_source, None);
1432        assert_eq!(scanned_with_info.is_source, Some(true));
1433    }
1434
1435    #[test]
1436    fn directory_omits_info_fields_when_info_disabled() {
1437        let temp_dir = TempDir::new().expect("create temp dir");
1438        fs::create_dir_all(temp_dir.path().join("nested")).expect("create nested dir");
1439
1440        let collected = collect_paths(temp_dir.path(), 0, &[]);
1441        let result = process_collected(
1442            &collected,
1443            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1444            None,
1445            LicenseScanOptions::default(),
1446            &TextDetectionOptions {
1447                collect_info: false,
1448                detect_packages: false,
1449                detect_application_packages: false,
1450                detect_system_packages: false,
1451                detect_packages_in_compiled: false,
1452                detect_copyrights: false,
1453                detect_generated: false,
1454                detect_emails: false,
1455                detect_urls: false,
1456                max_emails: 50,
1457                max_urls: 50,
1458                timeout_seconds: 120.0,
1459            },
1460        );
1461
1462        let directory = result
1463            .files
1464            .into_iter()
1465            .find(|entry| entry.file_type == FileType::Directory && entry.path.ends_with("nested"))
1466            .expect("directory entry");
1467
1468        assert!(directory.date.is_none());
1469        assert!(directory.file_type_label.is_none());
1470        assert!(directory.is_binary.is_none());
1471        assert!(directory.is_text.is_none());
1472        assert!(directory.is_archive.is_none());
1473        assert!(directory.is_media.is_none());
1474        assert!(directory.is_source.is_none());
1475        assert!(directory.is_script.is_none());
1476    }
1477
1478    #[test]
1479    fn directory_includes_info_fields_when_info_enabled() {
1480        let temp_dir = TempDir::new().expect("create temp dir");
1481        fs::create_dir_all(temp_dir.path().join("nested")).expect("create nested dir");
1482
1483        let collected = collect_paths(temp_dir.path(), 0, &[]);
1484        let result = process_collected(
1485            &collected,
1486            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1487            None,
1488            LicenseScanOptions::default(),
1489            &TextDetectionOptions {
1490                collect_info: true,
1491                detect_packages: false,
1492                detect_application_packages: false,
1493                detect_system_packages: false,
1494                detect_packages_in_compiled: false,
1495                detect_copyrights: false,
1496                detect_generated: false,
1497                detect_emails: false,
1498                detect_urls: false,
1499                max_emails: 50,
1500                max_urls: 50,
1501                timeout_seconds: 120.0,
1502            },
1503        );
1504
1505        let directory = result
1506            .files
1507            .into_iter()
1508            .find(|entry| entry.file_type == FileType::Directory && entry.path.ends_with("nested"))
1509            .expect("directory entry");
1510
1511        assert!(directory.date.is_none());
1512        assert!(directory.file_type_label.is_none());
1513        assert_eq!(directory.is_binary, Some(false));
1514        assert_eq!(directory.is_text, Some(false));
1515        assert_eq!(directory.is_archive, Some(false));
1516        assert_eq!(directory.is_media, Some(false));
1517        assert_eq!(directory.is_source, Some(false));
1518        assert_eq!(directory.is_script, Some(false));
1519        assert_eq!(directory.files_count, Some(0));
1520        assert_eq!(directory.dirs_count, Some(0));
1521        assert_eq!(directory.size_count, Some(0));
1522    }
1523
1524    #[test]
1525    fn collect_paths_includes_root_directory_entry() {
1526        let temp_dir = TempDir::new().expect("create temp dir");
1527        fs::create_dir_all(temp_dir.path().join("src")).expect("create nested dir");
1528        fs::write(temp_dir.path().join("src").join("main.rs"), "fn main() {}")
1529            .expect("write nested file");
1530
1531        let collected = collect_paths(temp_dir.path(), 0, &[]);
1532
1533        assert!(
1534            collected
1535                .directories
1536                .iter()
1537                .any(|(path, _)| path == temp_dir.path())
1538        );
1539    }
1540
1541    #[test]
1542    fn collect_paths_supports_single_file_input() {
1543        let temp_dir = TempDir::new().expect("create temp dir");
1544        let file_path = temp_dir.path().join("main.rs");
1545        fs::write(&file_path, "fn main() {}\n").expect("write file");
1546
1547        let collected = collect_paths(&file_path, 0, &[]);
1548
1549        assert_eq!(collected.files.len(), 1);
1550        assert!(collected.directories.is_empty());
1551        assert_eq!(collected.files[0].0, file_path);
1552    }
1553
1554    #[test]
1555    fn process_collected_with_memory_limit_preserves_results_when_spilling() {
1556        let temp_dir = TempDir::new().expect("create temp dir");
1557        fs::write(temp_dir.path().join("a.txt"), "hello").expect("write first file");
1558        fs::write(temp_dir.path().join("b.txt"), "world").expect("write second file");
1559
1560        let collected = collect_paths(temp_dir.path(), 0, &[]);
1561        let result = process_collected_with_memory_limit(
1562            &collected,
1563            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1564            None,
1565            LicenseScanOptions::default(),
1566            &TextDetectionOptions {
1567                collect_info: false,
1568                detect_packages: false,
1569                detect_application_packages: false,
1570                detect_system_packages: false,
1571                detect_packages_in_compiled: false,
1572                detect_copyrights: false,
1573                detect_generated: false,
1574                detect_emails: false,
1575                detect_urls: false,
1576                max_emails: 50,
1577                max_urls: 50,
1578                timeout_seconds: 120.0,
1579            },
1580            MemoryMode::Limit(1),
1581        );
1582
1583        assert_eq!(result.files.len(), 3);
1584    }
1585
1586    #[test]
1587    fn process_collected_with_negative_one_uses_disk_only_mode() {
1588        let temp_dir = TempDir::new().expect("create temp dir");
1589        fs::write(temp_dir.path().join("a.txt"), "hello").expect("write first file");
1590
1591        let collected = collect_paths(temp_dir.path(), 0, &[]);
1592        let result = process_collected_with_memory_limit(
1593            &collected,
1594            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1595            None,
1596            LicenseScanOptions::default(),
1597            &TextDetectionOptions {
1598                collect_info: false,
1599                detect_packages: false,
1600                detect_application_packages: false,
1601                detect_system_packages: false,
1602                detect_packages_in_compiled: false,
1603                detect_copyrights: false,
1604                detect_generated: false,
1605                detect_emails: false,
1606                detect_urls: false,
1607                max_emails: 50,
1608                max_urls: 50,
1609                timeout_seconds: 120.0,
1610            },
1611            MemoryMode::StreamUnlimited,
1612        );
1613
1614        assert_eq!(result.files.len(), 2);
1615    }
1616}