Skip to main content

provenant/scanner/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4mod collect;
5pub(crate) mod process;
6
7use crate::license_detection::LicenseDetectionEngine;
8use crate::models::FileInfo;
9
10pub struct ProcessResult {
11    pub files: Vec<FileInfo>,
12    pub excluded_count: usize,
13}
14
/// Options that shape license scanning output.
///
/// Every field is folded into [`scan_options_fingerprint`], so changing any of
/// them yields a different fingerprint. The type is `Copy` and is passed by
/// value. `PartialEq`/`Eq` are derived so two option sets can be compared
/// directly instead of going through their fingerprints.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct LicenseScanOptions {
    /// Recorded in the fingerprint as `license_text`.
    pub include_text: bool,
    /// Recorded in the fingerprint as `license_text_diagnostics`.
    pub include_text_diagnostics: bool,
    /// Recorded in the fingerprint as `license_diagnostics`.
    pub include_diagnostics: bool,
    /// Recorded in the fingerprint as `unknown_licenses`.
    pub unknown_licenses: bool,
    /// Recorded in the fingerprint as `license_score`.
    /// NOTE(review): presumably a 0-100 minimum match score — confirm.
    pub min_score: u8,
}
23
/// Switches and limits for the non-license detectors.
///
/// Every field is folded into [`scan_options_fingerprint`]. `PartialEq` is
/// derived so option sets can be compared directly (`Eq` is not possible
/// because `timeout_seconds` is an `f64`).
#[derive(Debug, Clone, PartialEq)]
pub struct TextDetectionOptions {
    pub collect_info: bool,
    pub detect_packages: bool,
    pub detect_application_packages: bool,
    pub detect_system_packages: bool,
    pub detect_packages_in_compiled: bool,
    pub detect_copyrights: bool,
    pub detect_generated: bool,
    pub detect_emails: bool,
    pub detect_urls: bool,
    /// NOTE(review): presumably a cap on reported emails — confirm.
    pub max_emails: usize,
    /// NOTE(review): presumably a cap on reported URLs — confirm.
    pub max_urls: usize,
    pub timeout_seconds: f64,
}

impl Default for TextDetectionOptions {
    /// Returns the baseline configuration: only copyright detection is
    /// switched on, both finding limits are 50 and the timeout is 120 seconds.
    fn default() -> Self {
        // Shared default for `max_emails` and `max_urls`.
        const DEFAULT_MAX_FINDINGS: usize = 50;
        const DEFAULT_TIMEOUT_SECONDS: f64 = 120.0;

        Self {
            collect_info: false,
            detect_packages: false,
            detect_application_packages: false,
            detect_system_packages: false,
            detect_packages_in_compiled: false,
            detect_copyrights: true,
            detect_generated: false,
            detect_emails: false,
            detect_urls: false,
            max_emails: DEFAULT_MAX_FINDINGS,
            max_urls: DEFAULT_MAX_FINDINGS,
            timeout_seconds: DEFAULT_TIMEOUT_SECONDS,
        }
    }
}
58
59pub fn scan_options_fingerprint(
60    text_options: &TextDetectionOptions,
61    license_options: LicenseScanOptions,
62    license_engine: Option<&LicenseDetectionEngine>,
63) -> String {
64    let (license_enabled, rules_count, first_rule_id, last_rule_id) = match license_engine {
65        Some(engine) => {
66            let rules = &engine.index().rules_by_rid;
67            (
68                true,
69                rules.len(),
70                rules
71                    .first()
72                    .map(|rule| rule.identifier.as_str())
73                    .unwrap_or(""),
74                rules
75                    .last()
76                    .map(|rule| rule.identifier.as_str())
77                    .unwrap_or(""),
78            )
79        }
80        None => (false, 0, "", ""),
81    };
82
83    format!(
84        "tool_version={};info={};packages={};app_packages={};system_packages={};compiled_packages={};copyrights={};generated={};emails={};urls={};max_emails={};max_urls={};timeout={:.6};license_enabled={};rules_count={};first_rule_id={};last_rule_id={};license_text={};license_text_diagnostics={};license_diagnostics={};unknown_licenses={};license_score={}",
85        crate::version::BUILD_VERSION,
86        text_options.collect_info,
87        text_options.detect_packages,
88        text_options.detect_application_packages,
89        text_options.detect_system_packages,
90        text_options.detect_packages_in_compiled,
91        text_options.detect_copyrights,
92        text_options.detect_generated,
93        text_options.detect_emails,
94        text_options.detect_urls,
95        text_options.max_emails,
96        text_options.max_urls,
97        text_options.timeout_seconds,
98        license_enabled,
99        rules_count,
100        first_rule_id,
101        last_rule_id,
102        license_options.include_text,
103        license_options.include_text_diagnostics,
104        license_options.include_diagnostics,
105        license_options.unknown_licenses,
106        license_options.min_score,
107    )
108}
109
110pub use self::collect::{
111    CollectedPaths, CollectionFrontier, collect_paths, collect_selected_paths,
112};
113#[allow(unused_imports)]
114pub use self::process::{
115    MemoryMode, process_collected, process_collected_sequential,
116    process_collected_with_memory_limit, process_collected_with_memory_limit_sequential,
117};
118
119#[cfg(test)]
120mod tests {
121    use std::fs;
122    use std::path::PathBuf;
123    use std::sync::Arc;
124
125    use object::pe;
126    use tempfile::TempDir;
127
128    use crate::cache::build_collection_exclude_patterns;
129    use crate::license_detection::LicenseDetectionEngine;
130    use crate::models::{DatasourceId, FileType, PackageType as FilePackageType};
131    use crate::progress::{ProgressMode, ScanProgress};
132
133    use super::{
134        CollectionFrontier, LicenseScanOptions, MemoryMode, TextDetectionOptions, collect_paths,
135        collect_selected_paths, process_collected, process_collected_with_memory_limit,
136        scan_options_fingerprint,
137    };
138
139    fn build_sparse_oversized_rpm_with_filename(
140        temp_dir: &TempDir,
141        package_name: &str,
142        filename: &str,
143    ) -> PathBuf {
144        let file_path = temp_dir.path().join(filename);
145        rpm::PackageBuilder::new(package_name, "1.0", "MIT", "x86_64", "Demo RPM package")
146            .release("1")
147            .build()
148            .expect("build rpm fixture")
149            .write_file(&file_path)
150            .expect("write rpm fixture");
151        fs::OpenOptions::new()
152            .write(true)
153            .open(&file_path)
154            .expect("open rpm fixture for sparse extension")
155            .set_len(100 * 1024 * 1024 + 1_048_576)
156            .expect("extend rpm fixture");
157        file_path
158    }
159
160    fn build_sparse_oversized_rpm(temp_dir: &TempDir, name: &str) -> PathBuf {
161        build_sparse_oversized_rpm_with_filename(
162            temp_dir,
163            name,
164            &format!("{name}-1.0-1.x86_64.rpm"),
165        )
166    }
167
168    fn build_sparse_oversized_pack_rpm(temp_dir: &TempDir, name: &str) -> PathBuf {
169        build_sparse_oversized_rpm_with_filename(
170            temp_dir,
171            name,
172            &format!("{name}-1.0-1.x86_64.pack"),
173        )
174    }
175
176    #[test]
177    fn default_options_keep_copyright_detection_enabled() {
178        let options = TextDetectionOptions::default();
179        assert!(!options.detect_packages);
180        assert!(options.detect_copyrights);
181    }
182
183    #[test]
184    fn test_scan_options_fingerprint_changes_with_license_score() {
185        let text_options = TextDetectionOptions::default();
186        let default_fingerprint = scan_options_fingerprint(
187            &text_options,
188            LicenseScanOptions {
189                min_score: 0,
190                ..LicenseScanOptions::default()
191            },
192            None,
193        );
194        let filtered_fingerprint = scan_options_fingerprint(
195            &text_options,
196            LicenseScanOptions {
197                min_score: 70,
198                ..LicenseScanOptions::default()
199            },
200            None,
201        );
202
203        assert_ne!(default_fingerprint, filtered_fingerprint);
204    }
205
206    fn scan_single_file(
207        file_name: &str,
208        content: &str,
209        options: &TextDetectionOptions,
210    ) -> crate::models::FileInfo {
211        let temp_dir = TempDir::new().expect("create temp dir");
212        let file_path = temp_dir.path().join(file_name);
213        fs::write(&file_path, content).expect("write test file");
214
215        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
216        let collected = collect_paths(temp_dir.path(), 0, &[]);
217        let result = process_collected(
218            &collected,
219            progress,
220            None,
221            LicenseScanOptions::default(),
222            options,
223        );
224
225        result
226            .files
227            .into_iter()
228            .find(|entry| {
229                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
230            })
231            .expect("scanned file entry")
232    }
233
234    fn scan_file_at_relative_path(
235        relative_path: &str,
236        content: &[u8],
237        options: &TextDetectionOptions,
238    ) -> crate::models::FileInfo {
239        let temp_dir = TempDir::new().expect("create temp dir");
240        let file_path = temp_dir.path().join(relative_path);
241        if let Some(parent) = file_path.parent() {
242            fs::create_dir_all(parent).expect("create parent dirs");
243        }
244        fs::write(&file_path, content).expect("write test file");
245
246        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
247        let collected = collect_paths(temp_dir.path(), 0, &[]);
248        let result = process_collected(
249            &collected,
250            progress,
251            None,
252            LicenseScanOptions::default(),
253            options,
254        );
255
256        result
257            .files
258            .into_iter()
259            .find(|entry| {
260                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
261            })
262            .expect("scanned file entry")
263    }
264
265    fn scan_single_file_with_license_engine(
266        file_name: &str,
267        content: &str,
268        options: &TextDetectionOptions,
269    ) -> crate::models::FileInfo {
270        let temp_dir = TempDir::new().expect("create temp dir");
271        let file_path = temp_dir.path().join(file_name);
272        fs::write(&file_path, content).expect("write test file");
273
274        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
275        let collected = collect_paths(temp_dir.path(), 0, &[]);
276        let engine =
277            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
278        let result = process_collected(
279            &collected,
280            progress,
281            Some(engine),
282            LicenseScanOptions::default(),
283            options,
284        );
285
286        result
287            .files
288            .into_iter()
289            .find(|entry| {
290                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
291            })
292            .expect("scanned file entry")
293    }
294
295    #[test]
296    fn scanner_reports_repeated_email_occurrences() {
297        let options = TextDetectionOptions {
298            collect_info: false,
299            detect_packages: false,
300            detect_application_packages: false,
301            detect_system_packages: false,
302            detect_packages_in_compiled: false,
303            detect_copyrights: false,
304            detect_generated: false,
305            detect_emails: true,
306            detect_urls: false,
307            max_emails: 50,
308            max_urls: 50,
309            timeout_seconds: 120.0,
310        };
311        let scanned = scan_single_file(
312            "contacts.txt",
313            "linux@3ware.com\nlinux@3ware.com\nandre@suse.com\nlinux@3ware.com\n",
314            &options,
315        );
316
317        let emails: Vec<(&str, usize)> = scanned
318            .emails
319            .iter()
320            .map(|email| (email.email.as_str(), email.start_line.get()))
321            .collect();
322
323        assert_eq!(emails.len(), 4, "emails: {emails:#?}");
324        assert_eq!(
325            emails,
326            vec![
327                ("linux@3ware.com", 1),
328                ("linux@3ware.com", 2),
329                ("andre@suse.com", 3),
330                ("linux@3ware.com", 4),
331            ]
332        );
333    }
334
335    #[test]
336    fn scanner_skips_pem_certificate_text_detection() {
337        let options = TextDetectionOptions {
338            collect_info: false,
339            detect_packages: false,
340            detect_application_packages: false,
341            detect_system_packages: false,
342            detect_packages_in_compiled: false,
343            detect_copyrights: true,
344            detect_generated: false,
345            detect_emails: true,
346            detect_urls: true,
347            max_emails: 50,
348            max_urls: 50,
349            timeout_seconds: 120.0,
350        };
351        let pem_fixture = concat!(
352            "-----BEGIN CERTIFICATE-----\n",
353            "MIID8TCCAtmgAwIBAgIQQT1yx/RrH4FDffHSKFTfmjANBgkqhkiG9w0BAQUFADCB\n",
354            "ijELMAkGA1UEBhMCQ0gxEDAOBgNVBAoTB1dJU2VLZXkxGzAZBgNVBAsTEkNvcHly\n",
355            "-----END CERTIFICATE-----\n",
356            "Certificate:\n",
357            "    Data:\n",
358            "        Signature Algorithm: sha1WithRSAEncryption\n",
359            "        Issuer: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
360            "        Subject: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
361            "        Contact: cert-owner@example.com\n",
362        );
363        let scanned = scan_single_file("cert.pem", pem_fixture, &options);
364
365        assert!(
366            scanned.copyrights.is_empty(),
367            "copyrights: {:#?}",
368            scanned.copyrights
369        );
370        assert!(
371            scanned.holders.is_empty(),
372            "holders: {:#?}",
373            scanned.holders
374        );
375        assert!(
376            scanned.authors.is_empty(),
377            "authors: {:#?}",
378            scanned.authors
379        );
380        assert!(scanned.emails.is_empty(), "emails: {:#?}", scanned.emails);
381        assert!(scanned.urls.is_empty(), "urls: {:#?}", scanned.urls);
382        assert!(
383            scanned.license_detections.is_empty(),
384            "licenses: {:#?}",
385            scanned.license_detections
386        );
387        assert!(
388            scanned.license_clues.is_empty(),
389            "license clues: {:#?}",
390            scanned.license_clues
391        );
392    }
393
394    #[test]
395    fn scanner_keeps_source_headers_when_pem_blocks_are_embedded() {
396        let options = TextDetectionOptions {
397            collect_info: false,
398            detect_packages: false,
399            detect_application_packages: false,
400            detect_system_packages: false,
401            detect_packages_in_compiled: false,
402            detect_copyrights: true,
403            detect_generated: false,
404            detect_emails: false,
405            detect_urls: true,
406            max_emails: 50,
407            max_urls: 50,
408            timeout_seconds: 120.0,
409        };
410        let fixture = concat!(
411            "/*\n",
412            "Copyright 2022 The Kubernetes Authors.\n\n",
413            "Licensed under the Apache License, Version 2.0 (the \"License\");\n",
414            "you may not use this file except in compliance with the License.\n",
415            "You may obtain a copy of the License at\n\n",
416            "    http://www.apache.org/licenses/LICENSE-2.0\n",
417            "*/\n\n",
418            "package storage\n\n",
419            "const validCert = `\n",
420            "-----BEGIN CERTIFICATE-----\n",
421            "MIIDmTCCAoGgAwIBAgIUWQ==\n",
422            "-----END CERTIFICATE-----\n",
423            "`\n",
424        );
425        let temp_dir = TempDir::new().expect("create temp dir");
426        let file_path = temp_dir.path().join("storage_test.go");
427        fs::write(&file_path, fixture).expect("write fixture");
428
429        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
430        let collected = collect_paths(temp_dir.path(), 0, &[]);
431        let engine =
432            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
433        let result = process_collected(
434            &collected,
435            progress,
436            Some(engine),
437            LicenseScanOptions::default(),
438            &options,
439        );
440        let scanned = result
441            .files
442            .into_iter()
443            .find(|entry| {
444                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
445            })
446            .expect("scanned file entry");
447
448        assert!(
449            scanned
450                .copyrights
451                .iter()
452                .any(|c| c.copyright == "Copyright 2022 The Kubernetes Authors."),
453            "copyrights: {:#?}",
454            scanned.copyrights
455        );
456        assert!(
457            scanned
458                .holders
459                .iter()
460                .any(|h| h.holder == "The Kubernetes Authors"),
461            "holders: {:#?}",
462            scanned.holders
463        );
464        assert!(
465            scanned
466                .urls
467                .iter()
468                .any(|u| u.url == "http://www.apache.org/licenses/LICENSE-2.0"),
469            "urls: {:#?}",
470            scanned.urls
471        );
472        assert_eq!(
473            scanned.detected_license_expression.as_deref(),
474            Some("Apache-2.0")
475        );
476    }
477
478    #[test]
479    fn scanner_detects_structured_credits_authors() {
480        let options = TextDetectionOptions {
481            collect_info: false,
482            detect_packages: false,
483            detect_application_packages: false,
484            detect_system_packages: false,
485            detect_packages_in_compiled: false,
486            detect_copyrights: true,
487            detect_generated: false,
488            detect_emails: false,
489            detect_urls: false,
490            max_emails: 50,
491            max_urls: 50,
492            timeout_seconds: 120.0,
493        };
494        let credits_fixture = concat!(
495            "N: Jack Lloyd\n",
496            "E: lloyd@randombit.net\n",
497            "W: http://www.randombit.net/\n",
498        );
499        let scanned = scan_single_file("CREDITS", credits_fixture, &options);
500
501        let authors: Vec<(&str, usize, usize)> = scanned
502            .authors
503            .iter()
504            .map(|author| {
505                (
506                    author.author.as_str(),
507                    author.start_line.get(),
508                    author.end_line.get(),
509                )
510            })
511            .collect();
512
513        assert_eq!(
514            authors,
515            vec![(
516                "Jack Lloyd lloyd@randombit.net http://www.randombit.net/",
517                1,
518                3,
519            )]
520        );
521        assert!(scanned.copyrights.is_empty());
522        assert!(scanned.holders.is_empty());
523    }
524
525    #[test]
526    fn scanner_uses_or_for_alternative_license_header() {
527        let fixture =
528            include_str!("../../testdata/license-golden/datadriven/external/boost-json-d2s.ipp");
529        let temp_dir = TempDir::new().expect("create temp dir");
530        let file_path = temp_dir.path().join("d2s.ipp");
531        fs::write(&file_path, fixture).expect("write fixture");
532
533        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
534        let collected = collect_paths(temp_dir.path(), 0, &[]);
535        let engine =
536            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
537        let result = process_collected(
538            &collected,
539            progress,
540            Some(engine),
541            LicenseScanOptions::default(),
542            &TextDetectionOptions::default(),
543        );
544        let scanned = result
545            .files
546            .into_iter()
547            .find(|entry| {
548                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
549            })
550            .expect("scanned file entry");
551
552        assert_eq!(
553            scanned.detected_license_expression.as_deref(),
554            Some("Apache-2.0 OR BSL-1.0")
555        );
556        assert!(
557            scanned.license_clues.is_empty(),
558            "license clues: {:#?}",
559            scanned.license_clues
560        );
561        assert_eq!(
562            scanned.license_detections.len(),
563            1,
564            "detections: {:#?}",
565            scanned.license_detections
566        );
567
568        let detection = &scanned.license_detections[0];
569        assert_eq!(detection.license_expression_spdx, "Apache-2.0 OR BSL-1.0");
570
571        let match_expressions: Vec<_> = detection
572            .matches
573            .iter()
574            .map(|m| m.license_expression_spdx.as_str())
575            .collect();
576        assert_eq!(match_expressions, vec!["Apache-2.0", "BSL-1.0"]);
577    }
578
579    #[test]
580    fn scanner_sets_generated_flag_when_enabled() {
581        let options = TextDetectionOptions {
582            collect_info: false,
583            detect_packages: false,
584            detect_application_packages: false,
585            detect_system_packages: false,
586            detect_packages_in_compiled: false,
587            detect_copyrights: false,
588            detect_generated: true,
589            detect_emails: false,
590            detect_urls: false,
591            max_emails: 50,
592            max_urls: 50,
593            timeout_seconds: 120.0,
594        };
595        let scanned = scan_single_file(
596            "generated.c",
597            "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
598            &options,
599        );
600
601        assert_eq!(scanned.is_generated, Some(true));
602    }
603
604    #[test]
605    fn scanner_leaves_generated_flag_unset_when_disabled() {
606        let options = TextDetectionOptions {
607            collect_info: false,
608            detect_packages: false,
609            detect_application_packages: false,
610            detect_system_packages: false,
611            detect_packages_in_compiled: false,
612            detect_copyrights: false,
613            detect_generated: false,
614            detect_emails: false,
615            detect_urls: false,
616            max_emails: 50,
617            max_urls: 50,
618            timeout_seconds: 120.0,
619        };
620        let scanned = scan_single_file(
621            "generated.c",
622            "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
623            &options,
624        );
625
626        assert_eq!(scanned.is_generated, None);
627    }
628
629    #[test]
630    fn scanner_populates_info_surface_when_enabled() {
631        let options = TextDetectionOptions {
632            collect_info: true,
633            detect_packages: false,
634            detect_application_packages: false,
635            detect_system_packages: false,
636            detect_packages_in_compiled: false,
637            detect_copyrights: false,
638            detect_generated: false,
639            detect_emails: false,
640            detect_urls: false,
641            max_emails: 50,
642            max_urls: 50,
643            timeout_seconds: 120.0,
644        };
645        let scanned = scan_single_file(
646            "script.py",
647            "#!/usr/bin/env python3\nprint(\"hello\")\n",
648            &options,
649        );
650
651        assert!(scanned.sha1.is_some());
652        assert!(scanned.md5.is_some());
653        assert!(scanned.sha256.is_some());
654        assert!(scanned.sha1_git.is_some());
655        assert!(scanned.mime_type.is_some());
656        assert!(scanned.date.is_some());
657        assert_eq!(scanned.programming_language.as_deref(), Some("Python"));
658        assert_eq!(scanned.is_text, Some(true));
659        assert_eq!(scanned.is_script, Some(true));
660        assert_eq!(scanned.is_source, Some(true));
661    }
662
663    #[test]
664    fn scanner_treats_latin1_python_sources_as_textual_scripts() {
665        let options = TextDetectionOptions {
666            collect_info: true,
667            detect_packages: false,
668            detect_application_packages: false,
669            detect_system_packages: false,
670            detect_packages_in_compiled: false,
671            detect_copyrights: false,
672            detect_generated: false,
673            detect_emails: false,
674            detect_urls: false,
675            max_emails: 50,
676            max_urls: 50,
677            timeout_seconds: 120.0,
678        };
679        let latin1_python = b"# coding: latin-1\nprint(\"caf\xe9\")\n# comment padding\n";
680        let scanned = scan_file_at_relative_path("script.py", latin1_python, &options);
681
682        assert_eq!(scanned.programming_language.as_deref(), Some("Python"));
683        assert_eq!(
684            scanned.file_type_label.as_deref(),
685            Some("python script, text executable")
686        );
687        assert_eq!(scanned.is_binary, Some(false));
688        assert_eq!(scanned.is_text, Some(true));
689        assert_eq!(scanned.is_script, Some(true));
690        assert_eq!(scanned.is_source, Some(true));
691    }
692
693    #[test]
694    fn scanner_skips_findings_for_zip_like_archives() {
695        let options = TextDetectionOptions {
696            collect_info: true,
697            detect_packages: false,
698            detect_application_packages: false,
699            detect_system_packages: false,
700            detect_packages_in_compiled: false,
701            detect_copyrights: true,
702            detect_generated: false,
703            detect_emails: true,
704            detect_urls: true,
705            max_emails: 50,
706            max_urls: 50,
707            timeout_seconds: 120.0,
708        };
709        let archive_like = b"PK\x03\x04\x14\x00\x00\x00\x08\x00MIT License\ncontact@example.com\nhttps://example.com\n";
710        let scanned = scan_file_at_relative_path("demo.whl", archive_like, &options);
711
712        assert_eq!(scanned.mime_type.as_deref(), Some("application/zip"));
713        assert_eq!(scanned.is_archive, Some(true));
714        assert!(scanned.license_detections.is_empty());
715        assert!(scanned.copyrights.is_empty());
716        assert!(scanned.emails.is_empty());
717        assert!(scanned.urls.is_empty());
718    }
719
720    #[test]
721    fn scanner_treats_typescript_sources_as_text_not_video_media() {
722        let options = TextDetectionOptions {
723            collect_info: true,
724            detect_packages: false,
725            detect_application_packages: false,
726            detect_system_packages: false,
727            detect_packages_in_compiled: false,
728            detect_copyrights: false,
729            detect_generated: false,
730            detect_emails: false,
731            detect_urls: false,
732            max_emails: 50,
733            max_urls: 50,
734            timeout_seconds: 120.0,
735        };
736        let scanned = scan_single_file("main.ts", "export const answer: number = 42;\n", &options);
737
738        assert_eq!(scanned.programming_language.as_deref(), Some("TypeScript"));
739        assert_eq!(scanned.mime_type.as_deref(), Some("text/plain"));
740        assert_eq!(
741            scanned.file_type_label.as_deref(),
742            Some("TypeScript source, UTF-8 Unicode text")
743        );
744        assert_eq!(scanned.is_text, Some(true));
745        assert_eq!(scanned.is_media, Some(false));
746        assert_eq!(scanned.is_script, Some(false));
747        assert_eq!(scanned.is_source, Some(true));
748    }
749
750    #[test]
751    fn scanner_normalizes_sparse_ts_files_away_from_video_mime() {
752        let options = TextDetectionOptions {
753            collect_info: true,
754            detect_packages: false,
755            detect_application_packages: false,
756            detect_system_packages: false,
757            detect_packages_in_compiled: false,
758            detect_copyrights: false,
759            detect_generated: false,
760            detect_emails: false,
761            detect_urls: false,
762            max_emails: 50,
763            max_urls: 50,
764            timeout_seconds: 120.0,
765        };
766        let scanned = scan_single_file("main.ts", "// comment-only TypeScript fixture\n", &options);
767
768        assert_eq!(scanned.mime_type.as_deref(), Some("text/plain"));
769        assert_eq!(
770            scanned.file_type_label.as_deref(),
771            Some("TypeScript source, UTF-8 Unicode text")
772        );
773        assert_eq!(scanned.is_text, Some(true));
774        assert_eq!(scanned.is_media, Some(false));
775        assert_eq!(scanned.is_script, Some(false));
776        assert_eq!(scanned.is_source, Some(true));
777    }
778
779    #[test]
780    fn scanner_treats_empty_files_like_scancode_info_surface() {
781        let options = TextDetectionOptions {
782            collect_info: true,
783            detect_packages: false,
784            detect_application_packages: false,
785            detect_system_packages: false,
786            detect_packages_in_compiled: false,
787            detect_copyrights: false,
788            detect_generated: false,
789            detect_emails: false,
790            detect_urls: false,
791            max_emails: 50,
792            max_urls: 50,
793            timeout_seconds: 120.0,
794        };
795        let scanned = scan_single_file("test.txt", "", &options);
796
797        assert_eq!(scanned.mime_type.as_deref(), Some("inode/x-empty"));
798        assert_eq!(scanned.file_type_label.as_deref(), Some("empty"));
799        assert_eq!(scanned.programming_language, None);
800        assert_eq!(scanned.is_binary, Some(false));
801        assert_eq!(scanned.is_text, Some(true));
802        assert_eq!(scanned.is_archive, Some(false));
803        assert_eq!(scanned.is_media, Some(false));
804        assert_eq!(scanned.is_source, Some(false));
805        assert_eq!(scanned.is_script, Some(false));
806    }
807
808    #[test]
809    fn scanner_treats_package_json_as_text_not_source() {
810        let options = TextDetectionOptions {
811            collect_info: true,
812            detect_packages: false,
813            detect_application_packages: false,
814            detect_system_packages: false,
815            detect_packages_in_compiled: false,
816            detect_copyrights: false,
817            detect_generated: false,
818            detect_emails: false,
819            detect_urls: false,
820            max_emails: 50,
821            max_urls: 50,
822            timeout_seconds: 120.0,
823        };
824        let scanned = scan_single_file("package.json", r#"{"name":"demo"}"#, &options);
825
826        assert_eq!(scanned.mime_type.as_deref(), Some("application/json"));
827        assert_eq!(scanned.file_type_label.as_deref(), Some("JSON text data"));
828        assert_eq!(scanned.programming_language, None);
829        assert_eq!(scanned.is_text, Some(true));
830        assert_eq!(scanned.is_source, Some(false));
831        assert_eq!(scanned.is_script, Some(false));
832    }
833
834    #[test]
835    fn scanner_classifies_gradle_and_nix_manifests_as_source() {
836        let options = TextDetectionOptions {
837            collect_info: true,
838            detect_packages: false,
839            detect_application_packages: false,
840            detect_system_packages: false,
841            detect_packages_in_compiled: false,
842            detect_copyrights: false,
843            detect_generated: false,
844            detect_emails: false,
845            detect_urls: false,
846            max_emails: 50,
847            max_urls: 50,
848            timeout_seconds: 120.0,
849        };
850
851        let gradle = scan_single_file("build.gradle", "plugins { id 'java' }\n", &options);
852        let nix = scan_single_file("flake.nix", "{ inputs, ... }: {}\n", &options);
853
854        assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
855        assert_eq!(gradle.mime_type.as_deref(), Some("text/plain"));
856        assert_eq!(gradle.is_source, Some(true));
857        assert_eq!(gradle.is_script, Some(false));
858
859        assert_eq!(nix.programming_language.as_deref(), Some("Nix"));
860        assert_eq!(nix.mime_type.as_deref(), Some("text/plain"));
861        assert_eq!(nix.is_source, Some(true));
862        assert_eq!(nix.is_script, Some(false));
863    }
864
865    #[test]
866    fn scanner_treats_gitmodules_as_text_not_source() {
867        let options = TextDetectionOptions {
868            collect_info: true,
869            detect_packages: false,
870            detect_application_packages: false,
871            detect_system_packages: false,
872            detect_packages_in_compiled: false,
873            detect_copyrights: false,
874            detect_generated: false,
875            detect_emails: false,
876            detect_urls: false,
877            max_emails: 50,
878            max_urls: 50,
879            timeout_seconds: 120.0,
880        };
881        let scanned = scan_file_at_relative_path(
882            ".gitmodules",
883            b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
884            &options,
885        );
886
887        assert_eq!(scanned.programming_language, None);
888        assert_eq!(
889            scanned.file_type_label.as_deref(),
890            Some("Git configuration text")
891        );
892        assert_eq!(scanned.is_text, Some(true));
893        assert_eq!(scanned.is_source, Some(false));
894        assert_eq!(scanned.is_script, Some(false));
895    }
896
897    #[test]
898    fn scanner_treats_javascript_shebang_files_as_scripts() {
899        let options = TextDetectionOptions {
900            collect_info: true,
901            detect_packages: false,
902            detect_application_packages: false,
903            detect_system_packages: false,
904            detect_packages_in_compiled: false,
905            detect_copyrights: false,
906            detect_generated: false,
907            detect_emails: false,
908            detect_urls: false,
909            max_emails: 50,
910            max_urls: 50,
911            timeout_seconds: 120.0,
912        };
913        let scanned = scan_file_at_relative_path(
914            "bin/run",
915            b"#!/usr/bin/env node\nconsole.log('hello');\n",
916            &options,
917        );
918
919        assert_eq!(scanned.programming_language.as_deref(), Some("JavaScript"));
920        assert_eq!(
921            scanned.file_type_label.as_deref(),
922            Some("javascript script, UTF-8 Unicode text executable")
923        );
924        assert_eq!(scanned.is_script, Some(true));
925        assert_eq!(scanned.is_source, Some(true));
926    }
927
928    #[test]
929    fn scanner_treats_dockerfile_as_source() {
930        let options = TextDetectionOptions {
931            collect_info: true,
932            detect_packages: false,
933            detect_application_packages: false,
934            detect_system_packages: false,
935            detect_packages_in_compiled: false,
936            detect_copyrights: false,
937            detect_generated: false,
938            detect_emails: false,
939            detect_urls: false,
940            max_emails: 50,
941            max_urls: 50,
942            timeout_seconds: 120.0,
943        };
944        let scanned = scan_single_file("Dockerfile", "FROM scratch\n", &options);
945
946        assert_eq!(scanned.programming_language.as_deref(), Some("Dockerfile"));
947        assert_eq!(
948            scanned.file_type_label.as_deref(),
949            Some("Dockerfile source, UTF-8 Unicode text")
950        );
951        assert_eq!(scanned.is_source, Some(true));
952        assert_eq!(scanned.is_script, Some(false));
953    }
954
955    #[test]
956    fn scanner_treats_makefile_as_text_not_source() {
957        let options = TextDetectionOptions {
958            collect_info: true,
959            detect_packages: false,
960            detect_application_packages: false,
961            detect_system_packages: false,
962            detect_packages_in_compiled: false,
963            detect_copyrights: false,
964            detect_generated: false,
965            detect_emails: false,
966            detect_urls: false,
967            max_emails: 50,
968            max_urls: 50,
969            timeout_seconds: 120.0,
970        };
971        let scanned = scan_single_file("Makefile", "all:\n\techo hi\n", &options);
972
973        assert_eq!(scanned.programming_language, None);
974        assert_eq!(
975            scanned.file_type_label.as_deref(),
976            Some("UTF-8 Unicode text")
977        );
978        assert_eq!(scanned.is_text, Some(true));
979        assert_eq!(scanned.is_source, Some(false));
980        assert_eq!(scanned.is_script, Some(false));
981    }
982
983    #[test]
984    fn scanner_omits_info_surface_when_disabled() {
985        let options = TextDetectionOptions {
986            collect_info: false,
987            detect_packages: false,
988            detect_application_packages: false,
989            detect_system_packages: false,
990            detect_packages_in_compiled: false,
991            detect_copyrights: false,
992            detect_generated: false,
993            detect_emails: false,
994            detect_urls: false,
995            max_emails: 50,
996            max_urls: 50,
997            timeout_seconds: 120.0,
998        };
999        let scanned = scan_single_file(
1000            "script.py",
1001            "#!/usr/bin/env python3\nprint(\"hello\")\n",
1002            &options,
1003        );
1004
1005        assert!(scanned.sha1.is_none());
1006        assert!(scanned.md5.is_none());
1007        assert!(scanned.sha256.is_none());
1008        assert!(scanned.sha1_git.is_none());
1009        assert!(scanned.mime_type.is_none());
1010        assert!(scanned.date.is_none());
1011        assert!(scanned.programming_language.is_none());
1012        assert!(scanned.is_binary.is_none());
1013        assert!(scanned.is_text.is_none());
1014        assert!(scanned.is_archive.is_none());
1015        assert!(scanned.is_media.is_none());
1016        assert!(scanned.is_script.is_none());
1017        assert!(scanned.is_source.is_none());
1018    }
1019
1020    #[test]
1021    fn scanner_skips_package_parsing_when_disabled() {
1022        let options = TextDetectionOptions {
1023            collect_info: false,
1024            detect_packages: false,
1025            detect_application_packages: false,
1026            detect_system_packages: false,
1027            detect_packages_in_compiled: false,
1028            detect_copyrights: false,
1029            detect_generated: false,
1030            detect_emails: false,
1031            detect_urls: false,
1032            max_emails: 50,
1033            max_urls: 50,
1034            timeout_seconds: 120.0,
1035        };
1036        let scanned = scan_single_file(
1037            "package.json",
1038            r#"{"name":"demo","version":"1.0.0"}"#,
1039            &options,
1040        );
1041
1042        assert!(
1043            scanned.package_data.is_empty(),
1044            "package_data: {:#?}",
1045            scanned.package_data
1046        );
1047    }
1048
1049    #[test]
1050    fn scanner_parses_package_manifests_when_enabled() {
1051        let options = TextDetectionOptions {
1052            collect_info: false,
1053            detect_packages: true,
1054            detect_application_packages: true,
1055            detect_system_packages: false,
1056            detect_packages_in_compiled: false,
1057            detect_copyrights: false,
1058            detect_generated: false,
1059            detect_emails: false,
1060            detect_urls: false,
1061            max_emails: 50,
1062            max_urls: 50,
1063            timeout_seconds: 120.0,
1064        };
1065        let scanned = scan_single_file(
1066            "package.json",
1067            r#"{"name":"demo","version":"1.0.0"}"#,
1068            &options,
1069        );
1070
1071        assert_eq!(
1072            scanned.package_data.len(),
1073            1,
1074            "package_data: {:#?}",
1075            scanned.package_data
1076        );
1077    }
1078
1079    #[test]
1080    fn scanner_parses_oversized_rpm_in_package_only_mode_without_size_warning() {
1081        let temp_dir = TempDir::new().expect("create temp dir");
1082        let file_path = build_sparse_oversized_rpm(&temp_dir, "oversized-demo");
1083
1084        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1085        let collected = collect_paths(temp_dir.path(), 0, &[]);
1086        let result = process_collected(
1087            &collected,
1088            progress,
1089            None,
1090            LicenseScanOptions::default(),
1091            &TextDetectionOptions {
1092                collect_info: false,
1093                detect_packages: true,
1094                detect_application_packages: true,
1095                detect_system_packages: false,
1096                detect_packages_in_compiled: false,
1097                detect_copyrights: false,
1098                detect_generated: false,
1099                detect_emails: false,
1100                detect_urls: false,
1101                max_emails: 50,
1102                max_urls: 50,
1103                timeout_seconds: 120.0,
1104            },
1105        );
1106
1107        let scanned = result
1108            .files
1109            .into_iter()
1110            .find(|entry| {
1111                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1112            })
1113            .expect("scanned file entry");
1114
1115        assert!(
1116            scanned.scan_diagnostics.is_empty(),
1117            "scan_diagnostics: {:#?}",
1118            scanned.scan_diagnostics
1119        );
1120        assert_eq!(
1121            scanned.package_data.len(),
1122            1,
1123            "package_data: {:#?}",
1124            scanned.package_data
1125        );
1126        assert_eq!(
1127            scanned.package_data[0].datasource_id,
1128            Some(DatasourceId::RpmArchive)
1129        );
1130        assert_eq!(
1131            scanned.package_data[0].name.as_deref(),
1132            Some("oversized-demo")
1133        );
1134        assert_eq!(scanned.package_data[0].version.as_deref(), Some("1.0-1"));
1135    }
1136
1137    #[test]
1138    fn scanner_parses_oversized_rpm_with_info_without_timeout_or_size_warning() {
1139        let temp_dir = TempDir::new().expect("create temp dir");
1140        let file_path = build_sparse_oversized_rpm(&temp_dir, "oversized-info-demo");
1141
1142        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1143        let collected = collect_paths(temp_dir.path(), 0, &[]);
1144        let result = process_collected(
1145            &collected,
1146            progress,
1147            None,
1148            LicenseScanOptions::default(),
1149            &TextDetectionOptions {
1150                collect_info: true,
1151                detect_packages: true,
1152                detect_application_packages: true,
1153                detect_system_packages: false,
1154                detect_packages_in_compiled: false,
1155                detect_copyrights: false,
1156                detect_generated: false,
1157                detect_emails: false,
1158                detect_urls: false,
1159                max_emails: 50,
1160                max_urls: 50,
1161                timeout_seconds: 120.0,
1162            },
1163        );
1164
1165        let scanned = result
1166            .files
1167            .into_iter()
1168            .find(|entry| {
1169                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1170            })
1171            .expect("scanned file entry");
1172
1173        assert!(
1174            scanned.scan_diagnostics.is_empty(),
1175            "scan_diagnostics: {:#?}",
1176            scanned.scan_diagnostics
1177        );
1178        assert_eq!(
1179            scanned.package_data.len(),
1180            1,
1181            "package_data: {:#?}",
1182            scanned.package_data
1183        );
1184        assert_eq!(
1185            scanned.package_data[0].datasource_id,
1186            Some(DatasourceId::RpmArchive)
1187        );
1188        assert_eq!(
1189            scanned.package_data[0].name.as_deref(),
1190            Some("oversized-info-demo")
1191        );
1192        assert!(scanned.sha1.is_some());
1193        assert!(scanned.md5.is_some());
1194        assert!(scanned.sha256.is_some());
1195        assert!(scanned.sha1_git.is_some());
1196        assert_eq!(scanned.mime_type.as_deref(), Some("application/x-rpm"));
1197        assert_eq!(scanned.file_type_label.as_deref(), Some("RPM package"));
1198        assert_eq!(scanned.is_binary, Some(true));
1199        assert_eq!(scanned.is_text, Some(false));
1200        assert_eq!(scanned.is_archive, Some(true));
1201    }
1202
1203    #[test]
1204    fn scanner_parses_oversized_pack_rpm_in_package_only_mode_without_size_warning() {
1205        let temp_dir = TempDir::new().expect("create temp dir");
1206        let file_path = build_sparse_oversized_pack_rpm(&temp_dir, "oversized-pack-demo");
1207
1208        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1209        let collected = collect_paths(temp_dir.path(), 0, &[]);
1210        let result = process_collected(
1211            &collected,
1212            progress,
1213            None,
1214            LicenseScanOptions::default(),
1215            &TextDetectionOptions {
1216                collect_info: false,
1217                detect_packages: true,
1218                detect_application_packages: true,
1219                detect_system_packages: false,
1220                detect_packages_in_compiled: false,
1221                detect_copyrights: false,
1222                detect_generated: false,
1223                detect_emails: false,
1224                detect_urls: false,
1225                max_emails: 50,
1226                max_urls: 50,
1227                timeout_seconds: 120.0,
1228            },
1229        );
1230
1231        let scanned = result
1232            .files
1233            .into_iter()
1234            .find(|entry| {
1235                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1236            })
1237            .expect("scanned file entry");
1238
1239        assert!(
1240            scanned.scan_diagnostics.is_empty(),
1241            "scan_diagnostics: {:#?}",
1242            scanned.scan_diagnostics
1243        );
1244        assert_eq!(
1245            scanned.package_data.len(),
1246            1,
1247            "package_data: {:#?}",
1248            scanned.package_data
1249        );
1250        assert_eq!(
1251            scanned.package_data[0].datasource_id,
1252            Some(DatasourceId::RpmArchive)
1253        );
1254        assert_eq!(
1255            scanned.package_data[0].name.as_deref(),
1256            Some("oversized-pack-demo")
1257        );
1258    }
1259
1260    #[test]
1261    fn scanner_parses_oversized_pack_rpm_with_info_without_timeout_or_size_warning() {
1262        let temp_dir = TempDir::new().expect("create temp dir");
1263        let file_path = build_sparse_oversized_pack_rpm(&temp_dir, "oversized-pack-info-demo");
1264
1265        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1266        let collected = collect_paths(temp_dir.path(), 0, &[]);
1267        let result = process_collected(
1268            &collected,
1269            progress,
1270            None,
1271            LicenseScanOptions::default(),
1272            &TextDetectionOptions {
1273                collect_info: true,
1274                detect_packages: true,
1275                detect_application_packages: true,
1276                detect_system_packages: false,
1277                detect_packages_in_compiled: false,
1278                detect_copyrights: false,
1279                detect_generated: false,
1280                detect_emails: false,
1281                detect_urls: false,
1282                max_emails: 50,
1283                max_urls: 50,
1284                timeout_seconds: 120.0,
1285            },
1286        );
1287
1288        let scanned = result
1289            .files
1290            .into_iter()
1291            .find(|entry| {
1292                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1293            })
1294            .expect("scanned file entry");
1295
1296        assert!(
1297            scanned.scan_diagnostics.is_empty(),
1298            "scan_diagnostics: {:#?}",
1299            scanned.scan_diagnostics
1300        );
1301        assert_eq!(
1302            scanned.package_data.len(),
1303            1,
1304            "package_data: {:#?}",
1305            scanned.package_data
1306        );
1307        assert_eq!(
1308            scanned.package_data[0].datasource_id,
1309            Some(DatasourceId::RpmArchive)
1310        );
1311        assert_eq!(
1312            scanned.package_data[0].name.as_deref(),
1313            Some("oversized-pack-info-demo")
1314        );
1315        assert!(scanned.sha1.is_some());
1316        assert!(scanned.md5.is_some());
1317        assert!(scanned.sha256.is_some());
1318        assert!(scanned.sha1_git.is_some());
1319        assert_eq!(scanned.mime_type.as_deref(), Some("application/x-rpm"));
1320        assert_eq!(scanned.file_type_label.as_deref(), Some("RPM package"));
1321        assert_eq!(scanned.is_binary, Some(true));
1322        assert_eq!(scanned.is_text, Some(false));
1323        assert_eq!(scanned.is_archive, Some(true));
1324    }
1325
1326    #[test]
1327    fn scanner_skips_application_packages_when_only_system_packages_enabled() {
1328        let options = TextDetectionOptions {
1329            collect_info: false,
1330            detect_packages: true,
1331            detect_application_packages: false,
1332            detect_system_packages: true,
1333            detect_packages_in_compiled: false,
1334            detect_copyrights: false,
1335            detect_generated: false,
1336            detect_emails: false,
1337            detect_urls: false,
1338            max_emails: 50,
1339            max_urls: 50,
1340            timeout_seconds: 120.0,
1341        };
1342        let scanned = scan_single_file(
1343            "package.json",
1344            r#"{"name":"demo","version":"1.0.0"}"#,
1345            &options,
1346        );
1347
1348        assert!(
1349            scanned.package_data.is_empty(),
1350            "package_data: {:#?}",
1351            scanned.package_data
1352        );
1353    }
1354
1355    #[test]
1356    fn scanner_parses_system_package_files_when_enabled() {
1357        let options = TextDetectionOptions {
1358            collect_info: false,
1359            detect_packages: true,
1360            detect_application_packages: false,
1361            detect_system_packages: true,
1362            detect_packages_in_compiled: false,
1363            detect_copyrights: false,
1364            detect_generated: false,
1365            detect_emails: false,
1366            detect_urls: false,
1367            max_emails: 50,
1368            max_urls: 50,
1369            timeout_seconds: 120.0,
1370        };
1371        let scanned = scan_file_at_relative_path(
1372            "var/lib/dpkg/status",
1373            b"Package: demo\nVersion: 1.0\nArchitecture: all\nDescription: demo package\n\n",
1374            &options,
1375        );
1376
1377        assert!(
1378            !scanned.package_data.is_empty(),
1379            "package_data: {:#?}",
1380            scanned.package_data
1381        );
1382    }
1383
1384    #[test]
1385    fn scanner_only_parses_compiled_packages_when_package_in_compiled_is_enabled() {
1386        if std::process::Command::new("go")
1387            .arg("version")
1388            .status()
1389            .is_err()
1390        {
1391            return;
1392        }
1393
1394        let temp_dir = TempDir::new().expect("create temp dir");
1395        fs::write(
1396            temp_dir.path().join("go.mod"),
1397            "module example.com/demo\n\ngo 1.23.0\n",
1398        )
1399        .expect("write go.mod");
1400        fs::write(
1401            temp_dir.path().join("main.go"),
1402            "package main\nfunc main() {}\n",
1403        )
1404        .expect("write main.go");
1405        let file_path = temp_dir.path().join("demo");
1406        let status = std::process::Command::new("go")
1407            .current_dir(temp_dir.path())
1408            .args(["build", "-o"])
1409            .arg(&file_path)
1410            .status()
1411            .expect("run go build");
1412        assert!(status.success());
1413
1414        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1415        let collected = collect_paths(temp_dir.path(), 0, &[]);
1416
1417        let without_compiled = process_collected(
1418            &collected,
1419            Arc::clone(&progress),
1420            None,
1421            LicenseScanOptions::default(),
1422            &TextDetectionOptions {
1423                collect_info: false,
1424                detect_packages: true,
1425                detect_application_packages: true,
1426                detect_system_packages: false,
1427                detect_packages_in_compiled: false,
1428                detect_copyrights: false,
1429                detect_generated: false,
1430                detect_emails: false,
1431                detect_urls: false,
1432                max_emails: 50,
1433                max_urls: 50,
1434                timeout_seconds: 120.0,
1435            },
1436        );
1437        let with_compiled = process_collected(
1438            &collected,
1439            progress,
1440            None,
1441            LicenseScanOptions::default(),
1442            &TextDetectionOptions {
1443                collect_info: false,
1444                detect_packages: true,
1445                detect_application_packages: true,
1446                detect_system_packages: false,
1447                detect_packages_in_compiled: true,
1448                detect_copyrights: false,
1449                detect_generated: false,
1450                detect_emails: false,
1451                detect_urls: false,
1452                max_emails: 50,
1453                max_urls: 50,
1454                timeout_seconds: 120.0,
1455            },
1456        );
1457
1458        let without_compiled = without_compiled
1459            .files
1460            .into_iter()
1461            .find(|entry| entry.file_type == FileType::File && entry.path.ends_with("/demo"))
1462            .expect("compiled artifact present");
1463        let with_compiled = with_compiled
1464            .files
1465            .into_iter()
1466            .find(|entry| entry.file_type == FileType::File && entry.path.ends_with("/demo"))
1467            .expect("compiled artifact present");
1468
1469        assert!(
1470            without_compiled.package_data.is_empty(),
1471            "package_data: {:#?}",
1472            without_compiled.package_data
1473        );
1474        assert!(!with_compiled.package_data.is_empty());
1475    }
1476
1477    #[test]
1478    fn scanner_parses_windows_executable_packages_under_normal_package_scan() {
1479        let temp_dir = TempDir::new().expect("create temp dir");
1480        let file_path = temp_dir.path().join("libiconv2.dll");
1481        let fixture = fs::read("testdata/compiled-binary-golden/win_pe/libiconv2.dll")
1482            .expect("read PE fixture");
1483        fs::write(&file_path, fixture).expect("write PE fixture");
1484
1485        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1486        let collected = collect_paths(temp_dir.path(), 0, &[]);
1487
1488        let without_package = process_collected(
1489            &collected,
1490            Arc::clone(&progress),
1491            None,
1492            LicenseScanOptions::default(),
1493            &TextDetectionOptions {
1494                collect_info: false,
1495                detect_packages: false,
1496                detect_application_packages: false,
1497                detect_system_packages: false,
1498                detect_packages_in_compiled: false,
1499                detect_copyrights: false,
1500                detect_generated: false,
1501                detect_emails: false,
1502                detect_urls: false,
1503                max_emails: 50,
1504                max_urls: 50,
1505                timeout_seconds: 120.0,
1506            },
1507        );
1508        let with_package = process_collected(
1509            &collected,
1510            progress,
1511            None,
1512            LicenseScanOptions::default(),
1513            &TextDetectionOptions {
1514                collect_info: false,
1515                detect_packages: true,
1516                detect_application_packages: true,
1517                detect_system_packages: false,
1518                detect_packages_in_compiled: false,
1519                detect_copyrights: false,
1520                detect_generated: false,
1521                detect_emails: false,
1522                detect_urls: false,
1523                max_emails: 50,
1524                max_urls: 50,
1525                timeout_seconds: 120.0,
1526            },
1527        );
1528
1529        let without_package = without_package
1530            .files
1531            .into_iter()
1532            .find(|entry| {
1533                entry.file_type == FileType::File && entry.path.ends_with("/libiconv2.dll")
1534            })
1535            .expect("compiled artifact present");
1536        let with_package = with_package
1537            .files
1538            .into_iter()
1539            .find(|entry| {
1540                entry.file_type == FileType::File && entry.path.ends_with("/libiconv2.dll")
1541            })
1542            .expect("compiled artifact present");
1543
1544        assert!(without_package.package_data.is_empty());
1545        assert_eq!(with_package.package_data.len(), 1);
1546        assert_eq!(
1547            with_package.package_data[0].package_type,
1548            Some(FilePackageType::Winexe)
1549        );
1550        assert_eq!(
1551            with_package.package_data[0].datasource_id,
1552            Some(DatasourceId::WindowsExecutable)
1553        );
1554    }
1555
1556    #[test]
1557    fn scanner_keeps_nsis_and_windows_executable_package_data_together() {
1558        let temp_dir = TempDir::new().expect("create temp dir");
1559        let file_path = temp_dir.path().join("nsis-with-version.exe");
1560        let mut fixture = fs::read("testdata/compiled-binary-golden/win_pe/libiconv2.dll")
1561            .expect("read PE fixture");
1562        if fixture.len() < 70_000 {
1563            fixture.resize(70_000, 0);
1564        }
1565        fixture.extend_from_slice(b"Nullsoft.NSIS.exehead");
1566        fs::write(&file_path, fixture).expect("write synthetic NSIS PE fixture");
1567
1568        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1569        let collected = collect_paths(temp_dir.path(), 0, &[]);
1570        let result = process_collected(
1571            &collected,
1572            progress,
1573            None,
1574            LicenseScanOptions::default(),
1575            &TextDetectionOptions {
1576                collect_info: false,
1577                detect_packages: true,
1578                detect_application_packages: true,
1579                detect_system_packages: false,
1580                detect_packages_in_compiled: false,
1581                detect_copyrights: false,
1582                detect_generated: false,
1583                detect_emails: false,
1584                detect_urls: false,
1585                max_emails: 50,
1586                max_urls: 50,
1587                timeout_seconds: 120.0,
1588            },
1589        );
1590
1591        let scanned = result
1592            .files
1593            .into_iter()
1594            .find(|entry| {
1595                entry.file_type == FileType::File && entry.path.ends_with("/nsis-with-version.exe")
1596            })
1597            .expect("compiled artifact present");
1598
1599        assert_eq!(
1600            scanned.package_data.len(),
1601            2,
1602            "package_data: {:#?}",
1603            scanned.package_data
1604        );
1605        assert!(
1606            scanned
1607                .package_data
1608                .iter()
1609                .any(|pkg| pkg.datasource_id == Some(DatasourceId::NsisInstaller))
1610        );
1611        assert!(
1612            scanned
1613                .package_data
1614                .iter()
1615                .any(|pkg| pkg.datasource_id == Some(DatasourceId::WindowsExecutable))
1616        );
1617    }
1618
1619    #[test]
1620    fn scanner_detects_license_from_font_metadata() {
1621        let temp_dir = TempDir::new().expect("create temp dir");
1622        let file_path = temp_dir.path().join("Lato-Bold.ttf");
1623        let fixture = fs::read("testdata/font-fixtures/Lato-Bold.ttf").expect("read font fixture");
1624        fs::write(&file_path, fixture).expect("write font fixture");
1625
1626        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1627        let collected = collect_paths(temp_dir.path(), 0, &[]);
1628        let engine =
1629            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
1630        let result = process_collected(
1631            &collected,
1632            progress,
1633            Some(engine),
1634            LicenseScanOptions::default(),
1635            &TextDetectionOptions::default(),
1636        );
1637        let scanned = result
1638            .files
1639            .into_iter()
1640            .find(|entry| {
1641                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1642            })
1643            .expect("scanned file entry");
1644
1645        assert!(
1646            scanned.detected_license_expression.is_some(),
1647            "license detections: {:#?}",
1648            scanned.license_detections
1649        );
1650        assert!(
1651            scanned
1652                .detected_license_expression
1653                .as_deref()
1654                .is_some_and(
1655                    |expression| expression.contains("OFL-1.1") || expression.contains("ofl-1.1")
1656                ),
1657            "license expression: {:?}",
1658            scanned.detected_license_expression
1659        );
1660    }
1661
1662    #[test]
1663    fn scanner_detects_license_from_windows_executable_metadata() {
1664        let temp_dir = TempDir::new().expect("create temp dir");
1665        let file_path = temp_dir.path().join("libiconv2.dll");
1666        let fixture = fs::read("testdata/compiled-binary-golden/win_pe/libiconv2.dll")
1667            .expect("read PE fixture");
1668        fs::write(&file_path, fixture).expect("write PE fixture");
1669
1670        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1671        let collected = collect_paths(temp_dir.path(), 0, &[]);
1672        let engine =
1673            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
1674        let result = process_collected(
1675            &collected,
1676            progress,
1677            Some(engine),
1678            LicenseScanOptions::default(),
1679            &TextDetectionOptions::default(),
1680        );
1681        let scanned = result
1682            .files
1683            .into_iter()
1684            .find(|entry| {
1685                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1686            })
1687            .expect("scanned file entry");
1688
1689        assert!(
1690            scanned.detected_license_expression.is_some(),
1691            "license detections: {:#?}",
1692            scanned.license_detections
1693        );
1694        assert!(
1695            scanned
1696                .detected_license_expression
1697                .as_deref()
1698                .is_some_and(|expression| {
1699                    expression.contains("lgpl") || expression.contains("LGPL")
1700                }),
1701            "license expression: {:?}",
1702            scanned.detected_license_expression
1703        );
1704    }
1705
1706    #[test]
1707    fn scanner_detects_license_from_windows_executable_security_notice() {
1708        fn synthetic_pe_with_security_notice(notice: &str) -> Vec<u8> {
1709            let cert_payload = notice
1710                .encode_utf16()
1711                .flat_map(|unit| unit.to_le_bytes())
1712                .collect::<Vec<_>>();
1713            let cert_len = (8 + cert_payload.len()) as u32;
1714            let mut cert = Vec::new();
1715            cert.extend_from_slice(&cert_len.to_le_bytes());
1716            cert.extend_from_slice(&0x0200u16.to_le_bytes());
1717            cert.extend_from_slice(&0x0002u16.to_le_bytes());
1718            cert.extend_from_slice(&cert_payload);
1719            while !cert.len().is_multiple_of(8) {
1720                cert.push(0);
1721            }
1722
1723            let offset = 0x200usize;
1724            let size = cert.len();
1725            let optional_header_size = 224usize;
1726            let pe_header_offset = 0x80usize;
1727            let nt_headers_offset = pe_header_offset + 4;
1728            let optional_header_offset = nt_headers_offset + 20;
1729            let data_directory_offset = optional_header_offset + 96;
1730            let security_directory_offset =
1731                data_directory_offset + pe::IMAGE_DIRECTORY_ENTRY_SECURITY * 8;
1732            let total_len = offset + size;
1733            let mut bytes = vec![0u8; total_len];
1734
1735            bytes[0..2].copy_from_slice(b"MZ");
1736            bytes[0x3c..0x40].copy_from_slice(&(pe_header_offset as u32).to_le_bytes());
1737            bytes[pe_header_offset..pe_header_offset + 4].copy_from_slice(b"PE\0\0");
1738
1739            bytes[nt_headers_offset..nt_headers_offset + 2]
1740                .copy_from_slice(&0x014cu16.to_le_bytes());
1741            bytes[nt_headers_offset + 16..nt_headers_offset + 18]
1742                .copy_from_slice(&(optional_header_size as u16).to_le_bytes());
1743
1744            bytes[optional_header_offset..optional_header_offset + 2]
1745                .copy_from_slice(&0x010bu16.to_le_bytes());
1746            bytes[optional_header_offset + 92..optional_header_offset + 96]
1747                .copy_from_slice(&16u32.to_le_bytes());
1748            bytes[security_directory_offset..security_directory_offset + 4]
1749                .copy_from_slice(&(offset as u32).to_le_bytes());
1750            bytes[security_directory_offset + 4..security_directory_offset + 8]
1751                .copy_from_slice(&(size as u32).to_le_bytes());
1752            bytes[offset..offset + size].copy_from_slice(&cert);
1753
1754            bytes
1755        }
1756
1757        let temp_dir = TempDir::new().expect("create temp dir");
1758        let file_path = temp_dir.path().join("signed.dll");
1759        let fixture = synthetic_pe_with_security_notice(
1760            "use of this Certificate constitutes acceptance of the DigiCert CP/CPS and the Relying Party Agreement which limit liability and are incorporated herein by reference.",
1761        );
1762        fs::write(&file_path, fixture).expect("write PE fixture");
1763
1764        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1765        let collected = collect_paths(temp_dir.path(), 0, &[]);
1766        let engine =
1767            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
1768        let result = process_collected(
1769            &collected,
1770            progress,
1771            Some(engine),
1772            LicenseScanOptions::default(),
1773            &TextDetectionOptions::default(),
1774        );
1775        let scanned = result
1776            .files
1777            .into_iter()
1778            .find(|entry| {
1779                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1780            })
1781            .expect("scanned file entry");
1782
1783        assert!(
1784            scanned
1785                .detected_license_expression
1786                .as_deref()
1787                .is_some_and(|expression| expression.contains("proprietary-license")),
1788            "license expression: {:?}, detections: {:#?}",
1789            scanned.detected_license_expression,
1790            scanned.license_detections
1791        );
1792    }
1793
1794    #[test]
1795    fn scanner_detects_cc_by_license_from_markdown_comment_banner() {
1796        let scanned = scan_single_file_with_license_engine(
1797            "navbar.md",
1798            "<!-- Documentation licensed under CC BY 4.0 -->\n<!-- License available at https://creativecommons.org/licenses/by/4.0/ -->\n",
1799            &TextDetectionOptions::default(),
1800        );
1801
1802        assert!(
1803            scanned
1804                .detected_license_expression
1805                .as_deref()
1806                .is_some_and(|expression| {
1807                    expression.contains("cc-by-4.0") || expression.contains("CC-BY-4.0")
1808                }),
1809            "license expression: {:?}",
1810            scanned.detected_license_expression
1811        );
1812    }
1813
1814    #[test]
1815    fn scanner_detects_mit_license_from_shields_badge_markdown() {
1816        let scanned = scan_single_file_with_license_engine(
1817            "README.md",
1818            "[![](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)\n",
1819            &TextDetectionOptions::default(),
1820        );
1821
1822        assert!(
1823            scanned
1824                .detected_license_expression
1825                .as_deref()
1826                .is_some_and(|expression| {
1827                    expression.contains("mit") || expression.contains("MIT")
1828                }),
1829            "license expression: {:?}",
1830            scanned.detected_license_expression
1831        );
1832    }
1833
1834    #[test]
1835    fn scanner_detects_apache_license_from_markdown_readme_phrase() {
1836        let scanned = scan_single_file_with_license_engine(
1837            "README.md",
1838            "This crate is distributed under the terms of the Apache License (Version 2.0).\n",
1839            &TextDetectionOptions::default(),
1840        );
1841
1842        assert!(
1843            scanned
1844                .detected_license_expression
1845                .as_deref()
1846                .is_some_and(|expression| {
1847                    expression.contains("apache-2.0") || expression.contains("Apache-2.0")
1848                }),
1849            "license expression: {:?}",
1850            scanned.detected_license_expression
1851        );
1852    }
1853
1854    #[test]
1855    fn scanner_prefers_dual_license_readme_expression_over_supplemental_mentions() {
1856        let scanned = scan_single_file_with_license_engine(
1857            "README.md",
1858            concat!(
1859                "## License\n\n",
1860                "Licensed under either of:\n\n",
1861                " * [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)\n",
1862                " * [MIT license](https://opensource.org/licenses/MIT)\n\n",
1863                "at your option.\n\n",
1864                "### Contribution\n\n",
1865                "Unless you explicitly state otherwise, any contribution intentionally submitted\n",
1866                "for inclusion in the work by you, as defined in the Apache-2.0 license, shall be\n",
1867                "dual licensed as above, without any additional terms or conditions.\n",
1868            ),
1869            &TextDetectionOptions::default(),
1870        );
1871
1872        assert!(
1873            matches!(
1874                scanned.detected_license_expression.as_deref(),
1875                Some("Apache-2.0 OR MIT") | Some("MIT OR Apache-2.0")
1876            ),
1877            "license expression: {:?}",
1878            scanned.detected_license_expression
1879        );
1880        assert!(
1881            !scanned
1882                .license_detections
1883                .iter()
1884                .any(|detection| detection.license_expression_spdx == "Apache-2.0"),
1885            "detections: {:?}",
1886            scanned.license_detections
1887        );
1888    }
1889
1890    #[test]
1891    fn scanner_drops_redundant_conjunctive_readme_detection_when_or_notice_exists() {
1892        let scanned = scan_single_file_with_license_engine(
1893            "README.md",
1894            concat!(
1895                "## License\n\n",
1896                "Licensed under either of:\n\n",
1897                " * [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)\n",
1898                " * [MIT license](https://opensource.org/licenses/MIT)\n\n",
1899                "at your option.\n\n",
1900                "### Contribution\n\n",
1901                "Unless you explicitly state otherwise, any contribution intentionally submitted\n",
1902                "for inclusion in the work by you, as defined in the Apache-2.0 license, shall be\n",
1903                "dual licensed as above, without any additional terms or conditions.\n\n",
1904                "[license-image]: https://img.shields.io/badge/license-Apache2.0/MIT-blue.svg\n",
1905            ),
1906            &TextDetectionOptions::default(),
1907        );
1908
1909        assert!(
1910            !scanned
1911                .license_detections
1912                .iter()
1913                .any(|detection| { detection.license_expression_spdx == "Apache-2.0 AND MIT" })
1914        );
1915    }
1916
1917    #[test]
1918    fn scanner_drops_unknown_placeholder_from_dual_license_readme_notice() {
1919        let scanned = scan_single_file_with_license_engine(
1920            "README.md",
1921            concat!(
1922                "## License\n\n",
1923                "This project is dual-licensed under MIT and Apache 2.0.\n",
1924            ),
1925            &TextDetectionOptions::default(),
1926        );
1927
1928        assert!(
1929            matches!(
1930                scanned.detected_license_expression.as_deref(),
1931                Some("Apache-2.0 OR MIT") | Some("MIT OR Apache-2.0")
1932            ),
1933            "license expression: {:?}",
1934            scanned.detected_license_expression
1935        );
1936        assert!(scanned.license_detections.iter().any(|detection| {
1937            detection
1938                .license_expression_spdx
1939                .contains("Apache-2.0 OR MIT")
1940                || detection
1941                    .license_expression_spdx
1942                    .contains("MIT OR Apache-2.0")
1943        }));
1944        assert!(!scanned.license_detections.iter().any(|detection| {
1945            detection.license_expression_spdx == "LicenseRef-scancode-unknown-license-reference"
1946        }));
1947        assert!(
1948            scanned
1949                .license_detections
1950                .iter()
1951                .any(|detection| detection.license_expression_spdx == "MIT"),
1952            "detections: {:?}",
1953            scanned.license_detections
1954        );
1955    }
1956
1957    #[test]
1958    fn scanner_sets_is_source_only_when_info_enabled() {
1959        let without_info = TextDetectionOptions {
1960            collect_info: false,
1961            detect_packages: false,
1962            detect_application_packages: false,
1963            detect_system_packages: false,
1964            detect_packages_in_compiled: false,
1965            detect_copyrights: false,
1966            detect_generated: false,
1967            detect_emails: false,
1968            detect_urls: false,
1969            max_emails: 50,
1970            max_urls: 50,
1971            timeout_seconds: 120.0,
1972        };
1973        let with_info = TextDetectionOptions {
1974            collect_info: true,
1975            ..without_info.clone()
1976        };
1977
1978        let scanned_without_info = scan_single_file("main.rs", "fn main() {}\n", &without_info);
1979        let scanned_with_info = scan_single_file("main.rs", "fn main() {}\n", &with_info);
1980
1981        assert_eq!(scanned_without_info.is_source, None);
1982        assert_eq!(scanned_with_info.is_source, Some(true));
1983    }
1984
1985    #[test]
1986    fn directory_omits_info_fields_when_info_disabled() {
1987        let temp_dir = TempDir::new().expect("create temp dir");
1988        fs::create_dir_all(temp_dir.path().join("nested")).expect("create nested dir");
1989
1990        let collected = collect_paths(temp_dir.path(), 0, &[]);
1991        let result = process_collected(
1992            &collected,
1993            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1994            None,
1995            LicenseScanOptions::default(),
1996            &TextDetectionOptions {
1997                collect_info: false,
1998                detect_packages: false,
1999                detect_application_packages: false,
2000                detect_system_packages: false,
2001                detect_packages_in_compiled: false,
2002                detect_copyrights: false,
2003                detect_generated: false,
2004                detect_emails: false,
2005                detect_urls: false,
2006                max_emails: 50,
2007                max_urls: 50,
2008                timeout_seconds: 120.0,
2009            },
2010        );
2011
2012        let directory = result
2013            .files
2014            .into_iter()
2015            .find(|entry| entry.file_type == FileType::Directory && entry.path.ends_with("nested"))
2016            .expect("directory entry");
2017
2018        assert!(directory.date.is_none());
2019        assert!(directory.file_type_label.is_none());
2020        assert!(directory.is_binary.is_none());
2021        assert!(directory.is_text.is_none());
2022        assert!(directory.is_archive.is_none());
2023        assert!(directory.is_media.is_none());
2024        assert!(directory.is_source.is_none());
2025        assert!(directory.is_script.is_none());
2026    }
2027
2028    #[test]
2029    fn directory_includes_info_fields_when_info_enabled() {
2030        let temp_dir = TempDir::new().expect("create temp dir");
2031        fs::create_dir_all(temp_dir.path().join("nested")).expect("create nested dir");
2032
2033        let collected = collect_paths(temp_dir.path(), 0, &[]);
2034        let result = process_collected(
2035            &collected,
2036            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
2037            None,
2038            LicenseScanOptions::default(),
2039            &TextDetectionOptions {
2040                collect_info: true,
2041                detect_packages: false,
2042                detect_application_packages: false,
2043                detect_system_packages: false,
2044                detect_packages_in_compiled: false,
2045                detect_copyrights: false,
2046                detect_generated: false,
2047                detect_emails: false,
2048                detect_urls: false,
2049                max_emails: 50,
2050                max_urls: 50,
2051                timeout_seconds: 120.0,
2052            },
2053        );
2054
2055        let directory = result
2056            .files
2057            .into_iter()
2058            .find(|entry| entry.file_type == FileType::Directory && entry.path.ends_with("nested"))
2059            .expect("directory entry");
2060
2061        assert!(directory.date.is_none());
2062        assert!(directory.file_type_label.is_none());
2063        assert_eq!(directory.is_binary, Some(false));
2064        assert_eq!(directory.is_text, Some(false));
2065        assert_eq!(directory.is_archive, Some(false));
2066        assert_eq!(directory.is_media, Some(false));
2067        assert_eq!(directory.is_source, Some(false));
2068        assert_eq!(directory.is_script, Some(false));
2069        assert_eq!(directory.files_count, Some(0));
2070        assert_eq!(directory.dirs_count, Some(0));
2071        assert_eq!(directory.size_count, Some(0));
2072    }
2073
2074    #[test]
2075    fn collect_paths_includes_root_directory_entry() {
2076        let temp_dir = TempDir::new().expect("create temp dir");
2077        fs::create_dir_all(temp_dir.path().join("src")).expect("create nested dir");
2078        fs::write(temp_dir.path().join("src").join("main.rs"), "fn main() {}")
2079            .expect("write nested file");
2080
2081        let collected = collect_paths(temp_dir.path(), 0, &[]);
2082
2083        assert!(
2084            collected
2085                .directories
2086                .iter()
2087                .any(|(path, _)| path == temp_dir.path())
2088        );
2089    }
2090
2091    #[test]
2092    fn collect_paths_supports_single_file_input() {
2093        let temp_dir = TempDir::new().expect("create temp dir");
2094        let file_path = temp_dir.path().join("main.rs");
2095        fs::write(&file_path, "fn main() {}\n").expect("write file");
2096
2097        let collected = collect_paths(&file_path, 0, &[]);
2098
2099        assert_eq!(collected.files.len(), 1);
2100        assert!(collected.directories.is_empty());
2101        assert_eq!(collected.files[0].0, file_path);
2102    }
2103
2104    #[cfg(unix)]
2105    #[test]
2106    fn collect_selected_paths_does_not_walk_unselected_siblings() {
2107        use std::os::unix::fs::PermissionsExt;
2108
2109        let temp_dir = TempDir::new().expect("create temp dir");
2110        let root = temp_dir.path();
2111        fs::create_dir_all(root.join("selected/docs")).expect("create selected dir");
2112        fs::create_dir_all(root.join("blocked/secret")).expect("create blocked dir");
2113        fs::write(root.join("selected/docs/guide.md"), "# guide\n").expect("write guide");
2114
2115        let blocked = root.join("blocked");
2116        let mut perms = fs::metadata(&blocked)
2117            .expect("blocked metadata")
2118            .permissions();
2119        perms.set_mode(0o000);
2120        fs::set_permissions(&blocked, perms).expect("remove blocked permissions");
2121
2122        let collected = collect_selected_paths(
2123            root,
2124            &[CollectionFrontier {
2125                path: PathBuf::from("selected"),
2126                recurse: true,
2127            }],
2128            0,
2129            &[],
2130        );
2131
2132        let mut restore = fs::metadata(&blocked)
2133            .expect("blocked metadata")
2134            .permissions();
2135        restore.set_mode(0o755);
2136        fs::set_permissions(&blocked, restore).expect("restore blocked permissions");
2137
2138        assert!(
2139            collected.collection_errors.is_empty(),
2140            "{:#?}",
2141            collected.collection_errors
2142        );
2143        assert!(
2144            collected
2145                .files
2146                .iter()
2147                .any(|(path, _)| path == &root.join("selected/docs/guide.md"))
2148        );
2149        assert!(
2150            collected
2151                .files
2152                .iter()
2153                .all(|(path, _): &(PathBuf, fs::Metadata)| !path.starts_with(&blocked))
2154        );
2155    }
2156
2157    #[test]
2158    fn collect_selected_paths_respects_excluded_ancestor_directories() {
2159        let temp_dir = TempDir::new().expect("create temp dir");
2160        let root = temp_dir.path();
2161        fs::create_dir_all(root.join(".git")).expect("create git dir");
2162        fs::write(
2163            root.join(".git/config"),
2164            "[core]\nrepositoryformatversion = 0\n",
2165        )
2166        .expect("write git config");
2167
2168        let exclude_patterns =
2169            build_collection_exclude_patterns(root, &root.join(".provenant-cache"));
2170        let collected = collect_selected_paths(
2171            root,
2172            &[CollectionFrontier {
2173                path: PathBuf::from(".git/config"),
2174                recurse: false,
2175            }],
2176            0,
2177            &exclude_patterns,
2178        );
2179
2180        assert!(collected.files.is_empty());
2181        assert!(collected.directories.iter().all(|(path, _)| path == root));
2182        assert_eq!(collected.excluded_count, 1);
2183    }
2184
2185    #[test]
2186    fn process_collected_with_memory_limit_preserves_results_when_spilling() {
2187        let temp_dir = TempDir::new().expect("create temp dir");
2188        fs::write(temp_dir.path().join("a.txt"), "hello").expect("write first file");
2189        fs::write(temp_dir.path().join("b.txt"), "world").expect("write second file");
2190
2191        let collected = collect_paths(temp_dir.path(), 0, &[]);
2192        let result = process_collected_with_memory_limit(
2193            &collected,
2194            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
2195            None,
2196            LicenseScanOptions::default(),
2197            &TextDetectionOptions {
2198                collect_info: false,
2199                detect_packages: false,
2200                detect_application_packages: false,
2201                detect_system_packages: false,
2202                detect_packages_in_compiled: false,
2203                detect_copyrights: false,
2204                detect_generated: false,
2205                detect_emails: false,
2206                detect_urls: false,
2207                max_emails: 50,
2208                max_urls: 50,
2209                timeout_seconds: 120.0,
2210            },
2211            MemoryMode::Limit(1),
2212        );
2213
2214        assert_eq!(result.files.len(), 3);
2215    }
2216
2217    #[test]
2218    fn process_collected_with_negative_one_uses_disk_only_mode() {
2219        let temp_dir = TempDir::new().expect("create temp dir");
2220        fs::write(temp_dir.path().join("a.txt"), "hello").expect("write first file");
2221
2222        let collected = collect_paths(temp_dir.path(), 0, &[]);
2223        let result = process_collected_with_memory_limit(
2224            &collected,
2225            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
2226            None,
2227            LicenseScanOptions::default(),
2228            &TextDetectionOptions {
2229                collect_info: false,
2230                detect_packages: false,
2231                detect_application_packages: false,
2232                detect_system_packages: false,
2233                detect_packages_in_compiled: false,
2234                detect_copyrights: false,
2235                detect_generated: false,
2236                detect_emails: false,
2237                detect_urls: false,
2238                max_emails: 50,
2239                max_urls: 50,
2240                timeout_seconds: 120.0,
2241            },
2242            MemoryMode::StreamUnlimited,
2243        );
2244
2245        assert_eq!(result.files.len(), 2);
2246    }
2247}