Skip to main content

provenant/scanner/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4mod collect;
5mod process;
6
7use crate::license_detection::LicenseDetectionEngine;
8use crate::models::FileInfo;
9
/// Outcome of a processing pass: the file entries it produced plus an exclusion tally.
pub struct ProcessResult {
    /// File entries produced by the pass.
    pub files: Vec<FileInfo>,
    /// Number of excluded items.
    // NOTE(review): presumably counts paths dropped by exclusion rules — confirm in `process`.
    pub excluded_count: usize,
}
14
/// License-scan switches; every field is recorded by `scan_options_fingerprint`.
///
/// The derived `Default` sets every flag to `false` and `min_score` to `0`.
#[derive(Debug, Clone, Copy, Default)]
pub struct LicenseScanOptions {
    /// Recorded in the fingerprint as `license_text`.
    // NOTE(review): presumably includes matched license text in results — confirm in `process`.
    pub include_text: bool,
    /// Recorded in the fingerprint as `license_text_diagnostics`.
    pub include_text_diagnostics: bool,
    /// Recorded in the fingerprint as `license_diagnostics`.
    pub include_diagnostics: bool,
    /// Recorded in the fingerprint as `unknown_licenses`.
    // NOTE(review): presumably enables detection of unknown licenses — confirm in `process`.
    pub unknown_licenses: bool,
    /// Recorded in the fingerprint as `license_score`.
    // NOTE(review): presumably a minimum match score threshold (0-100) — confirm valid range.
    pub min_score: u8,
}
23
/// Per-scan detection switches and limits; every field is recorded by
/// `scan_options_fingerprint`.
///
/// See the `Default` implementation for the baseline values.
#[derive(Debug, Clone)]
pub struct TextDetectionOptions {
    /// Collect file "info" data (the tests in this module observe hashes, MIME type,
    /// date, language and classification flags when this is set).
    pub collect_info: bool,
    /// Package detection switch.
    pub detect_packages: bool,
    /// Application-package detection switch.
    pub detect_application_packages: bool,
    /// System-package detection switch.
    pub detect_system_packages: bool,
    /// Recorded in the fingerprint as `compiled_packages`.
    // NOTE(review): presumably package detection inside compiled binaries — confirm in `process`.
    pub detect_packages_in_compiled: bool,
    /// Copyright detection switch; the only flag that defaults to `true`.
    pub detect_copyrights: bool,
    /// Generated-code detection switch; the tests observe `is_generated` staying `None`
    /// when this is `false`.
    pub detect_generated: bool,
    /// E-mail detection switch.
    pub detect_emails: bool,
    /// URL detection switch.
    pub detect_urls: bool,
    /// E-mail limit (default 50).
    // NOTE(review): presumably a per-file cap on reported e-mails — confirm in `process`.
    pub max_emails: usize,
    /// URL limit (default 50).
    // NOTE(review): presumably a per-file cap on reported URLs — confirm in `process`.
    pub max_urls: usize,
    /// Timeout in seconds (default 120.0).
    // NOTE(review): presumably applied per file — confirm in `process`.
    pub timeout_seconds: f64,
}
39
40impl Default for TextDetectionOptions {
41    fn default() -> Self {
42        Self {
43            collect_info: false,
44            detect_packages: false,
45            detect_application_packages: false,
46            detect_system_packages: false,
47            detect_packages_in_compiled: false,
48            detect_copyrights: true,
49            detect_generated: false,
50            detect_emails: false,
51            detect_urls: false,
52            max_emails: 50,
53            max_urls: 50,
54            timeout_seconds: 120.0,
55        }
56    }
57}
58
59pub fn scan_options_fingerprint(
60    text_options: &TextDetectionOptions,
61    license_options: LicenseScanOptions,
62    license_engine: Option<&LicenseDetectionEngine>,
63) -> String {
64    let (license_enabled, rules_count, first_rule_id, last_rule_id) = match license_engine {
65        Some(engine) => {
66            let rules = &engine.index().rules_by_rid;
67            (
68                true,
69                rules.len(),
70                rules
71                    .first()
72                    .map(|rule| rule.identifier.as_str())
73                    .unwrap_or(""),
74                rules
75                    .last()
76                    .map(|rule| rule.identifier.as_str())
77                    .unwrap_or(""),
78            )
79        }
80        None => (false, 0, "", ""),
81    };
82
83    format!(
84        "tool_version={};info={};packages={};app_packages={};system_packages={};compiled_packages={};copyrights={};generated={};emails={};urls={};max_emails={};max_urls={};timeout={:.6};license_enabled={};rules_count={};first_rule_id={};last_rule_id={};license_text={};license_text_diagnostics={};license_diagnostics={};unknown_licenses={};license_score={}",
85        crate::version::BUILD_VERSION,
86        text_options.collect_info,
87        text_options.detect_packages,
88        text_options.detect_application_packages,
89        text_options.detect_system_packages,
90        text_options.detect_packages_in_compiled,
91        text_options.detect_copyrights,
92        text_options.detect_generated,
93        text_options.detect_emails,
94        text_options.detect_urls,
95        text_options.max_emails,
96        text_options.max_urls,
97        text_options.timeout_seconds,
98        license_enabled,
99        rules_count,
100        first_rule_id,
101        last_rule_id,
102        license_options.include_text,
103        license_options.include_text_diagnostics,
104        license_options.include_diagnostics,
105        license_options.unknown_licenses,
106        license_options.min_score,
107    )
108}
109
110pub use self::collect::{
111    CollectedPaths, CollectionFrontier, collect_paths, collect_selected_paths,
112};
113#[allow(unused_imports)]
114pub use self::process::{
115    MemoryMode, process_collected, process_collected_sequential,
116    process_collected_with_memory_limit, process_collected_with_memory_limit_sequential,
117};
118
119#[cfg(test)]
120mod tests {
121    use std::fs;
122    use std::path::PathBuf;
123    use std::sync::Arc;
124
125    use object::pe;
126    use tempfile::TempDir;
127
128    use crate::cache::build_collection_exclude_patterns;
129    use crate::license_detection::LicenseDetectionEngine;
130    use crate::models::{DatasourceId, FileType, PackageType as FilePackageType};
131    use crate::progress::{ProgressMode, ScanProgress};
132
133    use super::{
134        CollectionFrontier, LicenseScanOptions, MemoryMode, TextDetectionOptions, collect_paths,
135        collect_selected_paths, process_collected, process_collected_with_memory_limit,
136        scan_options_fingerprint,
137    };
138
139    fn build_sparse_oversized_rpm_with_filename(
140        temp_dir: &TempDir,
141        package_name: &str,
142        filename: &str,
143    ) -> PathBuf {
144        let file_path = temp_dir.path().join(filename);
145        rpm::PackageBuilder::new(package_name, "1.0", "MIT", "x86_64", "Demo RPM package")
146            .release("1")
147            .build()
148            .expect("build rpm fixture")
149            .write_file(&file_path)
150            .expect("write rpm fixture");
151        fs::OpenOptions::new()
152            .write(true)
153            .open(&file_path)
154            .expect("open rpm fixture for sparse extension")
155            .set_len(100 * 1024 * 1024 + 1_048_576)
156            .expect("extend rpm fixture");
157        file_path
158    }
159
160    fn build_sparse_oversized_rpm(temp_dir: &TempDir, name: &str) -> PathBuf {
161        build_sparse_oversized_rpm_with_filename(
162            temp_dir,
163            name,
164            &format!("{name}-1.0-1.x86_64.rpm"),
165        )
166    }
167
168    fn build_sparse_oversized_pack_rpm(temp_dir: &TempDir, name: &str) -> PathBuf {
169        build_sparse_oversized_rpm_with_filename(
170            temp_dir,
171            name,
172            &format!("{name}-1.0-1.x86_64.pack"),
173        )
174    }
175
176    #[test]
177    fn default_options_keep_copyright_detection_enabled() {
178        let options = TextDetectionOptions::default();
179        assert!(!options.detect_packages);
180        assert!(options.detect_copyrights);
181    }
182
183    #[test]
184    fn test_scan_options_fingerprint_changes_with_license_score() {
185        let text_options = TextDetectionOptions::default();
186        let default_fingerprint = scan_options_fingerprint(
187            &text_options,
188            LicenseScanOptions {
189                min_score: 0,
190                ..LicenseScanOptions::default()
191            },
192            None,
193        );
194        let filtered_fingerprint = scan_options_fingerprint(
195            &text_options,
196            LicenseScanOptions {
197                min_score: 70,
198                ..LicenseScanOptions::default()
199            },
200            None,
201        );
202
203        assert_ne!(default_fingerprint, filtered_fingerprint);
204    }
205
206    fn scan_single_file(
207        file_name: &str,
208        content: &str,
209        options: &TextDetectionOptions,
210    ) -> crate::models::FileInfo {
211        let temp_dir = TempDir::new().expect("create temp dir");
212        let file_path = temp_dir.path().join(file_name);
213        fs::write(&file_path, content).expect("write test file");
214
215        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
216        let collected = collect_paths(temp_dir.path(), 0, &[]);
217        let result = process_collected(
218            &collected,
219            progress,
220            None,
221            LicenseScanOptions::default(),
222            options,
223        );
224
225        result
226            .files
227            .into_iter()
228            .find(|entry| {
229                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
230            })
231            .expect("scanned file entry")
232    }
233
234    fn scan_file_at_relative_path(
235        relative_path: &str,
236        content: &[u8],
237        options: &TextDetectionOptions,
238    ) -> crate::models::FileInfo {
239        let temp_dir = TempDir::new().expect("create temp dir");
240        let file_path = temp_dir.path().join(relative_path);
241        if let Some(parent) = file_path.parent() {
242            fs::create_dir_all(parent).expect("create parent dirs");
243        }
244        fs::write(&file_path, content).expect("write test file");
245
246        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
247        let collected = collect_paths(temp_dir.path(), 0, &[]);
248        let result = process_collected(
249            &collected,
250            progress,
251            None,
252            LicenseScanOptions::default(),
253            options,
254        );
255
256        result
257            .files
258            .into_iter()
259            .find(|entry| {
260                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
261            })
262            .expect("scanned file entry")
263    }
264
265    fn scan_single_file_with_license_engine(
266        file_name: &str,
267        content: &str,
268        options: &TextDetectionOptions,
269    ) -> crate::models::FileInfo {
270        let temp_dir = TempDir::new().expect("create temp dir");
271        let file_path = temp_dir.path().join(file_name);
272        fs::write(&file_path, content).expect("write test file");
273
274        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
275        let collected = collect_paths(temp_dir.path(), 0, &[]);
276        let engine =
277            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
278        let result = process_collected(
279            &collected,
280            progress,
281            Some(engine),
282            LicenseScanOptions::default(),
283            options,
284        );
285
286        result
287            .files
288            .into_iter()
289            .find(|entry| {
290                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
291            })
292            .expect("scanned file entry")
293    }
294
295    #[test]
296    fn scanner_reports_repeated_email_occurrences() {
297        let options = TextDetectionOptions {
298            collect_info: false,
299            detect_packages: false,
300            detect_application_packages: false,
301            detect_system_packages: false,
302            detect_packages_in_compiled: false,
303            detect_copyrights: false,
304            detect_generated: false,
305            detect_emails: true,
306            detect_urls: false,
307            max_emails: 50,
308            max_urls: 50,
309            timeout_seconds: 120.0,
310        };
311        let scanned = scan_single_file(
312            "contacts.txt",
313            "linux@3ware.com\nlinux@3ware.com\nandre@suse.com\nlinux@3ware.com\n",
314            &options,
315        );
316
317        let emails: Vec<(&str, usize)> = scanned
318            .emails
319            .iter()
320            .map(|email| (email.email.as_str(), email.start_line.get()))
321            .collect();
322
323        assert_eq!(emails.len(), 4, "emails: {emails:#?}");
324        assert_eq!(
325            emails,
326            vec![
327                ("linux@3ware.com", 1),
328                ("linux@3ware.com", 2),
329                ("andre@suse.com", 3),
330                ("linux@3ware.com", 4),
331            ]
332        );
333    }
334
335    #[test]
336    fn scanner_skips_pem_certificate_text_detection() {
337        let options = TextDetectionOptions {
338            collect_info: false,
339            detect_packages: false,
340            detect_application_packages: false,
341            detect_system_packages: false,
342            detect_packages_in_compiled: false,
343            detect_copyrights: true,
344            detect_generated: false,
345            detect_emails: true,
346            detect_urls: true,
347            max_emails: 50,
348            max_urls: 50,
349            timeout_seconds: 120.0,
350        };
351        let pem_fixture = concat!(
352            "-----BEGIN CERTIFICATE-----\n",
353            "MIID8TCCAtmgAwIBAgIQQT1yx/RrH4FDffHSKFTfmjANBgkqhkiG9w0BAQUFADCB\n",
354            "ijELMAkGA1UEBhMCQ0gxEDAOBgNVBAoTB1dJU2VLZXkxGzAZBgNVBAsTEkNvcHly\n",
355            "-----END CERTIFICATE-----\n",
356            "Certificate:\n",
357            "    Data:\n",
358            "        Signature Algorithm: sha1WithRSAEncryption\n",
359            "        Issuer: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
360            "        Subject: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
361            "        Contact: cert-owner@example.com\n",
362        );
363        let scanned = scan_single_file("cert.pem", pem_fixture, &options);
364
365        assert!(
366            scanned.copyrights.is_empty(),
367            "copyrights: {:#?}",
368            scanned.copyrights
369        );
370        assert!(
371            scanned.holders.is_empty(),
372            "holders: {:#?}",
373            scanned.holders
374        );
375        assert!(
376            scanned.authors.is_empty(),
377            "authors: {:#?}",
378            scanned.authors
379        );
380        assert!(scanned.emails.is_empty(), "emails: {:#?}", scanned.emails);
381        assert!(scanned.urls.is_empty(), "urls: {:#?}", scanned.urls);
382        assert!(
383            scanned.license_detections.is_empty(),
384            "licenses: {:#?}",
385            scanned.license_detections
386        );
387        assert!(
388            scanned.license_clues.is_empty(),
389            "license clues: {:#?}",
390            scanned.license_clues
391        );
392    }
393
394    #[test]
395    fn scanner_keeps_source_headers_when_pem_blocks_are_embedded() {
396        let options = TextDetectionOptions {
397            collect_info: false,
398            detect_packages: false,
399            detect_application_packages: false,
400            detect_system_packages: false,
401            detect_packages_in_compiled: false,
402            detect_copyrights: true,
403            detect_generated: false,
404            detect_emails: false,
405            detect_urls: true,
406            max_emails: 50,
407            max_urls: 50,
408            timeout_seconds: 120.0,
409        };
410        let fixture = concat!(
411            "/*\n",
412            "Copyright 2022 The Kubernetes Authors.\n\n",
413            "Licensed under the Apache License, Version 2.0 (the \"License\");\n",
414            "you may not use this file except in compliance with the License.\n",
415            "You may obtain a copy of the License at\n\n",
416            "    http://www.apache.org/licenses/LICENSE-2.0\n",
417            "*/\n\n",
418            "package storage\n\n",
419            "const validCert = `\n",
420            "-----BEGIN CERTIFICATE-----\n",
421            "MIIDmTCCAoGgAwIBAgIUWQ==\n",
422            "-----END CERTIFICATE-----\n",
423            "`\n",
424        );
425        let temp_dir = TempDir::new().expect("create temp dir");
426        let file_path = temp_dir.path().join("storage_test.go");
427        fs::write(&file_path, fixture).expect("write fixture");
428
429        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
430        let collected = collect_paths(temp_dir.path(), 0, &[]);
431        let engine =
432            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
433        let result = process_collected(
434            &collected,
435            progress,
436            Some(engine),
437            LicenseScanOptions::default(),
438            &options,
439        );
440        let scanned = result
441            .files
442            .into_iter()
443            .find(|entry| {
444                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
445            })
446            .expect("scanned file entry");
447
448        assert!(
449            scanned
450                .copyrights
451                .iter()
452                .any(|c| c.copyright == "Copyright 2022 The Kubernetes Authors"),
453            "copyrights: {:#?}",
454            scanned.copyrights
455        );
456        assert!(
457            scanned
458                .holders
459                .iter()
460                .any(|h| h.holder == "The Kubernetes Authors"),
461            "holders: {:#?}",
462            scanned.holders
463        );
464        assert!(
465            scanned
466                .urls
467                .iter()
468                .any(|u| u.url == "http://www.apache.org/licenses/LICENSE-2.0"),
469            "urls: {:#?}",
470            scanned.urls
471        );
472        assert_eq!(scanned.license_expression.as_deref(), Some("Apache-2.0"));
473    }
474
475    #[test]
476    fn scanner_detects_structured_credits_authors() {
477        let options = TextDetectionOptions {
478            collect_info: false,
479            detect_packages: false,
480            detect_application_packages: false,
481            detect_system_packages: false,
482            detect_packages_in_compiled: false,
483            detect_copyrights: true,
484            detect_generated: false,
485            detect_emails: false,
486            detect_urls: false,
487            max_emails: 50,
488            max_urls: 50,
489            timeout_seconds: 120.0,
490        };
491        let credits_fixture = concat!(
492            "N: Jack Lloyd\n",
493            "E: lloyd@randombit.net\n",
494            "W: http://www.randombit.net/\n",
495        );
496        let scanned = scan_single_file("CREDITS", credits_fixture, &options);
497
498        let authors: Vec<(&str, usize, usize)> = scanned
499            .authors
500            .iter()
501            .map(|author| {
502                (
503                    author.author.as_str(),
504                    author.start_line.get(),
505                    author.end_line.get(),
506                )
507            })
508            .collect();
509
510        assert_eq!(
511            authors,
512            vec![(
513                "Jack Lloyd lloyd@randombit.net http://www.randombit.net/",
514                1,
515                3,
516            )]
517        );
518        assert!(scanned.copyrights.is_empty());
519        assert!(scanned.holders.is_empty());
520    }
521
522    #[test]
523    fn scanner_uses_or_for_alternative_license_header() {
524        let fixture =
525            include_str!("../../testdata/license-golden/datadriven/external/boost-json-d2s.ipp");
526        let temp_dir = TempDir::new().expect("create temp dir");
527        let file_path = temp_dir.path().join("d2s.ipp");
528        fs::write(&file_path, fixture).expect("write fixture");
529
530        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
531        let collected = collect_paths(temp_dir.path(), 0, &[]);
532        let engine =
533            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
534        let result = process_collected(
535            &collected,
536            progress,
537            Some(engine),
538            LicenseScanOptions::default(),
539            &TextDetectionOptions::default(),
540        );
541        let scanned = result
542            .files
543            .into_iter()
544            .find(|entry| {
545                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
546            })
547            .expect("scanned file entry");
548
549        assert_eq!(
550            scanned.license_expression.as_deref(),
551            Some("Apache-2.0 OR BSL-1.0")
552        );
553        assert!(
554            scanned.license_clues.is_empty(),
555            "license clues: {:#?}",
556            scanned.license_clues
557        );
558        assert_eq!(
559            scanned.license_detections.len(),
560            1,
561            "detections: {:#?}",
562            scanned.license_detections
563        );
564
565        let detection = &scanned.license_detections[0];
566        assert_eq!(detection.license_expression_spdx, "Apache-2.0 OR BSL-1.0");
567
568        let match_expressions: Vec<_> = detection
569            .matches
570            .iter()
571            .map(|m| m.license_expression_spdx.as_str())
572            .collect();
573        assert_eq!(match_expressions, vec!["Apache-2.0", "BSL-1.0"]);
574    }
575
576    #[test]
577    fn scanner_sets_generated_flag_when_enabled() {
578        let options = TextDetectionOptions {
579            collect_info: false,
580            detect_packages: false,
581            detect_application_packages: false,
582            detect_system_packages: false,
583            detect_packages_in_compiled: false,
584            detect_copyrights: false,
585            detect_generated: true,
586            detect_emails: false,
587            detect_urls: false,
588            max_emails: 50,
589            max_urls: 50,
590            timeout_seconds: 120.0,
591        };
592        let scanned = scan_single_file(
593            "generated.c",
594            "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
595            &options,
596        );
597
598        assert_eq!(scanned.is_generated, Some(true));
599    }
600
601    #[test]
602    fn scanner_leaves_generated_flag_unset_when_disabled() {
603        let options = TextDetectionOptions {
604            collect_info: false,
605            detect_packages: false,
606            detect_application_packages: false,
607            detect_system_packages: false,
608            detect_packages_in_compiled: false,
609            detect_copyrights: false,
610            detect_generated: false,
611            detect_emails: false,
612            detect_urls: false,
613            max_emails: 50,
614            max_urls: 50,
615            timeout_seconds: 120.0,
616        };
617        let scanned = scan_single_file(
618            "generated.c",
619            "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
620            &options,
621        );
622
623        assert_eq!(scanned.is_generated, None);
624    }
625
626    #[test]
627    fn scanner_populates_info_surface_when_enabled() {
628        let options = TextDetectionOptions {
629            collect_info: true,
630            detect_packages: false,
631            detect_application_packages: false,
632            detect_system_packages: false,
633            detect_packages_in_compiled: false,
634            detect_copyrights: false,
635            detect_generated: false,
636            detect_emails: false,
637            detect_urls: false,
638            max_emails: 50,
639            max_urls: 50,
640            timeout_seconds: 120.0,
641        };
642        let scanned = scan_single_file(
643            "script.py",
644            "#!/usr/bin/env python3\nprint(\"hello\")\n",
645            &options,
646        );
647
648        assert!(scanned.sha1.is_some());
649        assert!(scanned.md5.is_some());
650        assert!(scanned.sha256.is_some());
651        assert!(scanned.sha1_git.is_some());
652        assert!(scanned.mime_type.is_some());
653        assert!(scanned.date.is_some());
654        assert_eq!(scanned.programming_language.as_deref(), Some("Python"));
655        assert_eq!(scanned.is_text, Some(true));
656        assert_eq!(scanned.is_script, Some(true));
657        assert_eq!(scanned.is_source, Some(true));
658    }
659
660    #[test]
661    fn scanner_treats_latin1_python_sources_as_textual_scripts() {
662        let options = TextDetectionOptions {
663            collect_info: true,
664            detect_packages: false,
665            detect_application_packages: false,
666            detect_system_packages: false,
667            detect_packages_in_compiled: false,
668            detect_copyrights: false,
669            detect_generated: false,
670            detect_emails: false,
671            detect_urls: false,
672            max_emails: 50,
673            max_urls: 50,
674            timeout_seconds: 120.0,
675        };
676        let latin1_python = b"# coding: latin-1\nprint(\"caf\xe9\")\n# comment padding\n";
677        let scanned = scan_file_at_relative_path("script.py", latin1_python, &options);
678
679        assert_eq!(scanned.programming_language.as_deref(), Some("Python"));
680        assert_eq!(
681            scanned.file_type_label.as_deref(),
682            Some("python script, text executable")
683        );
684        assert_eq!(scanned.is_binary, Some(false));
685        assert_eq!(scanned.is_text, Some(true));
686        assert_eq!(scanned.is_script, Some(true));
687        assert_eq!(scanned.is_source, Some(true));
688    }
689
690    #[test]
691    fn scanner_skips_findings_for_zip_like_archives() {
692        let options = TextDetectionOptions {
693            collect_info: true,
694            detect_packages: false,
695            detect_application_packages: false,
696            detect_system_packages: false,
697            detect_packages_in_compiled: false,
698            detect_copyrights: true,
699            detect_generated: false,
700            detect_emails: true,
701            detect_urls: true,
702            max_emails: 50,
703            max_urls: 50,
704            timeout_seconds: 120.0,
705        };
706        let archive_like = b"PK\x03\x04\x14\x00\x00\x00\x08\x00MIT License\ncontact@example.com\nhttps://example.com\n";
707        let scanned = scan_file_at_relative_path("demo.whl", archive_like, &options);
708
709        assert_eq!(scanned.mime_type.as_deref(), Some("application/zip"));
710        assert_eq!(scanned.is_archive, Some(true));
711        assert!(scanned.license_detections.is_empty());
712        assert!(scanned.copyrights.is_empty());
713        assert!(scanned.emails.is_empty());
714        assert!(scanned.urls.is_empty());
715    }
716
717    #[test]
718    fn scanner_treats_typescript_sources_as_text_not_video_media() {
719        let options = TextDetectionOptions {
720            collect_info: true,
721            detect_packages: false,
722            detect_application_packages: false,
723            detect_system_packages: false,
724            detect_packages_in_compiled: false,
725            detect_copyrights: false,
726            detect_generated: false,
727            detect_emails: false,
728            detect_urls: false,
729            max_emails: 50,
730            max_urls: 50,
731            timeout_seconds: 120.0,
732        };
733        let scanned = scan_single_file("main.ts", "export const answer: number = 42;\n", &options);
734
735        assert_eq!(scanned.programming_language.as_deref(), Some("TypeScript"));
736        assert_eq!(scanned.mime_type.as_deref(), Some("text/plain"));
737        assert_eq!(
738            scanned.file_type_label.as_deref(),
739            Some("TypeScript source, UTF-8 Unicode text")
740        );
741        assert_eq!(scanned.is_text, Some(true));
742        assert_eq!(scanned.is_media, Some(false));
743        assert_eq!(scanned.is_script, Some(false));
744        assert_eq!(scanned.is_source, Some(true));
745    }
746
747    #[test]
748    fn scanner_normalizes_sparse_ts_files_away_from_video_mime() {
749        let options = TextDetectionOptions {
750            collect_info: true,
751            detect_packages: false,
752            detect_application_packages: false,
753            detect_system_packages: false,
754            detect_packages_in_compiled: false,
755            detect_copyrights: false,
756            detect_generated: false,
757            detect_emails: false,
758            detect_urls: false,
759            max_emails: 50,
760            max_urls: 50,
761            timeout_seconds: 120.0,
762        };
763        let scanned = scan_single_file("main.ts", "// comment-only TypeScript fixture\n", &options);
764
765        assert_eq!(scanned.mime_type.as_deref(), Some("text/plain"));
766        assert_eq!(
767            scanned.file_type_label.as_deref(),
768            Some("TypeScript source, UTF-8 Unicode text")
769        );
770        assert_eq!(scanned.is_text, Some(true));
771        assert_eq!(scanned.is_media, Some(false));
772        assert_eq!(scanned.is_script, Some(false));
773        assert_eq!(scanned.is_source, Some(true));
774    }
775
776    #[test]
777    fn scanner_treats_empty_files_like_scancode_info_surface() {
778        let options = TextDetectionOptions {
779            collect_info: true,
780            detect_packages: false,
781            detect_application_packages: false,
782            detect_system_packages: false,
783            detect_packages_in_compiled: false,
784            detect_copyrights: false,
785            detect_generated: false,
786            detect_emails: false,
787            detect_urls: false,
788            max_emails: 50,
789            max_urls: 50,
790            timeout_seconds: 120.0,
791        };
792        let scanned = scan_single_file("test.txt", "", &options);
793
794        assert_eq!(scanned.mime_type.as_deref(), Some("inode/x-empty"));
795        assert_eq!(scanned.file_type_label.as_deref(), Some("empty"));
796        assert_eq!(scanned.programming_language, None);
797        assert_eq!(scanned.is_binary, Some(false));
798        assert_eq!(scanned.is_text, Some(true));
799        assert_eq!(scanned.is_archive, Some(false));
800        assert_eq!(scanned.is_media, Some(false));
801        assert_eq!(scanned.is_source, Some(false));
802        assert_eq!(scanned.is_script, Some(false));
803    }
804
805    #[test]
806    fn scanner_treats_package_json_as_text_not_source() {
807        let options = TextDetectionOptions {
808            collect_info: true,
809            detect_packages: false,
810            detect_application_packages: false,
811            detect_system_packages: false,
812            detect_packages_in_compiled: false,
813            detect_copyrights: false,
814            detect_generated: false,
815            detect_emails: false,
816            detect_urls: false,
817            max_emails: 50,
818            max_urls: 50,
819            timeout_seconds: 120.0,
820        };
821        let scanned = scan_single_file("package.json", r#"{"name":"demo"}"#, &options);
822
823        assert_eq!(scanned.mime_type.as_deref(), Some("application/json"));
824        assert_eq!(scanned.file_type_label.as_deref(), Some("JSON text data"));
825        assert_eq!(scanned.programming_language, None);
826        assert_eq!(scanned.is_text, Some(true));
827        assert_eq!(scanned.is_source, Some(false));
828        assert_eq!(scanned.is_script, Some(false));
829    }
830
831    #[test]
832    fn scanner_classifies_gradle_and_nix_manifests_as_source() {
833        let options = TextDetectionOptions {
834            collect_info: true,
835            detect_packages: false,
836            detect_application_packages: false,
837            detect_system_packages: false,
838            detect_packages_in_compiled: false,
839            detect_copyrights: false,
840            detect_generated: false,
841            detect_emails: false,
842            detect_urls: false,
843            max_emails: 50,
844            max_urls: 50,
845            timeout_seconds: 120.0,
846        };
847
848        let gradle = scan_single_file("build.gradle", "plugins { id 'java' }\n", &options);
849        let nix = scan_single_file("flake.nix", "{ inputs, ... }: {}\n", &options);
850
851        assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
852        assert_eq!(gradle.mime_type.as_deref(), Some("text/plain"));
853        assert_eq!(gradle.is_source, Some(true));
854        assert_eq!(gradle.is_script, Some(false));
855
856        assert_eq!(nix.programming_language.as_deref(), Some("Nix"));
857        assert_eq!(nix.mime_type.as_deref(), Some("text/plain"));
858        assert_eq!(nix.is_source, Some(true));
859        assert_eq!(nix.is_script, Some(false));
860    }
861
862    #[test]
863    fn scanner_treats_gitmodules_as_text_not_source() {
864        let options = TextDetectionOptions {
865            collect_info: true,
866            detect_packages: false,
867            detect_application_packages: false,
868            detect_system_packages: false,
869            detect_packages_in_compiled: false,
870            detect_copyrights: false,
871            detect_generated: false,
872            detect_emails: false,
873            detect_urls: false,
874            max_emails: 50,
875            max_urls: 50,
876            timeout_seconds: 120.0,
877        };
878        let scanned = scan_file_at_relative_path(
879            ".gitmodules",
880            b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
881            &options,
882        );
883
884        assert_eq!(scanned.programming_language, None);
885        assert_eq!(
886            scanned.file_type_label.as_deref(),
887            Some("Git configuration text")
888        );
889        assert_eq!(scanned.is_text, Some(true));
890        assert_eq!(scanned.is_source, Some(false));
891        assert_eq!(scanned.is_script, Some(false));
892    }
893
894    #[test]
895    fn scanner_treats_javascript_shebang_files_as_scripts() {
896        let options = TextDetectionOptions {
897            collect_info: true,
898            detect_packages: false,
899            detect_application_packages: false,
900            detect_system_packages: false,
901            detect_packages_in_compiled: false,
902            detect_copyrights: false,
903            detect_generated: false,
904            detect_emails: false,
905            detect_urls: false,
906            max_emails: 50,
907            max_urls: 50,
908            timeout_seconds: 120.0,
909        };
910        let scanned = scan_file_at_relative_path(
911            "bin/run",
912            b"#!/usr/bin/env node\nconsole.log('hello');\n",
913            &options,
914        );
915
916        assert_eq!(scanned.programming_language.as_deref(), Some("JavaScript"));
917        assert_eq!(
918            scanned.file_type_label.as_deref(),
919            Some("javascript script, UTF-8 Unicode text executable")
920        );
921        assert_eq!(scanned.is_script, Some(true));
922        assert_eq!(scanned.is_source, Some(true));
923    }
924
925    #[test]
926    fn scanner_treats_dockerfile_as_source() {
927        let options = TextDetectionOptions {
928            collect_info: true,
929            detect_packages: false,
930            detect_application_packages: false,
931            detect_system_packages: false,
932            detect_packages_in_compiled: false,
933            detect_copyrights: false,
934            detect_generated: false,
935            detect_emails: false,
936            detect_urls: false,
937            max_emails: 50,
938            max_urls: 50,
939            timeout_seconds: 120.0,
940        };
941        let scanned = scan_single_file("Dockerfile", "FROM scratch\n", &options);
942
943        assert_eq!(scanned.programming_language.as_deref(), Some("Dockerfile"));
944        assert_eq!(
945            scanned.file_type_label.as_deref(),
946            Some("Dockerfile source, UTF-8 Unicode text")
947        );
948        assert_eq!(scanned.is_source, Some(true));
949        assert_eq!(scanned.is_script, Some(false));
950    }
951
952    #[test]
953    fn scanner_treats_makefile_as_text_not_source() {
954        let options = TextDetectionOptions {
955            collect_info: true,
956            detect_packages: false,
957            detect_application_packages: false,
958            detect_system_packages: false,
959            detect_packages_in_compiled: false,
960            detect_copyrights: false,
961            detect_generated: false,
962            detect_emails: false,
963            detect_urls: false,
964            max_emails: 50,
965            max_urls: 50,
966            timeout_seconds: 120.0,
967        };
968        let scanned = scan_single_file("Makefile", "all:\n\techo hi\n", &options);
969
970        assert_eq!(scanned.programming_language, None);
971        assert_eq!(
972            scanned.file_type_label.as_deref(),
973            Some("UTF-8 Unicode text")
974        );
975        assert_eq!(scanned.is_text, Some(true));
976        assert_eq!(scanned.is_source, Some(false));
977        assert_eq!(scanned.is_script, Some(false));
978    }
979
980    #[test]
981    fn scanner_omits_info_surface_when_disabled() {
982        let options = TextDetectionOptions {
983            collect_info: false,
984            detect_packages: false,
985            detect_application_packages: false,
986            detect_system_packages: false,
987            detect_packages_in_compiled: false,
988            detect_copyrights: false,
989            detect_generated: false,
990            detect_emails: false,
991            detect_urls: false,
992            max_emails: 50,
993            max_urls: 50,
994            timeout_seconds: 120.0,
995        };
996        let scanned = scan_single_file(
997            "script.py",
998            "#!/usr/bin/env python3\nprint(\"hello\")\n",
999            &options,
1000        );
1001
1002        assert!(scanned.sha1.is_none());
1003        assert!(scanned.md5.is_none());
1004        assert!(scanned.sha256.is_none());
1005        assert!(scanned.sha1_git.is_none());
1006        assert!(scanned.mime_type.is_none());
1007        assert!(scanned.date.is_none());
1008        assert!(scanned.programming_language.is_none());
1009        assert!(scanned.is_binary.is_none());
1010        assert!(scanned.is_text.is_none());
1011        assert!(scanned.is_archive.is_none());
1012        assert!(scanned.is_media.is_none());
1013        assert!(scanned.is_script.is_none());
1014        assert!(scanned.is_source.is_none());
1015    }
1016
1017    #[test]
1018    fn scanner_skips_package_parsing_when_disabled() {
1019        let options = TextDetectionOptions {
1020            collect_info: false,
1021            detect_packages: false,
1022            detect_application_packages: false,
1023            detect_system_packages: false,
1024            detect_packages_in_compiled: false,
1025            detect_copyrights: false,
1026            detect_generated: false,
1027            detect_emails: false,
1028            detect_urls: false,
1029            max_emails: 50,
1030            max_urls: 50,
1031            timeout_seconds: 120.0,
1032        };
1033        let scanned = scan_single_file(
1034            "package.json",
1035            r#"{"name":"demo","version":"1.0.0"}"#,
1036            &options,
1037        );
1038
1039        assert!(
1040            scanned.package_data.is_empty(),
1041            "package_data: {:#?}",
1042            scanned.package_data
1043        );
1044    }
1045
1046    #[test]
1047    fn scanner_parses_package_manifests_when_enabled() {
1048        let options = TextDetectionOptions {
1049            collect_info: false,
1050            detect_packages: true,
1051            detect_application_packages: true,
1052            detect_system_packages: false,
1053            detect_packages_in_compiled: false,
1054            detect_copyrights: false,
1055            detect_generated: false,
1056            detect_emails: false,
1057            detect_urls: false,
1058            max_emails: 50,
1059            max_urls: 50,
1060            timeout_seconds: 120.0,
1061        };
1062        let scanned = scan_single_file(
1063            "package.json",
1064            r#"{"name":"demo","version":"1.0.0"}"#,
1065            &options,
1066        );
1067
1068        assert_eq!(
1069            scanned.package_data.len(),
1070            1,
1071            "package_data: {:#?}",
1072            scanned.package_data
1073        );
1074    }
1075
1076    #[test]
1077    fn scanner_parses_oversized_rpm_in_package_only_mode_without_size_warning() {
1078        let temp_dir = TempDir::new().expect("create temp dir");
1079        let file_path = build_sparse_oversized_rpm(&temp_dir, "oversized-demo");
1080
1081        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1082        let collected = collect_paths(temp_dir.path(), 0, &[]);
1083        let result = process_collected(
1084            &collected,
1085            progress,
1086            None,
1087            LicenseScanOptions::default(),
1088            &TextDetectionOptions {
1089                collect_info: false,
1090                detect_packages: true,
1091                detect_application_packages: true,
1092                detect_system_packages: false,
1093                detect_packages_in_compiled: false,
1094                detect_copyrights: false,
1095                detect_generated: false,
1096                detect_emails: false,
1097                detect_urls: false,
1098                max_emails: 50,
1099                max_urls: 50,
1100                timeout_seconds: 120.0,
1101            },
1102        );
1103
1104        let scanned = result
1105            .files
1106            .into_iter()
1107            .find(|entry| {
1108                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1109            })
1110            .expect("scanned file entry");
1111
1112        assert!(
1113            scanned.scan_errors.is_empty(),
1114            "scan_errors: {:#?}",
1115            scanned.scan_errors
1116        );
1117        assert_eq!(
1118            scanned.package_data.len(),
1119            1,
1120            "package_data: {:#?}",
1121            scanned.package_data
1122        );
1123        assert_eq!(
1124            scanned.package_data[0].datasource_id,
1125            Some(DatasourceId::RpmArchive)
1126        );
1127        assert_eq!(
1128            scanned.package_data[0].name.as_deref(),
1129            Some("oversized-demo")
1130        );
1131        assert_eq!(scanned.package_data[0].version.as_deref(), Some("1.0-1"));
1132    }
1133
1134    #[test]
1135    fn scanner_parses_oversized_rpm_with_info_without_timeout_or_size_warning() {
1136        let temp_dir = TempDir::new().expect("create temp dir");
1137        let file_path = build_sparse_oversized_rpm(&temp_dir, "oversized-info-demo");
1138
1139        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1140        let collected = collect_paths(temp_dir.path(), 0, &[]);
1141        let result = process_collected(
1142            &collected,
1143            progress,
1144            None,
1145            LicenseScanOptions::default(),
1146            &TextDetectionOptions {
1147                collect_info: true,
1148                detect_packages: true,
1149                detect_application_packages: true,
1150                detect_system_packages: false,
1151                detect_packages_in_compiled: false,
1152                detect_copyrights: false,
1153                detect_generated: false,
1154                detect_emails: false,
1155                detect_urls: false,
1156                max_emails: 50,
1157                max_urls: 50,
1158                timeout_seconds: 120.0,
1159            },
1160        );
1161
1162        let scanned = result
1163            .files
1164            .into_iter()
1165            .find(|entry| {
1166                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1167            })
1168            .expect("scanned file entry");
1169
1170        assert!(
1171            scanned.scan_errors.is_empty(),
1172            "scan_errors: {:#?}",
1173            scanned.scan_errors
1174        );
1175        assert_eq!(
1176            scanned.package_data.len(),
1177            1,
1178            "package_data: {:#?}",
1179            scanned.package_data
1180        );
1181        assert_eq!(
1182            scanned.package_data[0].datasource_id,
1183            Some(DatasourceId::RpmArchive)
1184        );
1185        assert_eq!(
1186            scanned.package_data[0].name.as_deref(),
1187            Some("oversized-info-demo")
1188        );
1189        assert!(scanned.sha1.is_some());
1190        assert!(scanned.md5.is_some());
1191        assert!(scanned.sha256.is_some());
1192        assert!(scanned.sha1_git.is_some());
1193        assert_eq!(scanned.mime_type.as_deref(), Some("application/x-rpm"));
1194        assert_eq!(scanned.file_type_label.as_deref(), Some("RPM package"));
1195        assert_eq!(scanned.is_binary, Some(true));
1196        assert_eq!(scanned.is_text, Some(false));
1197        assert_eq!(scanned.is_archive, Some(true));
1198    }
1199
1200    #[test]
1201    fn scanner_parses_oversized_pack_rpm_in_package_only_mode_without_size_warning() {
1202        let temp_dir = TempDir::new().expect("create temp dir");
1203        let file_path = build_sparse_oversized_pack_rpm(&temp_dir, "oversized-pack-demo");
1204
1205        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1206        let collected = collect_paths(temp_dir.path(), 0, &[]);
1207        let result = process_collected(
1208            &collected,
1209            progress,
1210            None,
1211            LicenseScanOptions::default(),
1212            &TextDetectionOptions {
1213                collect_info: false,
1214                detect_packages: true,
1215                detect_application_packages: true,
1216                detect_system_packages: false,
1217                detect_packages_in_compiled: false,
1218                detect_copyrights: false,
1219                detect_generated: false,
1220                detect_emails: false,
1221                detect_urls: false,
1222                max_emails: 50,
1223                max_urls: 50,
1224                timeout_seconds: 120.0,
1225            },
1226        );
1227
1228        let scanned = result
1229            .files
1230            .into_iter()
1231            .find(|entry| {
1232                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1233            })
1234            .expect("scanned file entry");
1235
1236        assert!(
1237            scanned.scan_errors.is_empty(),
1238            "scan_errors: {:#?}",
1239            scanned.scan_errors
1240        );
1241        assert_eq!(
1242            scanned.package_data.len(),
1243            1,
1244            "package_data: {:#?}",
1245            scanned.package_data
1246        );
1247        assert_eq!(
1248            scanned.package_data[0].datasource_id,
1249            Some(DatasourceId::RpmArchive)
1250        );
1251        assert_eq!(
1252            scanned.package_data[0].name.as_deref(),
1253            Some("oversized-pack-demo")
1254        );
1255    }
1256
1257    #[test]
1258    fn scanner_parses_oversized_pack_rpm_with_info_without_timeout_or_size_warning() {
1259        let temp_dir = TempDir::new().expect("create temp dir");
1260        let file_path = build_sparse_oversized_pack_rpm(&temp_dir, "oversized-pack-info-demo");
1261
1262        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1263        let collected = collect_paths(temp_dir.path(), 0, &[]);
1264        let result = process_collected(
1265            &collected,
1266            progress,
1267            None,
1268            LicenseScanOptions::default(),
1269            &TextDetectionOptions {
1270                collect_info: true,
1271                detect_packages: true,
1272                detect_application_packages: true,
1273                detect_system_packages: false,
1274                detect_packages_in_compiled: false,
1275                detect_copyrights: false,
1276                detect_generated: false,
1277                detect_emails: false,
1278                detect_urls: false,
1279                max_emails: 50,
1280                max_urls: 50,
1281                timeout_seconds: 120.0,
1282            },
1283        );
1284
1285        let scanned = result
1286            .files
1287            .into_iter()
1288            .find(|entry| {
1289                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1290            })
1291            .expect("scanned file entry");
1292
1293        assert!(
1294            scanned.scan_errors.is_empty(),
1295            "scan_errors: {:#?}",
1296            scanned.scan_errors
1297        );
1298        assert_eq!(
1299            scanned.package_data.len(),
1300            1,
1301            "package_data: {:#?}",
1302            scanned.package_data
1303        );
1304        assert_eq!(
1305            scanned.package_data[0].datasource_id,
1306            Some(DatasourceId::RpmArchive)
1307        );
1308        assert_eq!(
1309            scanned.package_data[0].name.as_deref(),
1310            Some("oversized-pack-info-demo")
1311        );
1312        assert!(scanned.sha1.is_some());
1313        assert!(scanned.md5.is_some());
1314        assert!(scanned.sha256.is_some());
1315        assert!(scanned.sha1_git.is_some());
1316        assert_eq!(scanned.mime_type.as_deref(), Some("application/x-rpm"));
1317        assert_eq!(scanned.file_type_label.as_deref(), Some("RPM package"));
1318        assert_eq!(scanned.is_binary, Some(true));
1319        assert_eq!(scanned.is_text, Some(false));
1320        assert_eq!(scanned.is_archive, Some(true));
1321    }
1322
1323    #[test]
1324    fn scanner_skips_application_packages_when_only_system_packages_enabled() {
1325        let options = TextDetectionOptions {
1326            collect_info: false,
1327            detect_packages: true,
1328            detect_application_packages: false,
1329            detect_system_packages: true,
1330            detect_packages_in_compiled: false,
1331            detect_copyrights: false,
1332            detect_generated: false,
1333            detect_emails: false,
1334            detect_urls: false,
1335            max_emails: 50,
1336            max_urls: 50,
1337            timeout_seconds: 120.0,
1338        };
1339        let scanned = scan_single_file(
1340            "package.json",
1341            r#"{"name":"demo","version":"1.0.0"}"#,
1342            &options,
1343        );
1344
1345        assert!(
1346            scanned.package_data.is_empty(),
1347            "package_data: {:#?}",
1348            scanned.package_data
1349        );
1350    }
1351
1352    #[test]
1353    fn scanner_parses_system_package_files_when_enabled() {
1354        let options = TextDetectionOptions {
1355            collect_info: false,
1356            detect_packages: true,
1357            detect_application_packages: false,
1358            detect_system_packages: true,
1359            detect_packages_in_compiled: false,
1360            detect_copyrights: false,
1361            detect_generated: false,
1362            detect_emails: false,
1363            detect_urls: false,
1364            max_emails: 50,
1365            max_urls: 50,
1366            timeout_seconds: 120.0,
1367        };
1368        let scanned = scan_file_at_relative_path(
1369            "var/lib/dpkg/status",
1370            b"Package: demo\nVersion: 1.0\nArchitecture: all\nDescription: demo package\n\n",
1371            &options,
1372        );
1373
1374        assert!(
1375            !scanned.package_data.is_empty(),
1376            "package_data: {:#?}",
1377            scanned.package_data
1378        );
1379    }
1380
1381    #[test]
1382    fn scanner_only_parses_compiled_packages_when_package_in_compiled_is_enabled() {
1383        if std::process::Command::new("go")
1384            .arg("version")
1385            .status()
1386            .is_err()
1387        {
1388            return;
1389        }
1390
1391        let temp_dir = TempDir::new().expect("create temp dir");
1392        fs::write(
1393            temp_dir.path().join("go.mod"),
1394            "module example.com/demo\n\ngo 1.23.0\n",
1395        )
1396        .expect("write go.mod");
1397        fs::write(
1398            temp_dir.path().join("main.go"),
1399            "package main\nfunc main() {}\n",
1400        )
1401        .expect("write main.go");
1402        let file_path = temp_dir.path().join("demo");
1403        let status = std::process::Command::new("go")
1404            .current_dir(temp_dir.path())
1405            .args(["build", "-o"])
1406            .arg(&file_path)
1407            .status()
1408            .expect("run go build");
1409        assert!(status.success());
1410
1411        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1412        let collected = collect_paths(temp_dir.path(), 0, &[]);
1413
1414        let without_compiled = process_collected(
1415            &collected,
1416            Arc::clone(&progress),
1417            None,
1418            LicenseScanOptions::default(),
1419            &TextDetectionOptions {
1420                collect_info: false,
1421                detect_packages: true,
1422                detect_application_packages: true,
1423                detect_system_packages: false,
1424                detect_packages_in_compiled: false,
1425                detect_copyrights: false,
1426                detect_generated: false,
1427                detect_emails: false,
1428                detect_urls: false,
1429                max_emails: 50,
1430                max_urls: 50,
1431                timeout_seconds: 120.0,
1432            },
1433        );
1434        let with_compiled = process_collected(
1435            &collected,
1436            progress,
1437            None,
1438            LicenseScanOptions::default(),
1439            &TextDetectionOptions {
1440                collect_info: false,
1441                detect_packages: true,
1442                detect_application_packages: true,
1443                detect_system_packages: false,
1444                detect_packages_in_compiled: true,
1445                detect_copyrights: false,
1446                detect_generated: false,
1447                detect_emails: false,
1448                detect_urls: false,
1449                max_emails: 50,
1450                max_urls: 50,
1451                timeout_seconds: 120.0,
1452            },
1453        );
1454
1455        let without_compiled = without_compiled
1456            .files
1457            .into_iter()
1458            .find(|entry| entry.file_type == FileType::File && entry.path.ends_with("/demo"))
1459            .expect("compiled artifact present");
1460        let with_compiled = with_compiled
1461            .files
1462            .into_iter()
1463            .find(|entry| entry.file_type == FileType::File && entry.path.ends_with("/demo"))
1464            .expect("compiled artifact present");
1465
1466        assert!(
1467            without_compiled.package_data.is_empty(),
1468            "package_data: {:#?}",
1469            without_compiled.package_data
1470        );
1471        assert!(!with_compiled.package_data.is_empty());
1472    }
1473
1474    #[test]
1475    fn scanner_parses_windows_executable_packages_under_normal_package_scan() {
1476        let temp_dir = TempDir::new().expect("create temp dir");
1477        let file_path = temp_dir.path().join("libiconv2.dll");
1478        let fixture = fs::read("testdata/compiled-binary-golden/win_pe/libiconv2.dll")
1479            .expect("read PE fixture");
1480        fs::write(&file_path, fixture).expect("write PE fixture");
1481
1482        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1483        let collected = collect_paths(temp_dir.path(), 0, &[]);
1484
1485        let without_package = process_collected(
1486            &collected,
1487            Arc::clone(&progress),
1488            None,
1489            LicenseScanOptions::default(),
1490            &TextDetectionOptions {
1491                collect_info: false,
1492                detect_packages: false,
1493                detect_application_packages: false,
1494                detect_system_packages: false,
1495                detect_packages_in_compiled: false,
1496                detect_copyrights: false,
1497                detect_generated: false,
1498                detect_emails: false,
1499                detect_urls: false,
1500                max_emails: 50,
1501                max_urls: 50,
1502                timeout_seconds: 120.0,
1503            },
1504        );
1505        let with_package = process_collected(
1506            &collected,
1507            progress,
1508            None,
1509            LicenseScanOptions::default(),
1510            &TextDetectionOptions {
1511                collect_info: false,
1512                detect_packages: true,
1513                detect_application_packages: true,
1514                detect_system_packages: false,
1515                detect_packages_in_compiled: false,
1516                detect_copyrights: false,
1517                detect_generated: false,
1518                detect_emails: false,
1519                detect_urls: false,
1520                max_emails: 50,
1521                max_urls: 50,
1522                timeout_seconds: 120.0,
1523            },
1524        );
1525
1526        let without_package = without_package
1527            .files
1528            .into_iter()
1529            .find(|entry| {
1530                entry.file_type == FileType::File && entry.path.ends_with("/libiconv2.dll")
1531            })
1532            .expect("compiled artifact present");
1533        let with_package = with_package
1534            .files
1535            .into_iter()
1536            .find(|entry| {
1537                entry.file_type == FileType::File && entry.path.ends_with("/libiconv2.dll")
1538            })
1539            .expect("compiled artifact present");
1540
1541        assert!(without_package.package_data.is_empty());
1542        assert_eq!(with_package.package_data.len(), 1);
1543        assert_eq!(
1544            with_package.package_data[0].package_type,
1545            Some(FilePackageType::Winexe)
1546        );
1547        assert_eq!(
1548            with_package.package_data[0].datasource_id,
1549            Some(DatasourceId::WindowsExecutable)
1550        );
1551    }
1552
1553    #[test]
1554    fn scanner_keeps_nsis_and_windows_executable_package_data_together() {
1555        let temp_dir = TempDir::new().expect("create temp dir");
1556        let file_path = temp_dir.path().join("nsis-with-version.exe");
1557        let mut fixture = fs::read("testdata/compiled-binary-golden/win_pe/libiconv2.dll")
1558            .expect("read PE fixture");
1559        if fixture.len() < 70_000 {
1560            fixture.resize(70_000, 0);
1561        }
1562        fixture.extend_from_slice(b"Nullsoft.NSIS.exehead");
1563        fs::write(&file_path, fixture).expect("write synthetic NSIS PE fixture");
1564
1565        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1566        let collected = collect_paths(temp_dir.path(), 0, &[]);
1567        let result = process_collected(
1568            &collected,
1569            progress,
1570            None,
1571            LicenseScanOptions::default(),
1572            &TextDetectionOptions {
1573                collect_info: false,
1574                detect_packages: true,
1575                detect_application_packages: true,
1576                detect_system_packages: false,
1577                detect_packages_in_compiled: false,
1578                detect_copyrights: false,
1579                detect_generated: false,
1580                detect_emails: false,
1581                detect_urls: false,
1582                max_emails: 50,
1583                max_urls: 50,
1584                timeout_seconds: 120.0,
1585            },
1586        );
1587
1588        let scanned = result
1589            .files
1590            .into_iter()
1591            .find(|entry| {
1592                entry.file_type == FileType::File && entry.path.ends_with("/nsis-with-version.exe")
1593            })
1594            .expect("compiled artifact present");
1595
1596        assert_eq!(
1597            scanned.package_data.len(),
1598            2,
1599            "package_data: {:#?}",
1600            scanned.package_data
1601        );
1602        assert!(
1603            scanned
1604                .package_data
1605                .iter()
1606                .any(|pkg| pkg.datasource_id == Some(DatasourceId::NsisInstaller))
1607        );
1608        assert!(
1609            scanned
1610                .package_data
1611                .iter()
1612                .any(|pkg| pkg.datasource_id == Some(DatasourceId::WindowsExecutable))
1613        );
1614    }
1615
1616    #[test]
1617    fn scanner_detects_license_from_font_metadata() {
1618        let temp_dir = TempDir::new().expect("create temp dir");
1619        let file_path = temp_dir.path().join("Lato-Bold.ttf");
1620        let fixture = fs::read("testdata/font-fixtures/Lato-Bold.ttf").expect("read font fixture");
1621        fs::write(&file_path, fixture).expect("write font fixture");
1622
1623        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1624        let collected = collect_paths(temp_dir.path(), 0, &[]);
1625        let engine =
1626            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
1627        let result = process_collected(
1628            &collected,
1629            progress,
1630            Some(engine),
1631            LicenseScanOptions::default(),
1632            &TextDetectionOptions::default(),
1633        );
1634        let scanned = result
1635            .files
1636            .into_iter()
1637            .find(|entry| {
1638                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1639            })
1640            .expect("scanned file entry");
1641
1642        assert!(
1643            scanned.license_expression.is_some(),
1644            "license detections: {:#?}",
1645            scanned.license_detections
1646        );
1647        assert!(
1648            scanned
1649                .license_expression
1650                .as_deref()
1651                .is_some_and(
1652                    |expression| expression.contains("OFL-1.1") || expression.contains("ofl-1.1")
1653                ),
1654            "license expression: {:?}",
1655            scanned.license_expression
1656        );
1657    }
1658
1659    #[test]
1660    fn scanner_detects_license_from_windows_executable_metadata() {
1661        let temp_dir = TempDir::new().expect("create temp dir");
1662        let file_path = temp_dir.path().join("libiconv2.dll");
1663        let fixture = fs::read("testdata/compiled-binary-golden/win_pe/libiconv2.dll")
1664            .expect("read PE fixture");
1665        fs::write(&file_path, fixture).expect("write PE fixture");
1666
1667        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1668        let collected = collect_paths(temp_dir.path(), 0, &[]);
1669        let engine =
1670            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
1671        let result = process_collected(
1672            &collected,
1673            progress,
1674            Some(engine),
1675            LicenseScanOptions::default(),
1676            &TextDetectionOptions::default(),
1677        );
1678        let scanned = result
1679            .files
1680            .into_iter()
1681            .find(|entry| {
1682                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1683            })
1684            .expect("scanned file entry");
1685
1686        assert!(
1687            scanned.license_expression.is_some(),
1688            "license detections: {:#?}",
1689            scanned.license_detections
1690        );
1691        assert!(
1692            scanned
1693                .license_expression
1694                .as_deref()
1695                .is_some_and(|expression| {
1696                    expression.contains("lgpl") || expression.contains("LGPL")
1697                }),
1698            "license expression: {:?}",
1699            scanned.license_expression
1700        );
1701    }
1702
1703    #[test]
1704    fn scanner_detects_license_from_windows_executable_security_notice() {
1705        fn synthetic_pe_with_security_notice(notice: &str) -> Vec<u8> {
1706            let cert_payload = notice
1707                .encode_utf16()
1708                .flat_map(|unit| unit.to_le_bytes())
1709                .collect::<Vec<_>>();
1710            let cert_len = (8 + cert_payload.len()) as u32;
1711            let mut cert = Vec::new();
1712            cert.extend_from_slice(&cert_len.to_le_bytes());
1713            cert.extend_from_slice(&0x0200u16.to_le_bytes());
1714            cert.extend_from_slice(&0x0002u16.to_le_bytes());
1715            cert.extend_from_slice(&cert_payload);
1716            while !cert.len().is_multiple_of(8) {
1717                cert.push(0);
1718            }
1719
1720            let offset = 0x200usize;
1721            let size = cert.len();
1722            let optional_header_size = 224usize;
1723            let pe_header_offset = 0x80usize;
1724            let nt_headers_offset = pe_header_offset + 4;
1725            let optional_header_offset = nt_headers_offset + 20;
1726            let data_directory_offset = optional_header_offset + 96;
1727            let security_directory_offset =
1728                data_directory_offset + pe::IMAGE_DIRECTORY_ENTRY_SECURITY * 8;
1729            let total_len = offset + size;
1730            let mut bytes = vec![0u8; total_len];
1731
1732            bytes[0..2].copy_from_slice(b"MZ");
1733            bytes[0x3c..0x40].copy_from_slice(&(pe_header_offset as u32).to_le_bytes());
1734            bytes[pe_header_offset..pe_header_offset + 4].copy_from_slice(b"PE\0\0");
1735
1736            bytes[nt_headers_offset..nt_headers_offset + 2]
1737                .copy_from_slice(&0x014cu16.to_le_bytes());
1738            bytes[nt_headers_offset + 16..nt_headers_offset + 18]
1739                .copy_from_slice(&(optional_header_size as u16).to_le_bytes());
1740
1741            bytes[optional_header_offset..optional_header_offset + 2]
1742                .copy_from_slice(&0x010bu16.to_le_bytes());
1743            bytes[optional_header_offset + 92..optional_header_offset + 96]
1744                .copy_from_slice(&16u32.to_le_bytes());
1745            bytes[security_directory_offset..security_directory_offset + 4]
1746                .copy_from_slice(&(offset as u32).to_le_bytes());
1747            bytes[security_directory_offset + 4..security_directory_offset + 8]
1748                .copy_from_slice(&(size as u32).to_le_bytes());
1749            bytes[offset..offset + size].copy_from_slice(&cert);
1750
1751            bytes
1752        }
1753
1754        let temp_dir = TempDir::new().expect("create temp dir");
1755        let file_path = temp_dir.path().join("signed.dll");
1756        let fixture = synthetic_pe_with_security_notice(
1757            "use of this Certificate constitutes acceptance of the DigiCert CP/CPS and the Relying Party Agreement which limit liability and are incorporated herein by reference.",
1758        );
1759        fs::write(&file_path, fixture).expect("write PE fixture");
1760
1761        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1762        let collected = collect_paths(temp_dir.path(), 0, &[]);
1763        let engine =
1764            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
1765        let result = process_collected(
1766            &collected,
1767            progress,
1768            Some(engine),
1769            LicenseScanOptions::default(),
1770            &TextDetectionOptions::default(),
1771        );
1772        let scanned = result
1773            .files
1774            .into_iter()
1775            .find(|entry| {
1776                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1777            })
1778            .expect("scanned file entry");
1779
1780        assert!(
1781            scanned
1782                .license_expression
1783                .as_deref()
1784                .is_some_and(|expression| expression.contains("proprietary-license")),
1785            "license expression: {:?}, detections: {:#?}",
1786            scanned.license_expression,
1787            scanned.license_detections
1788        );
1789    }
1790
1791    #[test]
1792    fn scanner_detects_cc_by_license_from_markdown_comment_banner() {
1793        let scanned = scan_single_file_with_license_engine(
1794            "navbar.md",
1795            "<!-- Documentation licensed under CC BY 4.0 -->\n<!-- License available at https://creativecommons.org/licenses/by/4.0/ -->\n",
1796            &TextDetectionOptions::default(),
1797        );
1798
1799        assert!(
1800            scanned
1801                .license_expression
1802                .as_deref()
1803                .is_some_and(|expression| {
1804                    expression.contains("cc-by-4.0") || expression.contains("CC-BY-4.0")
1805                }),
1806            "license expression: {:?}",
1807            scanned.license_expression
1808        );
1809    }
1810
1811    #[test]
1812    fn scanner_detects_mit_license_from_shields_badge_markdown() {
1813        let scanned = scan_single_file_with_license_engine(
1814            "README.md",
1815            "[![](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)\n",
1816            &TextDetectionOptions::default(),
1817        );
1818
1819        assert!(
1820            scanned
1821                .license_expression
1822                .as_deref()
1823                .is_some_and(|expression| {
1824                    expression.contains("mit") || expression.contains("MIT")
1825                }),
1826            "license expression: {:?}",
1827            scanned.license_expression
1828        );
1829    }
1830
1831    #[test]
1832    fn scanner_detects_apache_license_from_markdown_readme_phrase() {
1833        let scanned = scan_single_file_with_license_engine(
1834            "README.md",
1835            "This crate is distributed under the terms of the Apache License (Version 2.0).\n",
1836            &TextDetectionOptions::default(),
1837        );
1838
1839        assert!(
1840            scanned
1841                .license_expression
1842                .as_deref()
1843                .is_some_and(|expression| {
1844                    expression.contains("apache-2.0") || expression.contains("Apache-2.0")
1845                }),
1846            "license expression: {:?}",
1847            scanned.license_expression
1848        );
1849    }
1850
1851    #[test]
1852    fn scanner_sets_is_source_only_when_info_enabled() {
1853        let without_info = TextDetectionOptions {
1854            collect_info: false,
1855            detect_packages: false,
1856            detect_application_packages: false,
1857            detect_system_packages: false,
1858            detect_packages_in_compiled: false,
1859            detect_copyrights: false,
1860            detect_generated: false,
1861            detect_emails: false,
1862            detect_urls: false,
1863            max_emails: 50,
1864            max_urls: 50,
1865            timeout_seconds: 120.0,
1866        };
1867        let with_info = TextDetectionOptions {
1868            collect_info: true,
1869            ..without_info.clone()
1870        };
1871
1872        let scanned_without_info = scan_single_file("main.rs", "fn main() {}\n", &without_info);
1873        let scanned_with_info = scan_single_file("main.rs", "fn main() {}\n", &with_info);
1874
1875        assert_eq!(scanned_without_info.is_source, None);
1876        assert_eq!(scanned_with_info.is_source, Some(true));
1877    }
1878
1879    #[test]
1880    fn directory_omits_info_fields_when_info_disabled() {
1881        let temp_dir = TempDir::new().expect("create temp dir");
1882        fs::create_dir_all(temp_dir.path().join("nested")).expect("create nested dir");
1883
1884        let collected = collect_paths(temp_dir.path(), 0, &[]);
1885        let result = process_collected(
1886            &collected,
1887            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1888            None,
1889            LicenseScanOptions::default(),
1890            &TextDetectionOptions {
1891                collect_info: false,
1892                detect_packages: false,
1893                detect_application_packages: false,
1894                detect_system_packages: false,
1895                detect_packages_in_compiled: false,
1896                detect_copyrights: false,
1897                detect_generated: false,
1898                detect_emails: false,
1899                detect_urls: false,
1900                max_emails: 50,
1901                max_urls: 50,
1902                timeout_seconds: 120.0,
1903            },
1904        );
1905
1906        let directory = result
1907            .files
1908            .into_iter()
1909            .find(|entry| entry.file_type == FileType::Directory && entry.path.ends_with("nested"))
1910            .expect("directory entry");
1911
1912        assert!(directory.date.is_none());
1913        assert!(directory.file_type_label.is_none());
1914        assert!(directory.is_binary.is_none());
1915        assert!(directory.is_text.is_none());
1916        assert!(directory.is_archive.is_none());
1917        assert!(directory.is_media.is_none());
1918        assert!(directory.is_source.is_none());
1919        assert!(directory.is_script.is_none());
1920    }
1921
1922    #[test]
1923    fn directory_includes_info_fields_when_info_enabled() {
1924        let temp_dir = TempDir::new().expect("create temp dir");
1925        fs::create_dir_all(temp_dir.path().join("nested")).expect("create nested dir");
1926
1927        let collected = collect_paths(temp_dir.path(), 0, &[]);
1928        let result = process_collected(
1929            &collected,
1930            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1931            None,
1932            LicenseScanOptions::default(),
1933            &TextDetectionOptions {
1934                collect_info: true,
1935                detect_packages: false,
1936                detect_application_packages: false,
1937                detect_system_packages: false,
1938                detect_packages_in_compiled: false,
1939                detect_copyrights: false,
1940                detect_generated: false,
1941                detect_emails: false,
1942                detect_urls: false,
1943                max_emails: 50,
1944                max_urls: 50,
1945                timeout_seconds: 120.0,
1946            },
1947        );
1948
1949        let directory = result
1950            .files
1951            .into_iter()
1952            .find(|entry| entry.file_type == FileType::Directory && entry.path.ends_with("nested"))
1953            .expect("directory entry");
1954
1955        assert!(directory.date.is_none());
1956        assert!(directory.file_type_label.is_none());
1957        assert_eq!(directory.is_binary, Some(false));
1958        assert_eq!(directory.is_text, Some(false));
1959        assert_eq!(directory.is_archive, Some(false));
1960        assert_eq!(directory.is_media, Some(false));
1961        assert_eq!(directory.is_source, Some(false));
1962        assert_eq!(directory.is_script, Some(false));
1963        assert_eq!(directory.files_count, Some(0));
1964        assert_eq!(directory.dirs_count, Some(0));
1965        assert_eq!(directory.size_count, Some(0));
1966    }
1967
1968    #[test]
1969    fn collect_paths_includes_root_directory_entry() {
1970        let temp_dir = TempDir::new().expect("create temp dir");
1971        fs::create_dir_all(temp_dir.path().join("src")).expect("create nested dir");
1972        fs::write(temp_dir.path().join("src").join("main.rs"), "fn main() {}")
1973            .expect("write nested file");
1974
1975        let collected = collect_paths(temp_dir.path(), 0, &[]);
1976
1977        assert!(
1978            collected
1979                .directories
1980                .iter()
1981                .any(|(path, _)| path == temp_dir.path())
1982        );
1983    }
1984
1985    #[test]
1986    fn collect_paths_supports_single_file_input() {
1987        let temp_dir = TempDir::new().expect("create temp dir");
1988        let file_path = temp_dir.path().join("main.rs");
1989        fs::write(&file_path, "fn main() {}\n").expect("write file");
1990
1991        let collected = collect_paths(&file_path, 0, &[]);
1992
1993        assert_eq!(collected.files.len(), 1);
1994        assert!(collected.directories.is_empty());
1995        assert_eq!(collected.files[0].0, file_path);
1996    }
1997
1998    #[cfg(unix)]
1999    #[test]
2000    fn collect_selected_paths_does_not_walk_unselected_siblings() {
2001        use std::os::unix::fs::PermissionsExt;
2002
2003        let temp_dir = TempDir::new().expect("create temp dir");
2004        let root = temp_dir.path();
2005        fs::create_dir_all(root.join("selected/docs")).expect("create selected dir");
2006        fs::create_dir_all(root.join("blocked/secret")).expect("create blocked dir");
2007        fs::write(root.join("selected/docs/guide.md"), "# guide\n").expect("write guide");
2008
2009        let blocked = root.join("blocked");
2010        let mut perms = fs::metadata(&blocked)
2011            .expect("blocked metadata")
2012            .permissions();
2013        perms.set_mode(0o000);
2014        fs::set_permissions(&blocked, perms).expect("remove blocked permissions");
2015
2016        let collected = collect_selected_paths(
2017            root,
2018            &[CollectionFrontier {
2019                path: PathBuf::from("selected"),
2020                recurse: true,
2021            }],
2022            0,
2023            &[],
2024        );
2025
2026        let mut restore = fs::metadata(&blocked)
2027            .expect("blocked metadata")
2028            .permissions();
2029        restore.set_mode(0o755);
2030        fs::set_permissions(&blocked, restore).expect("restore blocked permissions");
2031
2032        assert!(
2033            collected.collection_errors.is_empty(),
2034            "{:#?}",
2035            collected.collection_errors
2036        );
2037        assert!(
2038            collected
2039                .files
2040                .iter()
2041                .any(|(path, _)| path == &root.join("selected/docs/guide.md"))
2042        );
2043        assert!(
2044            collected
2045                .files
2046                .iter()
2047                .all(|(path, _): &(PathBuf, fs::Metadata)| !path.starts_with(&blocked))
2048        );
2049    }
2050
2051    #[test]
2052    fn collect_selected_paths_respects_excluded_ancestor_directories() {
2053        let temp_dir = TempDir::new().expect("create temp dir");
2054        let root = temp_dir.path();
2055        fs::create_dir_all(root.join(".git")).expect("create git dir");
2056        fs::write(
2057            root.join(".git/config"),
2058            "[core]\nrepositoryformatversion = 0\n",
2059        )
2060        .expect("write git config");
2061
2062        let exclude_patterns =
2063            build_collection_exclude_patterns(root, &root.join(".provenant-cache"));
2064        let collected = collect_selected_paths(
2065            root,
2066            &[CollectionFrontier {
2067                path: PathBuf::from(".git/config"),
2068                recurse: false,
2069            }],
2070            0,
2071            &exclude_patterns,
2072        );
2073
2074        assert!(collected.files.is_empty());
2075        assert!(collected.directories.iter().all(|(path, _)| path == root));
2076        assert_eq!(collected.excluded_count, 1);
2077    }
2078
2079    #[test]
2080    fn process_collected_with_memory_limit_preserves_results_when_spilling() {
2081        let temp_dir = TempDir::new().expect("create temp dir");
2082        fs::write(temp_dir.path().join("a.txt"), "hello").expect("write first file");
2083        fs::write(temp_dir.path().join("b.txt"), "world").expect("write second file");
2084
2085        let collected = collect_paths(temp_dir.path(), 0, &[]);
2086        let result = process_collected_with_memory_limit(
2087            &collected,
2088            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
2089            None,
2090            LicenseScanOptions::default(),
2091            &TextDetectionOptions {
2092                collect_info: false,
2093                detect_packages: false,
2094                detect_application_packages: false,
2095                detect_system_packages: false,
2096                detect_packages_in_compiled: false,
2097                detect_copyrights: false,
2098                detect_generated: false,
2099                detect_emails: false,
2100                detect_urls: false,
2101                max_emails: 50,
2102                max_urls: 50,
2103                timeout_seconds: 120.0,
2104            },
2105            MemoryMode::Limit(1),
2106        );
2107
2108        assert_eq!(result.files.len(), 3);
2109    }
2110
2111    #[test]
2112    fn process_collected_with_negative_one_uses_disk_only_mode() {
2113        let temp_dir = TempDir::new().expect("create temp dir");
2114        fs::write(temp_dir.path().join("a.txt"), "hello").expect("write first file");
2115
2116        let collected = collect_paths(temp_dir.path(), 0, &[]);
2117        let result = process_collected_with_memory_limit(
2118            &collected,
2119            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
2120            None,
2121            LicenseScanOptions::default(),
2122            &TextDetectionOptions {
2123                collect_info: false,
2124                detect_packages: false,
2125                detect_application_packages: false,
2126                detect_system_packages: false,
2127                detect_packages_in_compiled: false,
2128                detect_copyrights: false,
2129                detect_generated: false,
2130                detect_emails: false,
2131                detect_urls: false,
2132                max_emails: 50,
2133                max_urls: 50,
2134                timeout_seconds: 120.0,
2135            },
2136            MemoryMode::StreamUnlimited,
2137        );
2138
2139        assert_eq!(result.files.len(), 2);
2140    }
2141}