Skip to main content

provenant/scanner/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4mod collect;
5mod process;
6
7use crate::license_detection::LicenseDetectionEngine;
8use crate::models::FileInfo;
9
10pub struct ProcessResult {
11    pub files: Vec<FileInfo>,
12    pub excluded_count: usize,
13}
14
/// License-scan settings.
///
/// Every field is written into the string built by `scan_options_fingerprint`.
/// The derived `Default` yields `false` for each flag and `0` for `min_score`.
#[derive(Clone, Copy, Debug, Default)]
pub struct LicenseScanOptions {
    /// Fingerprinted as `license_text`. Presumably asks for matched license text in
    /// the output — TODO confirm with the consumer in the `process` submodule.
    pub include_text: bool,
    /// Fingerprinted as `license_text_diagnostics`.
    pub include_text_diagnostics: bool,
    /// Fingerprinted as `license_diagnostics`.
    pub include_diagnostics: bool,
    /// Fingerprinted as `unknown_licenses`.
    pub unknown_licenses: bool,
    /// Fingerprinted as `license_score`. Presumably a minimum match score on a
    /// 0-100 scale — TODO confirm the range with the license engine.
    pub min_score: u8,
}
23
/// Switches and limits for the non-license parts of a scan.
///
/// Every field is written into the string built by `scan_options_fingerprint`.
#[derive(Clone, Debug)]
pub struct TextDetectionOptions {
    /// When set, the tests in this module expect hashes, MIME type, date,
    /// programming language and the `is_*` classification flags on each file.
    pub collect_info: bool,
    /// Presumably enables package detection as a whole — TODO confirm in `process`.
    pub detect_packages: bool,
    /// Presumably restricts/enables application package detection — TODO confirm.
    pub detect_application_packages: bool,
    /// Presumably restricts/enables system package detection — TODO confirm.
    pub detect_system_packages: bool,
    /// Presumably enables package detection inside compiled artifacts — TODO confirm.
    pub detect_packages_in_compiled: bool,
    /// When set, the tests in this module expect copyrights, holders and authors
    /// to be reported.
    pub detect_copyrights: bool,
    /// When set, the tests expect `is_generated` to be `Some(..)`; when clear, `None`.
    pub detect_generated: bool,
    /// When set, the tests expect each email occurrence with its line number.
    pub detect_emails: bool,
    /// When set, the tests expect URLs found in the text to be reported.
    pub detect_urls: bool,
    /// Presumably an upper bound on reported emails per file — TODO confirm.
    pub max_emails: usize,
    /// Presumably an upper bound on reported URLs per file — TODO confirm.
    pub max_urls: usize,
    /// Presumably a per-file time budget in seconds — TODO confirm. Written to the
    /// fingerprint with six decimal places.
    pub timeout_seconds: f64,
}
39
40impl Default for TextDetectionOptions {
41    fn default() -> Self {
42        Self {
43            collect_info: false,
44            detect_packages: false,
45            detect_application_packages: false,
46            detect_system_packages: false,
47            detect_packages_in_compiled: false,
48            detect_copyrights: true,
49            detect_generated: false,
50            detect_emails: false,
51            detect_urls: false,
52            max_emails: 50,
53            max_urls: 50,
54            timeout_seconds: 120.0,
55        }
56    }
57}
58
59pub fn scan_options_fingerprint(
60    text_options: &TextDetectionOptions,
61    license_options: LicenseScanOptions,
62    license_engine: Option<&LicenseDetectionEngine>,
63) -> String {
64    let (license_enabled, rules_count, first_rule_id, last_rule_id) = match license_engine {
65        Some(engine) => {
66            let rules = &engine.index().rules_by_rid;
67            (
68                true,
69                rules.len(),
70                rules
71                    .first()
72                    .map(|rule| rule.identifier.as_str())
73                    .unwrap_or(""),
74                rules
75                    .last()
76                    .map(|rule| rule.identifier.as_str())
77                    .unwrap_or(""),
78            )
79        }
80        None => (false, 0, "", ""),
81    };
82
83    format!(
84        "tool_version={};info={};packages={};app_packages={};system_packages={};compiled_packages={};copyrights={};generated={};emails={};urls={};max_emails={};max_urls={};timeout={:.6};license_enabled={};rules_count={};first_rule_id={};last_rule_id={};license_text={};license_text_diagnostics={};license_diagnostics={};unknown_licenses={};license_score={}",
85        crate::version::BUILD_VERSION,
86        text_options.collect_info,
87        text_options.detect_packages,
88        text_options.detect_application_packages,
89        text_options.detect_system_packages,
90        text_options.detect_packages_in_compiled,
91        text_options.detect_copyrights,
92        text_options.detect_generated,
93        text_options.detect_emails,
94        text_options.detect_urls,
95        text_options.max_emails,
96        text_options.max_urls,
97        text_options.timeout_seconds,
98        license_enabled,
99        rules_count,
100        first_rule_id,
101        last_rule_id,
102        license_options.include_text,
103        license_options.include_text_diagnostics,
104        license_options.include_diagnostics,
105        license_options.unknown_licenses,
106        license_options.min_score,
107    )
108}
109
110pub use self::collect::{
111    CollectedPaths, CollectionFrontier, collect_paths, collect_selected_paths,
112};
113#[allow(unused_imports)]
114pub use self::process::{
115    MemoryMode, process_collected, process_collected_sequential,
116    process_collected_with_memory_limit, process_collected_with_memory_limit_sequential,
117};
118
119#[cfg(test)]
120mod tests {
121    use std::fs;
122    use std::path::PathBuf;
123    use std::sync::Arc;
124
125    use tempfile::TempDir;
126
127    use crate::cache::build_collection_exclude_patterns;
128    use crate::license_detection::LicenseDetectionEngine;
129    use crate::models::{DatasourceId, FileType, PackageType as FilePackageType};
130    use crate::progress::{ProgressMode, ScanProgress};
131
132    use super::{
133        CollectionFrontier, LicenseScanOptions, MemoryMode, TextDetectionOptions, collect_paths,
134        collect_selected_paths, process_collected, process_collected_with_memory_limit,
135        scan_options_fingerprint,
136    };
137
138    fn build_sparse_oversized_rpm_with_filename(
139        temp_dir: &TempDir,
140        package_name: &str,
141        filename: &str,
142    ) -> PathBuf {
143        let file_path = temp_dir.path().join(filename);
144        rpm::PackageBuilder::new(package_name, "1.0", "MIT", "x86_64", "Demo RPM package")
145            .release("1")
146            .build()
147            .expect("build rpm fixture")
148            .write_file(&file_path)
149            .expect("write rpm fixture");
150        fs::OpenOptions::new()
151            .write(true)
152            .open(&file_path)
153            .expect("open rpm fixture for sparse extension")
154            .set_len(100 * 1024 * 1024 + 1_048_576)
155            .expect("extend rpm fixture");
156        file_path
157    }
158
159    fn build_sparse_oversized_rpm(temp_dir: &TempDir, name: &str) -> PathBuf {
160        build_sparse_oversized_rpm_with_filename(
161            temp_dir,
162            name,
163            &format!("{name}-1.0-1.x86_64.rpm"),
164        )
165    }
166
167    fn build_sparse_oversized_pack_rpm(temp_dir: &TempDir, name: &str) -> PathBuf {
168        build_sparse_oversized_rpm_with_filename(
169            temp_dir,
170            name,
171            &format!("{name}-1.0-1.x86_64.pack"),
172        )
173    }
174
// The default configuration must leave package detection off and copyright
// detection on.
#[test]
fn default_options_keep_copyright_detection_enabled() {
    let TextDetectionOptions {
        detect_packages,
        detect_copyrights,
        ..
    } = TextDetectionOptions::default();

    assert!(!detect_packages);
    assert!(detect_copyrights);
}
181
// Two configurations that differ only in `min_score` must not share a fingerprint.
#[test]
fn test_scan_options_fingerprint_changes_with_license_score() {
    let text_options = TextDetectionOptions::default();
    let fingerprint_for = |min_score: u8| {
        scan_options_fingerprint(
            &text_options,
            LicenseScanOptions {
                min_score,
                ..LicenseScanOptions::default()
            },
            None,
        )
    };

    let default_fingerprint = fingerprint_for(0);
    let filtered_fingerprint = fingerprint_for(70);

    assert_ne!(default_fingerprint, filtered_fingerprint);
}
204
205    fn scan_single_file(
206        file_name: &str,
207        content: &str,
208        options: &TextDetectionOptions,
209    ) -> crate::models::FileInfo {
210        let temp_dir = TempDir::new().expect("create temp dir");
211        let file_path = temp_dir.path().join(file_name);
212        fs::write(&file_path, content).expect("write test file");
213
214        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
215        let collected = collect_paths(temp_dir.path(), 0, &[]);
216        let result = process_collected(
217            &collected,
218            progress,
219            None,
220            LicenseScanOptions::default(),
221            options,
222        );
223
224        result
225            .files
226            .into_iter()
227            .find(|entry| {
228                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
229            })
230            .expect("scanned file entry")
231    }
232
233    fn scan_file_at_relative_path(
234        relative_path: &str,
235        content: &[u8],
236        options: &TextDetectionOptions,
237    ) -> crate::models::FileInfo {
238        let temp_dir = TempDir::new().expect("create temp dir");
239        let file_path = temp_dir.path().join(relative_path);
240        if let Some(parent) = file_path.parent() {
241            fs::create_dir_all(parent).expect("create parent dirs");
242        }
243        fs::write(&file_path, content).expect("write test file");
244
245        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
246        let collected = collect_paths(temp_dir.path(), 0, &[]);
247        let result = process_collected(
248            &collected,
249            progress,
250            None,
251            LicenseScanOptions::default(),
252            options,
253        );
254
255        result
256            .files
257            .into_iter()
258            .find(|entry| {
259                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
260            })
261            .expect("scanned file entry")
262    }
263
264    fn scan_single_file_with_license_engine(
265        file_name: &str,
266        content: &str,
267        options: &TextDetectionOptions,
268    ) -> crate::models::FileInfo {
269        let temp_dir = TempDir::new().expect("create temp dir");
270        let file_path = temp_dir.path().join(file_name);
271        fs::write(&file_path, content).expect("write test file");
272
273        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
274        let collected = collect_paths(temp_dir.path(), 0, &[]);
275        let engine =
276            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
277        let result = process_collected(
278            &collected,
279            progress,
280            Some(engine),
281            LicenseScanOptions::default(),
282            options,
283        );
284
285        result
286            .files
287            .into_iter()
288            .find(|entry| {
289                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
290            })
291            .expect("scanned file entry")
292    }
293
// Every occurrence of an address is reported with its own line number; repeated
// addresses are not collapsed.
#[test]
fn scanner_reports_repeated_email_occurrences() {
    // Only email detection is on. Copyright detection is enabled by default, so it
    // is switched off explicitly.
    let options = TextDetectionOptions {
        detect_copyrights: false,
        detect_emails: true,
        ..TextDetectionOptions::default()
    };
    let scanned = scan_single_file(
        "contacts.txt",
        "linux@3ware.com\nlinux@3ware.com\nandre@suse.com\nlinux@3ware.com\n",
        &options,
    );

    let expected: Vec<(&str, usize)> = vec![
        ("linux@3ware.com", 1),
        ("linux@3ware.com", 2),
        ("andre@suse.com", 3),
        ("linux@3ware.com", 4),
    ];
    let emails: Vec<(&str, usize)> = scanned
        .emails
        .iter()
        .map(|found| (found.email.as_str(), found.start_line.get()))
        .collect();

    assert_eq!(emails.len(), 4, "emails: {emails:#?}");
    assert_eq!(emails, expected);
}
333
// A PEM certificate file must produce no copyright, holder, author, email, URL or
// license findings, even though its decoded dump contains matching text.
#[test]
fn scanner_skips_pem_certificate_text_detection() {
    let options = TextDetectionOptions {
        detect_copyrights: true,
        detect_emails: true,
        detect_urls: true,
        ..TextDetectionOptions::default()
    };
    let pem_fixture = concat!(
        "-----BEGIN CERTIFICATE-----\n",
        "MIID8TCCAtmgAwIBAgIQQT1yx/RrH4FDffHSKFTfmjANBgkqhkiG9w0BAQUFADCB\n",
        "ijELMAkGA1UEBhMCQ0gxEDAOBgNVBAoTB1dJU2VLZXkxGzAZBgNVBAsTEkNvcHly\n",
        "-----END CERTIFICATE-----\n",
        "Certificate:\n",
        "    Data:\n",
        "        Signature Algorithm: sha1WithRSAEncryption\n",
        "        Issuer: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
        "        Subject: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
        "        Contact: cert-owner@example.com\n",
    );

    let scanned = scan_single_file("cert.pem", pem_fixture, &options);

    // Copyright-family findings.
    assert!(
        scanned.copyrights.is_empty(),
        "copyrights: {:#?}",
        scanned.copyrights
    );
    assert!(
        scanned.holders.is_empty(),
        "holders: {:#?}",
        scanned.holders
    );
    assert!(
        scanned.authors.is_empty(),
        "authors: {:#?}",
        scanned.authors
    );

    // Contact findings.
    assert!(scanned.emails.is_empty(), "emails: {:#?}", scanned.emails);
    assert!(scanned.urls.is_empty(), "urls: {:#?}", scanned.urls);

    // License findings.
    assert!(
        scanned.license_detections.is_empty(),
        "licenses: {:#?}",
        scanned.license_detections
    );
    assert!(
        scanned.license_clues.is_empty(),
        "license clues: {:#?}",
        scanned.license_clues
    );
}
392
// A source file that merely embeds a PEM block must still have its header scanned:
// copyright, holder, URL and the Apache-2.0 license are all expected.
#[test]
fn scanner_keeps_source_headers_when_pem_blocks_are_embedded() {
    let options = TextDetectionOptions {
        detect_copyrights: true,
        detect_urls: true,
        ..TextDetectionOptions::default()
    };
    let fixture = concat!(
        "/*\n",
        "Copyright 2022 The Kubernetes Authors.\n\n",
        "Licensed under the Apache License, Version 2.0 (the \"License\");\n",
        "you may not use this file except in compliance with the License.\n",
        "You may obtain a copy of the License at\n\n",
        "    http://www.apache.org/licenses/LICENSE-2.0\n",
        "*/\n\n",
        "package storage\n\n",
        "const validCert = `\n",
        "-----BEGIN CERTIFICATE-----\n",
        "MIIDmTCCAoGgAwIBAgIUWQ==\n",
        "-----END CERTIFICATE-----\n",
        "`\n",
    );

    // The shared helper performs the same write/collect/process/find steps, with the
    // embedded license engine, that this test previously spelled out inline.
    let scanned = scan_single_file_with_license_engine("storage_test.go", fixture, &options);

    assert!(
        scanned
            .copyrights
            .iter()
            .any(|c| c.copyright == "Copyright 2022 The Kubernetes Authors"),
        "copyrights: {:#?}",
        scanned.copyrights
    );
    assert!(
        scanned
            .holders
            .iter()
            .any(|h| h.holder == "The Kubernetes Authors"),
        "holders: {:#?}",
        scanned.holders
    );
    assert!(
        scanned
            .urls
            .iter()
            .any(|u| u.url == "http://www.apache.org/licenses/LICENSE-2.0"),
        "urls: {:#?}",
        scanned.urls
    );
    assert_eq!(scanned.license_expression.as_deref(), Some("Apache-2.0"));
}
473
// A `N:`/`E:`/`W:` credits entry is reported as a single author spanning lines 1-3,
// with no copyright or holder findings.
#[test]
fn scanner_detects_structured_credits_authors() {
    let options = TextDetectionOptions {
        detect_copyrights: true,
        ..TextDetectionOptions::default()
    };
    let credits_fixture = concat!(
        "N: Jack Lloyd\n",
        "E: lloyd@randombit.net\n",
        "W: http://www.randombit.net/\n",
    );

    let scanned = scan_single_file("CREDITS", credits_fixture, &options);

    let expected: Vec<(&str, usize, usize)> = vec![(
        "Jack Lloyd lloyd@randombit.net http://www.randombit.net/",
        1,
        3,
    )];
    let authors: Vec<(&str, usize, usize)> = scanned
        .authors
        .iter()
        .map(|found| {
            let text = found.author.as_str();
            (text, found.start_line.get(), found.end_line.get())
        })
        .collect();

    assert_eq!(authors, expected);
    assert!(scanned.copyrights.is_empty());
    assert!(scanned.holders.is_empty());
}
520
// A header offering two alternative licenses must yield one detection whose
// expression joins the two matches with `OR`.
#[test]
fn scanner_uses_or_for_alternative_license_header() {
    let fixture =
        include_str!("../../testdata/license-golden/datadriven/external/boost-json-d2s.ipp");

    // The shared helper performs the same write/collect/process/find steps, with the
    // embedded license engine, that this test previously spelled out inline.
    let scanned = scan_single_file_with_license_engine(
        "d2s.ipp",
        fixture,
        &TextDetectionOptions::default(),
    );

    assert_eq!(
        scanned.license_expression.as_deref(),
        Some("Apache-2.0 OR BSL-1.0")
    );
    assert!(
        scanned.license_clues.is_empty(),
        "license clues: {:#?}",
        scanned.license_clues
    );
    assert_eq!(
        scanned.license_detections.len(),
        1,
        "detections: {:#?}",
        scanned.license_detections
    );

    let detection = &scanned.license_detections[0];
    assert_eq!(detection.license_expression_spdx, "Apache-2.0 OR BSL-1.0");

    let match_expressions: Vec<_> = detection
        .matches
        .iter()
        .map(|m| m.license_expression_spdx.as_str())
        .collect();
    assert_eq!(match_expressions, vec!["Apache-2.0", "BSL-1.0"]);
}
574
// With generated-code detection on, a "DO NOT EDIT ... machine generated" header
// marks the file as generated.
#[test]
fn scanner_sets_generated_flag_when_enabled() {
    // Copyright detection is enabled by default, so it is switched off explicitly.
    let options = TextDetectionOptions {
        detect_copyrights: false,
        detect_generated: true,
        ..TextDetectionOptions::default()
    };

    let scanned = scan_single_file(
        "generated.c",
        "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
        &options,
    );

    assert_eq!(scanned.is_generated, Some(true));
}
599
// With generated-code detection off, `is_generated` stays `None` even for a file
// that carries a generated-code header.
#[test]
fn scanner_leaves_generated_flag_unset_when_disabled() {
    // Copyright detection is enabled by default, so it is switched off explicitly.
    let options = TextDetectionOptions {
        detect_copyrights: false,
        detect_generated: false,
        ..TextDetectionOptions::default()
    };

    let scanned = scan_single_file(
        "generated.c",
        "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
        &options,
    );

    assert_eq!(scanned.is_generated, None);
}
624
// With `collect_info` on, a Python script gets hashes, MIME type, date, language
// and the text/script/source classification flags.
#[test]
fn scanner_populates_info_surface_when_enabled() {
    // Copyright detection is enabled by default, so it is switched off explicitly.
    let options = TextDetectionOptions {
        collect_info: true,
        detect_copyrights: false,
        ..TextDetectionOptions::default()
    };

    let scanned = scan_single_file(
        "script.py",
        "#!/usr/bin/env python3\nprint(\"hello\")\n",
        &options,
    );

    // Checksums.
    assert!(scanned.sha1.is_some());
    assert!(scanned.md5.is_some());
    assert!(scanned.sha256.is_some());
    assert!(scanned.sha1_git.is_some());

    // File metadata.
    assert!(scanned.mime_type.is_some());
    assert!(scanned.date.is_some());

    // Classification.
    assert_eq!(scanned.programming_language.as_deref(), Some("Python"));
    assert_eq!(scanned.is_text, Some(true));
    assert_eq!(scanned.is_script, Some(true));
    assert_eq!(scanned.is_source, Some(true));
}
658
// A Python file containing a Latin-1 byte (0xE9, not valid UTF-8 here) must still
// be classified as a text script, not as binary.
#[test]
fn scanner_treats_latin1_python_sources_as_textual_scripts() {
    // Copyright detection is enabled by default, so it is switched off explicitly.
    let options = TextDetectionOptions {
        collect_info: true,
        detect_copyrights: false,
        ..TextDetectionOptions::default()
    };
    let latin1_python = b"# coding: latin-1\nprint(\"caf\xe9\")\n# comment padding\n";

    let scanned = scan_file_at_relative_path("script.py", latin1_python, &options);

    assert_eq!(scanned.programming_language.as_deref(), Some("Python"));
    assert_eq!(
        scanned.file_type_label.as_deref(),
        Some("python script, text executable")
    );
    assert_eq!(scanned.is_binary, Some(false));
    assert_eq!(scanned.is_text, Some(true));
    assert_eq!(scanned.is_script, Some(true));
    assert_eq!(scanned.is_source, Some(true));
}
688
// A file starting with the ZIP magic bytes is classified as an archive and yields
// no license, copyright, email or URL findings, even though such text follows.
#[test]
fn scanner_skips_findings_for_zip_like_archives() {
    let options = TextDetectionOptions {
        collect_info: true,
        detect_copyrights: true,
        detect_emails: true,
        detect_urls: true,
        ..TextDetectionOptions::default()
    };
    let archive_like = b"PK\x03\x04\x14\x00\x00\x00\x08\x00MIT License\ncontact@example.com\nhttps://example.com\n";

    let scanned = scan_file_at_relative_path("demo.whl", archive_like, &options);

    // Classification.
    assert_eq!(scanned.mime_type.as_deref(), Some("application/zip"));
    assert_eq!(scanned.is_archive, Some(true));

    // No findings of any kind.
    assert!(scanned.license_detections.is_empty());
    assert!(scanned.copyrights.is_empty());
    assert!(scanned.emails.is_empty());
    assert!(scanned.urls.is_empty());
}
715
// A `.ts` file holding TypeScript is reported as plain-text source, not as media.
#[test]
fn scanner_treats_typescript_sources_as_text_not_video_media() {
    // Copyright detection is enabled by default, so it is switched off explicitly.
    let options = TextDetectionOptions {
        collect_info: true,
        detect_copyrights: false,
        ..TextDetectionOptions::default()
    };

    let scanned = scan_single_file("main.ts", "export const answer: number = 42;\n", &options);

    // Type identification.
    assert_eq!(scanned.programming_language.as_deref(), Some("TypeScript"));
    assert_eq!(scanned.mime_type.as_deref(), Some("text/plain"));
    assert_eq!(
        scanned.file_type_label.as_deref(),
        Some("UTF-8 Unicode text")
    );

    // Classification flags.
    assert_eq!(scanned.is_text, Some(true));
    assert_eq!(scanned.is_media, Some(false));
    assert_eq!(scanned.is_script, Some(false));
    assert_eq!(scanned.is_source, Some(true));
}
745
// A `.ts` file that contains nothing but a comment is still reported as plain-text
// source rather than as media.
#[test]
fn scanner_normalizes_sparse_ts_files_away_from_video_mime() {
    // Copyright detection is enabled by default, so it is switched off explicitly.
    let options = TextDetectionOptions {
        collect_info: true,
        detect_copyrights: false,
        ..TextDetectionOptions::default()
    };

    let scanned = scan_single_file("main.ts", "// comment-only TypeScript fixture\n", &options);

    // Type identification.
    assert_eq!(scanned.mime_type.as_deref(), Some("text/plain"));
    assert_eq!(
        scanned.file_type_label.as_deref(),
        Some("UTF-8 Unicode text")
    );

    // Classification flags.
    assert_eq!(scanned.is_text, Some(true));
    assert_eq!(scanned.is_media, Some(false));
    assert_eq!(scanned.is_script, Some(false));
    assert_eq!(scanned.is_source, Some(true));
}
774
// An empty file is labelled `inode/x-empty` / `empty`, has no language, and counts
// as text while every other classification flag is false.
#[test]
fn scanner_treats_empty_files_like_scancode_info_surface() {
    // Copyright detection is enabled by default, so it is switched off explicitly.
    let options = TextDetectionOptions {
        collect_info: true,
        detect_copyrights: false,
        ..TextDetectionOptions::default()
    };

    let scanned = scan_single_file("test.txt", "", &options);

    // Type identification.
    assert_eq!(scanned.mime_type.as_deref(), Some("inode/x-empty"));
    assert_eq!(scanned.file_type_label.as_deref(), Some("empty"));
    assert_eq!(scanned.programming_language, None);

    // Classification flags.
    assert_eq!(scanned.is_binary, Some(false));
    assert_eq!(scanned.is_text, Some(true));
    assert_eq!(scanned.is_archive, Some(false));
    assert_eq!(scanned.is_media, Some(false));
    assert_eq!(scanned.is_source, Some(false));
    assert_eq!(scanned.is_script, Some(false));
}
803
// `package.json` is JSON text: no programming language, not source, not a script.
#[test]
fn scanner_treats_package_json_as_text_not_source() {
    // Copyright detection is enabled by default, so it is switched off explicitly.
    let options = TextDetectionOptions {
        collect_info: true,
        detect_copyrights: false,
        ..TextDetectionOptions::default()
    };

    let scanned = scan_single_file("package.json", r#"{"name":"demo"}"#, &options);

    // Type identification.
    assert_eq!(scanned.mime_type.as_deref(), Some("application/json"));
    assert_eq!(scanned.file_type_label.as_deref(), Some("JSON text data"));
    assert_eq!(scanned.programming_language, None);

    // Classification flags.
    assert_eq!(scanned.is_text, Some(true));
    assert_eq!(scanned.is_source, Some(false));
    assert_eq!(scanned.is_script, Some(false));
}
829
// `build.gradle` and `flake.nix` are plain-text source files (Groovy and Nix
// respectively) and are not scripts.
#[test]
fn scanner_classifies_gradle_and_nix_manifests_as_source() {
    // Copyright detection is enabled by default, so it is switched off explicitly.
    let options = TextDetectionOptions {
        collect_info: true,
        detect_copyrights: false,
        ..TextDetectionOptions::default()
    };

    let gradle = scan_single_file("build.gradle", "plugins { id 'java' }\n", &options);
    let nix = scan_single_file("flake.nix", "{ inputs, ... }: {}\n", &options);

    // Gradle build script.
    assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
    assert_eq!(gradle.mime_type.as_deref(), Some("text/plain"));
    assert_eq!(gradle.is_source, Some(true));
    assert_eq!(gradle.is_script, Some(false));

    // Nix flake.
    assert_eq!(nix.programming_language.as_deref(), Some("Nix"));
    assert_eq!(nix.mime_type.as_deref(), Some("text/plain"));
    assert_eq!(nix.is_source, Some(true));
    assert_eq!(nix.is_script, Some(false));
}
860
861    #[test]
862    fn scanner_treats_gitmodules_as_text_not_source() {
863        let options = TextDetectionOptions {
864            collect_info: true,
865            detect_packages: false,
866            detect_application_packages: false,
867            detect_system_packages: false,
868            detect_packages_in_compiled: false,
869            detect_copyrights: false,
870            detect_generated: false,
871            detect_emails: false,
872            detect_urls: false,
873            max_emails: 50,
874            max_urls: 50,
875            timeout_seconds: 120.0,
876        };
877        let scanned = scan_file_at_relative_path(
878            ".gitmodules",
879            b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
880            &options,
881        );
882
883        assert_eq!(scanned.programming_language, None);
884        assert_eq!(
885            scanned.file_type_label.as_deref(),
886            Some("Git configuration text")
887        );
888        assert_eq!(scanned.is_text, Some(true));
889        assert_eq!(scanned.is_source, Some(false));
890        assert_eq!(scanned.is_script, Some(false));
891    }
892
893    #[test]
894    fn scanner_treats_javascript_shebang_files_as_scripts() {
895        let options = TextDetectionOptions {
896            collect_info: true,
897            detect_packages: false,
898            detect_application_packages: false,
899            detect_system_packages: false,
900            detect_packages_in_compiled: false,
901            detect_copyrights: false,
902            detect_generated: false,
903            detect_emails: false,
904            detect_urls: false,
905            max_emails: 50,
906            max_urls: 50,
907            timeout_seconds: 120.0,
908        };
909        let scanned = scan_file_at_relative_path(
910            "bin/run",
911            b"#!/usr/bin/env node\nconsole.log('hello');\n",
912            &options,
913        );
914
915        assert_eq!(scanned.programming_language.as_deref(), Some("JavaScript"));
916        assert_eq!(
917            scanned.file_type_label.as_deref(),
918            Some("javascript script, UTF-8 Unicode text executable")
919        );
920        assert_eq!(scanned.is_script, Some(true));
921        assert_eq!(scanned.is_source, Some(true));
922    }
923
924    #[test]
925    fn scanner_treats_dockerfile_as_source() {
926        let options = TextDetectionOptions {
927            collect_info: true,
928            detect_packages: false,
929            detect_application_packages: false,
930            detect_system_packages: false,
931            detect_packages_in_compiled: false,
932            detect_copyrights: false,
933            detect_generated: false,
934            detect_emails: false,
935            detect_urls: false,
936            max_emails: 50,
937            max_urls: 50,
938            timeout_seconds: 120.0,
939        };
940        let scanned = scan_single_file("Dockerfile", "FROM scratch\n", &options);
941
942        assert_eq!(scanned.programming_language.as_deref(), Some("Dockerfile"));
943        assert_eq!(
944            scanned.file_type_label.as_deref(),
945            Some("UTF-8 Unicode text")
946        );
947        assert_eq!(scanned.is_source, Some(true));
948        assert_eq!(scanned.is_script, Some(false));
949    }
950
951    #[test]
952    fn scanner_treats_makefile_as_text_not_source() {
953        let options = TextDetectionOptions {
954            collect_info: true,
955            detect_packages: false,
956            detect_application_packages: false,
957            detect_system_packages: false,
958            detect_packages_in_compiled: false,
959            detect_copyrights: false,
960            detect_generated: false,
961            detect_emails: false,
962            detect_urls: false,
963            max_emails: 50,
964            max_urls: 50,
965            timeout_seconds: 120.0,
966        };
967        let scanned = scan_single_file("Makefile", "all:\n\techo hi\n", &options);
968
969        assert_eq!(scanned.programming_language, None);
970        assert_eq!(
971            scanned.file_type_label.as_deref(),
972            Some("UTF-8 Unicode text")
973        );
974        assert_eq!(scanned.is_text, Some(true));
975        assert_eq!(scanned.is_source, Some(false));
976        assert_eq!(scanned.is_script, Some(false));
977    }
978
979    #[test]
980    fn scanner_omits_info_surface_when_disabled() {
981        let options = TextDetectionOptions {
982            collect_info: false,
983            detect_packages: false,
984            detect_application_packages: false,
985            detect_system_packages: false,
986            detect_packages_in_compiled: false,
987            detect_copyrights: false,
988            detect_generated: false,
989            detect_emails: false,
990            detect_urls: false,
991            max_emails: 50,
992            max_urls: 50,
993            timeout_seconds: 120.0,
994        };
995        let scanned = scan_single_file(
996            "script.py",
997            "#!/usr/bin/env python3\nprint(\"hello\")\n",
998            &options,
999        );
1000
1001        assert!(scanned.sha1.is_none());
1002        assert!(scanned.md5.is_none());
1003        assert!(scanned.sha256.is_none());
1004        assert!(scanned.sha1_git.is_none());
1005        assert!(scanned.mime_type.is_none());
1006        assert!(scanned.date.is_none());
1007        assert!(scanned.programming_language.is_none());
1008        assert!(scanned.is_binary.is_none());
1009        assert!(scanned.is_text.is_none());
1010        assert!(scanned.is_archive.is_none());
1011        assert!(scanned.is_media.is_none());
1012        assert!(scanned.is_script.is_none());
1013        assert!(scanned.is_source.is_none());
1014    }
1015
1016    #[test]
1017    fn scanner_skips_package_parsing_when_disabled() {
1018        let options = TextDetectionOptions {
1019            collect_info: false,
1020            detect_packages: false,
1021            detect_application_packages: false,
1022            detect_system_packages: false,
1023            detect_packages_in_compiled: false,
1024            detect_copyrights: false,
1025            detect_generated: false,
1026            detect_emails: false,
1027            detect_urls: false,
1028            max_emails: 50,
1029            max_urls: 50,
1030            timeout_seconds: 120.0,
1031        };
1032        let scanned = scan_single_file(
1033            "package.json",
1034            r#"{"name":"demo","version":"1.0.0"}"#,
1035            &options,
1036        );
1037
1038        assert!(
1039            scanned.package_data.is_empty(),
1040            "package_data: {:#?}",
1041            scanned.package_data
1042        );
1043    }
1044
1045    #[test]
1046    fn scanner_parses_package_manifests_when_enabled() {
1047        let options = TextDetectionOptions {
1048            collect_info: false,
1049            detect_packages: true,
1050            detect_application_packages: true,
1051            detect_system_packages: false,
1052            detect_packages_in_compiled: false,
1053            detect_copyrights: false,
1054            detect_generated: false,
1055            detect_emails: false,
1056            detect_urls: false,
1057            max_emails: 50,
1058            max_urls: 50,
1059            timeout_seconds: 120.0,
1060        };
1061        let scanned = scan_single_file(
1062            "package.json",
1063            r#"{"name":"demo","version":"1.0.0"}"#,
1064            &options,
1065        );
1066
1067        assert_eq!(
1068            scanned.package_data.len(),
1069            1,
1070            "package_data: {:#?}",
1071            scanned.package_data
1072        );
1073    }
1074
1075    #[test]
1076    fn scanner_parses_oversized_rpm_in_package_only_mode_without_size_warning() {
1077        let temp_dir = TempDir::new().expect("create temp dir");
1078        let file_path = build_sparse_oversized_rpm(&temp_dir, "oversized-demo");
1079
1080        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1081        let collected = collect_paths(temp_dir.path(), 0, &[]);
1082        let result = process_collected(
1083            &collected,
1084            progress,
1085            None,
1086            LicenseScanOptions::default(),
1087            &TextDetectionOptions {
1088                collect_info: false,
1089                detect_packages: true,
1090                detect_application_packages: true,
1091                detect_system_packages: false,
1092                detect_packages_in_compiled: false,
1093                detect_copyrights: false,
1094                detect_generated: false,
1095                detect_emails: false,
1096                detect_urls: false,
1097                max_emails: 50,
1098                max_urls: 50,
1099                timeout_seconds: 120.0,
1100            },
1101        );
1102
1103        let scanned = result
1104            .files
1105            .into_iter()
1106            .find(|entry| {
1107                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1108            })
1109            .expect("scanned file entry");
1110
1111        assert!(
1112            scanned.scan_errors.is_empty(),
1113            "scan_errors: {:#?}",
1114            scanned.scan_errors
1115        );
1116        assert_eq!(
1117            scanned.package_data.len(),
1118            1,
1119            "package_data: {:#?}",
1120            scanned.package_data
1121        );
1122        assert_eq!(
1123            scanned.package_data[0].datasource_id,
1124            Some(DatasourceId::RpmArchive)
1125        );
1126        assert_eq!(
1127            scanned.package_data[0].name.as_deref(),
1128            Some("oversized-demo")
1129        );
1130        assert_eq!(scanned.package_data[0].version.as_deref(), Some("1.0-1"));
1131    }
1132
1133    #[test]
1134    fn scanner_parses_oversized_rpm_with_info_without_timeout_or_size_warning() {
1135        let temp_dir = TempDir::new().expect("create temp dir");
1136        let file_path = build_sparse_oversized_rpm(&temp_dir, "oversized-info-demo");
1137
1138        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1139        let collected = collect_paths(temp_dir.path(), 0, &[]);
1140        let result = process_collected(
1141            &collected,
1142            progress,
1143            None,
1144            LicenseScanOptions::default(),
1145            &TextDetectionOptions {
1146                collect_info: true,
1147                detect_packages: true,
1148                detect_application_packages: true,
1149                detect_system_packages: false,
1150                detect_packages_in_compiled: false,
1151                detect_copyrights: false,
1152                detect_generated: false,
1153                detect_emails: false,
1154                detect_urls: false,
1155                max_emails: 50,
1156                max_urls: 50,
1157                timeout_seconds: 120.0,
1158            },
1159        );
1160
1161        let scanned = result
1162            .files
1163            .into_iter()
1164            .find(|entry| {
1165                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1166            })
1167            .expect("scanned file entry");
1168
1169        assert!(
1170            scanned.scan_errors.is_empty(),
1171            "scan_errors: {:#?}",
1172            scanned.scan_errors
1173        );
1174        assert_eq!(
1175            scanned.package_data.len(),
1176            1,
1177            "package_data: {:#?}",
1178            scanned.package_data
1179        );
1180        assert_eq!(
1181            scanned.package_data[0].datasource_id,
1182            Some(DatasourceId::RpmArchive)
1183        );
1184        assert_eq!(
1185            scanned.package_data[0].name.as_deref(),
1186            Some("oversized-info-demo")
1187        );
1188        assert!(scanned.sha1.is_some());
1189        assert!(scanned.md5.is_some());
1190        assert!(scanned.sha256.is_some());
1191        assert!(scanned.sha1_git.is_some());
1192        assert_eq!(scanned.mime_type.as_deref(), Some("application/x-rpm"));
1193        assert_eq!(scanned.file_type_label.as_deref(), Some("RPM package"));
1194        assert_eq!(scanned.is_binary, Some(true));
1195        assert_eq!(scanned.is_text, Some(false));
1196        assert_eq!(scanned.is_archive, Some(true));
1197    }
1198
1199    #[test]
1200    fn scanner_parses_oversized_pack_rpm_in_package_only_mode_without_size_warning() {
1201        let temp_dir = TempDir::new().expect("create temp dir");
1202        let file_path = build_sparse_oversized_pack_rpm(&temp_dir, "oversized-pack-demo");
1203
1204        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1205        let collected = collect_paths(temp_dir.path(), 0, &[]);
1206        let result = process_collected(
1207            &collected,
1208            progress,
1209            None,
1210            LicenseScanOptions::default(),
1211            &TextDetectionOptions {
1212                collect_info: false,
1213                detect_packages: true,
1214                detect_application_packages: true,
1215                detect_system_packages: false,
1216                detect_packages_in_compiled: false,
1217                detect_copyrights: false,
1218                detect_generated: false,
1219                detect_emails: false,
1220                detect_urls: false,
1221                max_emails: 50,
1222                max_urls: 50,
1223                timeout_seconds: 120.0,
1224            },
1225        );
1226
1227        let scanned = result
1228            .files
1229            .into_iter()
1230            .find(|entry| {
1231                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1232            })
1233            .expect("scanned file entry");
1234
1235        assert!(
1236            scanned.scan_errors.is_empty(),
1237            "scan_errors: {:#?}",
1238            scanned.scan_errors
1239        );
1240        assert_eq!(
1241            scanned.package_data.len(),
1242            1,
1243            "package_data: {:#?}",
1244            scanned.package_data
1245        );
1246        assert_eq!(
1247            scanned.package_data[0].datasource_id,
1248            Some(DatasourceId::RpmArchive)
1249        );
1250        assert_eq!(
1251            scanned.package_data[0].name.as_deref(),
1252            Some("oversized-pack-demo")
1253        );
1254    }
1255
1256    #[test]
1257    fn scanner_parses_oversized_pack_rpm_with_info_without_timeout_or_size_warning() {
1258        let temp_dir = TempDir::new().expect("create temp dir");
1259        let file_path = build_sparse_oversized_pack_rpm(&temp_dir, "oversized-pack-info-demo");
1260
1261        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1262        let collected = collect_paths(temp_dir.path(), 0, &[]);
1263        let result = process_collected(
1264            &collected,
1265            progress,
1266            None,
1267            LicenseScanOptions::default(),
1268            &TextDetectionOptions {
1269                collect_info: true,
1270                detect_packages: true,
1271                detect_application_packages: true,
1272                detect_system_packages: false,
1273                detect_packages_in_compiled: false,
1274                detect_copyrights: false,
1275                detect_generated: false,
1276                detect_emails: false,
1277                detect_urls: false,
1278                max_emails: 50,
1279                max_urls: 50,
1280                timeout_seconds: 120.0,
1281            },
1282        );
1283
1284        let scanned = result
1285            .files
1286            .into_iter()
1287            .find(|entry| {
1288                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1289            })
1290            .expect("scanned file entry");
1291
1292        assert!(
1293            scanned.scan_errors.is_empty(),
1294            "scan_errors: {:#?}",
1295            scanned.scan_errors
1296        );
1297        assert_eq!(
1298            scanned.package_data.len(),
1299            1,
1300            "package_data: {:#?}",
1301            scanned.package_data
1302        );
1303        assert_eq!(
1304            scanned.package_data[0].datasource_id,
1305            Some(DatasourceId::RpmArchive)
1306        );
1307        assert_eq!(
1308            scanned.package_data[0].name.as_deref(),
1309            Some("oversized-pack-info-demo")
1310        );
1311        assert!(scanned.sha1.is_some());
1312        assert!(scanned.md5.is_some());
1313        assert!(scanned.sha256.is_some());
1314        assert!(scanned.sha1_git.is_some());
1315        assert_eq!(scanned.mime_type.as_deref(), Some("application/x-rpm"));
1316        assert_eq!(scanned.file_type_label.as_deref(), Some("RPM package"));
1317        assert_eq!(scanned.is_binary, Some(true));
1318        assert_eq!(scanned.is_text, Some(false));
1319        assert_eq!(scanned.is_archive, Some(true));
1320    }
1321
1322    #[test]
1323    fn scanner_skips_application_packages_when_only_system_packages_enabled() {
1324        let options = TextDetectionOptions {
1325            collect_info: false,
1326            detect_packages: true,
1327            detect_application_packages: false,
1328            detect_system_packages: true,
1329            detect_packages_in_compiled: false,
1330            detect_copyrights: false,
1331            detect_generated: false,
1332            detect_emails: false,
1333            detect_urls: false,
1334            max_emails: 50,
1335            max_urls: 50,
1336            timeout_seconds: 120.0,
1337        };
1338        let scanned = scan_single_file(
1339            "package.json",
1340            r#"{"name":"demo","version":"1.0.0"}"#,
1341            &options,
1342        );
1343
1344        assert!(
1345            scanned.package_data.is_empty(),
1346            "package_data: {:#?}",
1347            scanned.package_data
1348        );
1349    }
1350
1351    #[test]
1352    fn scanner_parses_system_package_files_when_enabled() {
1353        let options = TextDetectionOptions {
1354            collect_info: false,
1355            detect_packages: true,
1356            detect_application_packages: false,
1357            detect_system_packages: true,
1358            detect_packages_in_compiled: false,
1359            detect_copyrights: false,
1360            detect_generated: false,
1361            detect_emails: false,
1362            detect_urls: false,
1363            max_emails: 50,
1364            max_urls: 50,
1365            timeout_seconds: 120.0,
1366        };
1367        let scanned = scan_file_at_relative_path(
1368            "var/lib/dpkg/status",
1369            b"Package: demo\nVersion: 1.0\nArchitecture: all\nDescription: demo package\n\n",
1370            &options,
1371        );
1372
1373        assert!(
1374            !scanned.package_data.is_empty(),
1375            "package_data: {:#?}",
1376            scanned.package_data
1377        );
1378    }
1379
1380    #[test]
1381    fn scanner_only_parses_compiled_packages_when_package_in_compiled_is_enabled() {
1382        if std::process::Command::new("go")
1383            .arg("version")
1384            .status()
1385            .is_err()
1386        {
1387            return;
1388        }
1389
1390        let temp_dir = TempDir::new().expect("create temp dir");
1391        fs::write(
1392            temp_dir.path().join("go.mod"),
1393            "module example.com/demo\n\ngo 1.23.0\n",
1394        )
1395        .expect("write go.mod");
1396        fs::write(
1397            temp_dir.path().join("main.go"),
1398            "package main\nfunc main() {}\n",
1399        )
1400        .expect("write main.go");
1401        let file_path = temp_dir.path().join("demo");
1402        let status = std::process::Command::new("go")
1403            .current_dir(temp_dir.path())
1404            .args(["build", "-o"])
1405            .arg(&file_path)
1406            .status()
1407            .expect("run go build");
1408        assert!(status.success());
1409
1410        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1411        let collected = collect_paths(temp_dir.path(), 0, &[]);
1412
1413        let without_compiled = process_collected(
1414            &collected,
1415            Arc::clone(&progress),
1416            None,
1417            LicenseScanOptions::default(),
1418            &TextDetectionOptions {
1419                collect_info: false,
1420                detect_packages: true,
1421                detect_application_packages: true,
1422                detect_system_packages: false,
1423                detect_packages_in_compiled: false,
1424                detect_copyrights: false,
1425                detect_generated: false,
1426                detect_emails: false,
1427                detect_urls: false,
1428                max_emails: 50,
1429                max_urls: 50,
1430                timeout_seconds: 120.0,
1431            },
1432        );
1433        let with_compiled = process_collected(
1434            &collected,
1435            progress,
1436            None,
1437            LicenseScanOptions::default(),
1438            &TextDetectionOptions {
1439                collect_info: false,
1440                detect_packages: true,
1441                detect_application_packages: true,
1442                detect_system_packages: false,
1443                detect_packages_in_compiled: true,
1444                detect_copyrights: false,
1445                detect_generated: false,
1446                detect_emails: false,
1447                detect_urls: false,
1448                max_emails: 50,
1449                max_urls: 50,
1450                timeout_seconds: 120.0,
1451            },
1452        );
1453
1454        let without_compiled = without_compiled
1455            .files
1456            .into_iter()
1457            .find(|entry| entry.file_type == FileType::File && entry.path.ends_with("/demo"))
1458            .expect("compiled artifact present");
1459        let with_compiled = with_compiled
1460            .files
1461            .into_iter()
1462            .find(|entry| entry.file_type == FileType::File && entry.path.ends_with("/demo"))
1463            .expect("compiled artifact present");
1464
1465        assert!(
1466            without_compiled.package_data.is_empty(),
1467            "package_data: {:#?}",
1468            without_compiled.package_data
1469        );
1470        assert!(!with_compiled.package_data.is_empty());
1471    }
1472
1473    #[test]
1474    fn scanner_parses_windows_executable_packages_under_normal_package_scan() {
1475        let temp_dir = TempDir::new().expect("create temp dir");
1476        let file_path = temp_dir.path().join("libiconv2.dll");
1477        let fixture = fs::read("testdata/compiled-binary-golden/win_pe/libiconv2.dll")
1478            .expect("read PE fixture");
1479        fs::write(&file_path, fixture).expect("write PE fixture");
1480
1481        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1482        let collected = collect_paths(temp_dir.path(), 0, &[]);
1483
1484        let without_package = process_collected(
1485            &collected,
1486            Arc::clone(&progress),
1487            None,
1488            LicenseScanOptions::default(),
1489            &TextDetectionOptions {
1490                collect_info: false,
1491                detect_packages: false,
1492                detect_application_packages: false,
1493                detect_system_packages: false,
1494                detect_packages_in_compiled: false,
1495                detect_copyrights: false,
1496                detect_generated: false,
1497                detect_emails: false,
1498                detect_urls: false,
1499                max_emails: 50,
1500                max_urls: 50,
1501                timeout_seconds: 120.0,
1502            },
1503        );
1504        let with_package = process_collected(
1505            &collected,
1506            progress,
1507            None,
1508            LicenseScanOptions::default(),
1509            &TextDetectionOptions {
1510                collect_info: false,
1511                detect_packages: true,
1512                detect_application_packages: true,
1513                detect_system_packages: false,
1514                detect_packages_in_compiled: false,
1515                detect_copyrights: false,
1516                detect_generated: false,
1517                detect_emails: false,
1518                detect_urls: false,
1519                max_emails: 50,
1520                max_urls: 50,
1521                timeout_seconds: 120.0,
1522            },
1523        );
1524
1525        let without_package = without_package
1526            .files
1527            .into_iter()
1528            .find(|entry| {
1529                entry.file_type == FileType::File && entry.path.ends_with("/libiconv2.dll")
1530            })
1531            .expect("compiled artifact present");
1532        let with_package = with_package
1533            .files
1534            .into_iter()
1535            .find(|entry| {
1536                entry.file_type == FileType::File && entry.path.ends_with("/libiconv2.dll")
1537            })
1538            .expect("compiled artifact present");
1539
1540        assert!(without_package.package_data.is_empty());
1541        assert_eq!(with_package.package_data.len(), 1);
1542        assert_eq!(
1543            with_package.package_data[0].package_type,
1544            Some(FilePackageType::Winexe)
1545        );
1546        assert_eq!(
1547            with_package.package_data[0].datasource_id,
1548            Some(DatasourceId::WindowsExecutable)
1549        );
1550    }
1551
1552    #[test]
1553    fn scanner_keeps_nsis_and_windows_executable_package_data_together() {
1554        let temp_dir = TempDir::new().expect("create temp dir");
1555        let file_path = temp_dir.path().join("nsis-with-version.exe");
1556        let mut fixture = fs::read("testdata/compiled-binary-golden/win_pe/libiconv2.dll")
1557            .expect("read PE fixture");
1558        if fixture.len() < 70_000 {
1559            fixture.resize(70_000, 0);
1560        }
1561        fixture.extend_from_slice(b"Nullsoft.NSIS.exehead");
1562        fs::write(&file_path, fixture).expect("write synthetic NSIS PE fixture");
1563
1564        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1565        let collected = collect_paths(temp_dir.path(), 0, &[]);
1566        let result = process_collected(
1567            &collected,
1568            progress,
1569            None,
1570            LicenseScanOptions::default(),
1571            &TextDetectionOptions {
1572                collect_info: false,
1573                detect_packages: true,
1574                detect_application_packages: true,
1575                detect_system_packages: false,
1576                detect_packages_in_compiled: false,
1577                detect_copyrights: false,
1578                detect_generated: false,
1579                detect_emails: false,
1580                detect_urls: false,
1581                max_emails: 50,
1582                max_urls: 50,
1583                timeout_seconds: 120.0,
1584            },
1585        );
1586
1587        let scanned = result
1588            .files
1589            .into_iter()
1590            .find(|entry| {
1591                entry.file_type == FileType::File && entry.path.ends_with("/nsis-with-version.exe")
1592            })
1593            .expect("compiled artifact present");
1594
1595        assert_eq!(
1596            scanned.package_data.len(),
1597            2,
1598            "package_data: {:#?}",
1599            scanned.package_data
1600        );
1601        assert!(
1602            scanned
1603                .package_data
1604                .iter()
1605                .any(|pkg| pkg.datasource_id == Some(DatasourceId::NsisInstaller))
1606        );
1607        assert!(
1608            scanned
1609                .package_data
1610                .iter()
1611                .any(|pkg| pkg.datasource_id == Some(DatasourceId::WindowsExecutable))
1612        );
1613    }
1614
1615    #[test]
1616    fn scanner_detects_license_from_font_metadata() {
1617        let temp_dir = TempDir::new().expect("create temp dir");
1618        let file_path = temp_dir.path().join("Lato-Bold.ttf");
1619        let fixture = fs::read("testdata/font-fixtures/Lato-Bold.ttf").expect("read font fixture");
1620        fs::write(&file_path, fixture).expect("write font fixture");
1621
1622        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1623        let collected = collect_paths(temp_dir.path(), 0, &[]);
1624        let engine =
1625            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
1626        let result = process_collected(
1627            &collected,
1628            progress,
1629            Some(engine),
1630            LicenseScanOptions::default(),
1631            &TextDetectionOptions::default(),
1632        );
1633        let scanned = result
1634            .files
1635            .into_iter()
1636            .find(|entry| {
1637                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1638            })
1639            .expect("scanned file entry");
1640
1641        assert!(
1642            scanned.license_expression.is_some(),
1643            "license detections: {:#?}",
1644            scanned.license_detections
1645        );
1646        assert!(
1647            scanned
1648                .license_expression
1649                .as_deref()
1650                .is_some_and(
1651                    |expression| expression.contains("OFL-1.1") || expression.contains("ofl-1.1")
1652                ),
1653            "license expression: {:?}",
1654            scanned.license_expression
1655        );
1656    }
1657
1658    #[test]
1659    fn scanner_detects_license_from_windows_executable_metadata() {
1660        let temp_dir = TempDir::new().expect("create temp dir");
1661        let file_path = temp_dir.path().join("libiconv2.dll");
1662        let fixture = fs::read("testdata/compiled-binary-golden/win_pe/libiconv2.dll")
1663            .expect("read PE fixture");
1664        fs::write(&file_path, fixture).expect("write PE fixture");
1665
1666        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1667        let collected = collect_paths(temp_dir.path(), 0, &[]);
1668        let engine =
1669            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
1670        let result = process_collected(
1671            &collected,
1672            progress,
1673            Some(engine),
1674            LicenseScanOptions::default(),
1675            &TextDetectionOptions::default(),
1676        );
1677        let scanned = result
1678            .files
1679            .into_iter()
1680            .find(|entry| {
1681                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1682            })
1683            .expect("scanned file entry");
1684
1685        assert!(
1686            scanned.license_expression.is_some(),
1687            "license detections: {:#?}",
1688            scanned.license_detections
1689        );
1690        assert!(
1691            scanned
1692                .license_expression
1693                .as_deref()
1694                .is_some_and(|expression| {
1695                    expression.contains("lgpl") || expression.contains("LGPL")
1696                }),
1697            "license expression: {:?}",
1698            scanned.license_expression
1699        );
1700    }
1701
1702    #[test]
1703    fn scanner_detects_cc_by_license_from_markdown_comment_banner() {
1704        let scanned = scan_single_file_with_license_engine(
1705            "navbar.md",
1706            "<!-- Documentation licensed under CC BY 4.0 -->\n<!-- License available at https://creativecommons.org/licenses/by/4.0/ -->\n",
1707            &TextDetectionOptions::default(),
1708        );
1709
1710        assert!(
1711            scanned
1712                .license_expression
1713                .as_deref()
1714                .is_some_and(|expression| {
1715                    expression.contains("cc-by-4.0") || expression.contains("CC-BY-4.0")
1716                }),
1717            "license expression: {:?}",
1718            scanned.license_expression
1719        );
1720    }
1721
1722    #[test]
1723    fn scanner_detects_mit_license_from_shields_badge_markdown() {
1724        let scanned = scan_single_file_with_license_engine(
1725            "README.md",
1726            "[![](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)\n",
1727            &TextDetectionOptions::default(),
1728        );
1729
1730        assert!(
1731            scanned
1732                .license_expression
1733                .as_deref()
1734                .is_some_and(|expression| {
1735                    expression.contains("mit") || expression.contains("MIT")
1736                }),
1737            "license expression: {:?}",
1738            scanned.license_expression
1739        );
1740    }
1741
1742    #[test]
1743    fn scanner_detects_apache_license_from_markdown_readme_phrase() {
1744        let scanned = scan_single_file_with_license_engine(
1745            "README.md",
1746            "This crate is distributed under the terms of the Apache License (Version 2.0).\n",
1747            &TextDetectionOptions::default(),
1748        );
1749
1750        assert!(
1751            scanned
1752                .license_expression
1753                .as_deref()
1754                .is_some_and(|expression| {
1755                    expression.contains("apache-2.0") || expression.contains("Apache-2.0")
1756                }),
1757            "license expression: {:?}",
1758            scanned.license_expression
1759        );
1760    }
1761
1762    #[test]
1763    fn scanner_sets_is_source_only_when_info_enabled() {
1764        let without_info = TextDetectionOptions {
1765            collect_info: false,
1766            detect_packages: false,
1767            detect_application_packages: false,
1768            detect_system_packages: false,
1769            detect_packages_in_compiled: false,
1770            detect_copyrights: false,
1771            detect_generated: false,
1772            detect_emails: false,
1773            detect_urls: false,
1774            max_emails: 50,
1775            max_urls: 50,
1776            timeout_seconds: 120.0,
1777        };
1778        let with_info = TextDetectionOptions {
1779            collect_info: true,
1780            ..without_info.clone()
1781        };
1782
1783        let scanned_without_info = scan_single_file("main.rs", "fn main() {}\n", &without_info);
1784        let scanned_with_info = scan_single_file("main.rs", "fn main() {}\n", &with_info);
1785
1786        assert_eq!(scanned_without_info.is_source, None);
1787        assert_eq!(scanned_with_info.is_source, Some(true));
1788    }
1789
1790    #[test]
1791    fn directory_omits_info_fields_when_info_disabled() {
1792        let temp_dir = TempDir::new().expect("create temp dir");
1793        fs::create_dir_all(temp_dir.path().join("nested")).expect("create nested dir");
1794
1795        let collected = collect_paths(temp_dir.path(), 0, &[]);
1796        let result = process_collected(
1797            &collected,
1798            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1799            None,
1800            LicenseScanOptions::default(),
1801            &TextDetectionOptions {
1802                collect_info: false,
1803                detect_packages: false,
1804                detect_application_packages: false,
1805                detect_system_packages: false,
1806                detect_packages_in_compiled: false,
1807                detect_copyrights: false,
1808                detect_generated: false,
1809                detect_emails: false,
1810                detect_urls: false,
1811                max_emails: 50,
1812                max_urls: 50,
1813                timeout_seconds: 120.0,
1814            },
1815        );
1816
1817        let directory = result
1818            .files
1819            .into_iter()
1820            .find(|entry| entry.file_type == FileType::Directory && entry.path.ends_with("nested"))
1821            .expect("directory entry");
1822
1823        assert!(directory.date.is_none());
1824        assert!(directory.file_type_label.is_none());
1825        assert!(directory.is_binary.is_none());
1826        assert!(directory.is_text.is_none());
1827        assert!(directory.is_archive.is_none());
1828        assert!(directory.is_media.is_none());
1829        assert!(directory.is_source.is_none());
1830        assert!(directory.is_script.is_none());
1831    }
1832
1833    #[test]
1834    fn directory_includes_info_fields_when_info_enabled() {
1835        let temp_dir = TempDir::new().expect("create temp dir");
1836        fs::create_dir_all(temp_dir.path().join("nested")).expect("create nested dir");
1837
1838        let collected = collect_paths(temp_dir.path(), 0, &[]);
1839        let result = process_collected(
1840            &collected,
1841            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1842            None,
1843            LicenseScanOptions::default(),
1844            &TextDetectionOptions {
1845                collect_info: true,
1846                detect_packages: false,
1847                detect_application_packages: false,
1848                detect_system_packages: false,
1849                detect_packages_in_compiled: false,
1850                detect_copyrights: false,
1851                detect_generated: false,
1852                detect_emails: false,
1853                detect_urls: false,
1854                max_emails: 50,
1855                max_urls: 50,
1856                timeout_seconds: 120.0,
1857            },
1858        );
1859
1860        let directory = result
1861            .files
1862            .into_iter()
1863            .find(|entry| entry.file_type == FileType::Directory && entry.path.ends_with("nested"))
1864            .expect("directory entry");
1865
1866        assert!(directory.date.is_none());
1867        assert!(directory.file_type_label.is_none());
1868        assert_eq!(directory.is_binary, Some(false));
1869        assert_eq!(directory.is_text, Some(false));
1870        assert_eq!(directory.is_archive, Some(false));
1871        assert_eq!(directory.is_media, Some(false));
1872        assert_eq!(directory.is_source, Some(false));
1873        assert_eq!(directory.is_script, Some(false));
1874        assert_eq!(directory.files_count, Some(0));
1875        assert_eq!(directory.dirs_count, Some(0));
1876        assert_eq!(directory.size_count, Some(0));
1877    }
1878
1879    #[test]
1880    fn collect_paths_includes_root_directory_entry() {
1881        let temp_dir = TempDir::new().expect("create temp dir");
1882        fs::create_dir_all(temp_dir.path().join("src")).expect("create nested dir");
1883        fs::write(temp_dir.path().join("src").join("main.rs"), "fn main() {}")
1884            .expect("write nested file");
1885
1886        let collected = collect_paths(temp_dir.path(), 0, &[]);
1887
1888        assert!(
1889            collected
1890                .directories
1891                .iter()
1892                .any(|(path, _)| path == temp_dir.path())
1893        );
1894    }
1895
1896    #[test]
1897    fn collect_paths_supports_single_file_input() {
1898        let temp_dir = TempDir::new().expect("create temp dir");
1899        let file_path = temp_dir.path().join("main.rs");
1900        fs::write(&file_path, "fn main() {}\n").expect("write file");
1901
1902        let collected = collect_paths(&file_path, 0, &[]);
1903
1904        assert_eq!(collected.files.len(), 1);
1905        assert!(collected.directories.is_empty());
1906        assert_eq!(collected.files[0].0, file_path);
1907    }
1908
1909    #[cfg(unix)]
1910    #[test]
1911    fn collect_selected_paths_does_not_walk_unselected_siblings() {
1912        use std::os::unix::fs::PermissionsExt;
1913
1914        let temp_dir = TempDir::new().expect("create temp dir");
1915        let root = temp_dir.path();
1916        fs::create_dir_all(root.join("selected/docs")).expect("create selected dir");
1917        fs::create_dir_all(root.join("blocked/secret")).expect("create blocked dir");
1918        fs::write(root.join("selected/docs/guide.md"), "# guide\n").expect("write guide");
1919
1920        let blocked = root.join("blocked");
1921        let mut perms = fs::metadata(&blocked)
1922            .expect("blocked metadata")
1923            .permissions();
1924        perms.set_mode(0o000);
1925        fs::set_permissions(&blocked, perms).expect("remove blocked permissions");
1926
1927        let collected = collect_selected_paths(
1928            root,
1929            &[CollectionFrontier {
1930                path: PathBuf::from("selected"),
1931                recurse: true,
1932            }],
1933            0,
1934            &[],
1935        );
1936
1937        let mut restore = fs::metadata(&blocked)
1938            .expect("blocked metadata")
1939            .permissions();
1940        restore.set_mode(0o755);
1941        fs::set_permissions(&blocked, restore).expect("restore blocked permissions");
1942
1943        assert!(
1944            collected.collection_errors.is_empty(),
1945            "{:#?}",
1946            collected.collection_errors
1947        );
1948        assert!(
1949            collected
1950                .files
1951                .iter()
1952                .any(|(path, _)| path == &root.join("selected/docs/guide.md"))
1953        );
1954        assert!(
1955            collected
1956                .files
1957                .iter()
1958                .all(|(path, _): &(PathBuf, fs::Metadata)| !path.starts_with(&blocked))
1959        );
1960    }
1961
1962    #[test]
1963    fn collect_selected_paths_respects_excluded_ancestor_directories() {
1964        let temp_dir = TempDir::new().expect("create temp dir");
1965        let root = temp_dir.path();
1966        fs::create_dir_all(root.join(".git")).expect("create git dir");
1967        fs::write(
1968            root.join(".git/config"),
1969            "[core]\nrepositoryformatversion = 0\n",
1970        )
1971        .expect("write git config");
1972
1973        let exclude_patterns =
1974            build_collection_exclude_patterns(root, &root.join(".provenant-cache"));
1975        let collected = collect_selected_paths(
1976            root,
1977            &[CollectionFrontier {
1978                path: PathBuf::from(".git/config"),
1979                recurse: false,
1980            }],
1981            0,
1982            &exclude_patterns,
1983        );
1984
1985        assert!(collected.files.is_empty());
1986        assert!(collected.directories.iter().all(|(path, _)| path == root));
1987        assert_eq!(collected.excluded_count, 1);
1988    }
1989
1990    #[test]
1991    fn process_collected_with_memory_limit_preserves_results_when_spilling() {
1992        let temp_dir = TempDir::new().expect("create temp dir");
1993        fs::write(temp_dir.path().join("a.txt"), "hello").expect("write first file");
1994        fs::write(temp_dir.path().join("b.txt"), "world").expect("write second file");
1995
1996        let collected = collect_paths(temp_dir.path(), 0, &[]);
1997        let result = process_collected_with_memory_limit(
1998            &collected,
1999            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
2000            None,
2001            LicenseScanOptions::default(),
2002            &TextDetectionOptions {
2003                collect_info: false,
2004                detect_packages: false,
2005                detect_application_packages: false,
2006                detect_system_packages: false,
2007                detect_packages_in_compiled: false,
2008                detect_copyrights: false,
2009                detect_generated: false,
2010                detect_emails: false,
2011                detect_urls: false,
2012                max_emails: 50,
2013                max_urls: 50,
2014                timeout_seconds: 120.0,
2015            },
2016            MemoryMode::Limit(1),
2017        );
2018
2019        assert_eq!(result.files.len(), 3);
2020    }
2021
2022    #[test]
2023    fn process_collected_with_negative_one_uses_disk_only_mode() {
2024        let temp_dir = TempDir::new().expect("create temp dir");
2025        fs::write(temp_dir.path().join("a.txt"), "hello").expect("write first file");
2026
2027        let collected = collect_paths(temp_dir.path(), 0, &[]);
2028        let result = process_collected_with_memory_limit(
2029            &collected,
2030            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
2031            None,
2032            LicenseScanOptions::default(),
2033            &TextDetectionOptions {
2034                collect_info: false,
2035                detect_packages: false,
2036                detect_application_packages: false,
2037                detect_system_packages: false,
2038                detect_packages_in_compiled: false,
2039                detect_copyrights: false,
2040                detect_generated: false,
2041                detect_emails: false,
2042                detect_urls: false,
2043                max_emails: 50,
2044                max_urls: 50,
2045                timeout_seconds: 120.0,
2046            },
2047            MemoryMode::StreamUnlimited,
2048        );
2049
2050        assert_eq!(result.files.len(), 2);
2051    }
2052}