Skip to main content

provenant/scanner/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4mod collect;
5mod process;
6
7use crate::license_detection::LicenseDetectionEngine;
8use crate::models::FileInfo;
9
/// Result bundle carrying the scanned file entries together with an exclusion counter.
pub struct ProcessResult {
    /// One `FileInfo` entry per scanned item.
    pub files: Vec<FileInfo>,
    /// NOTE(review): presumably the number of paths excluded from the scan — confirm
    /// against the `process` module, which is not visible here.
    pub excluded_count: usize,
}
14
/// Options that shape license scanning.
///
/// Every field is written into the string produced by `scan_options_fingerprint`, so
/// changing any of them yields a different fingerprint. The type is a plain bag of
/// `bool`/`u8` values, so it is cheap to copy and can be compared and hashed by value.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct LicenseScanOptions {
    /// Recorded as `license_text` in the fingerprint.
    /// NOTE(review): presumably includes matched license text in the output — confirm.
    pub include_text: bool,
    /// Recorded as `license_text_diagnostics` in the fingerprint.
    pub include_text_diagnostics: bool,
    /// Recorded as `license_diagnostics` in the fingerprint.
    pub include_diagnostics: bool,
    /// Recorded as `unknown_licenses` in the fingerprint.
    pub unknown_licenses: bool,
    /// Recorded as `license_score` in the fingerprint; defaults to `0`.
    /// NOTE(review): presumably a minimum match score threshold — confirm valid range.
    pub min_score: u8,
}
23
/// Switches and limits for the text-based detectors run on each scanned file.
///
/// Every field is written into the string produced by `scan_options_fingerprint`.
/// `PartialEq` is derived so option sets can be compared directly; `Eq` is not
/// available because `timeout_seconds` is an `f64`.
#[derive(Debug, Clone, PartialEq)]
pub struct TextDetectionOptions {
    /// Populate the file "info" surface (hashes, MIME type, language, flags).
    pub collect_info: bool,
    /// Recorded as `packages` in the fingerprint.
    pub detect_packages: bool,
    /// Recorded as `app_packages` in the fingerprint.
    pub detect_application_packages: bool,
    /// Recorded as `system_packages` in the fingerprint.
    pub detect_system_packages: bool,
    /// Recorded as `compiled_packages` in the fingerprint.
    pub detect_packages_in_compiled: bool,
    /// Enable copyright detection; the only detector enabled by default.
    pub detect_copyrights: bool,
    /// Enable the generated-file check.
    pub detect_generated: bool,
    /// Enable email detection.
    pub detect_emails: bool,
    /// Enable URL detection.
    pub detect_urls: bool,
    /// NOTE(review): presumably a per-file cap on reported emails — confirm in `process`.
    pub max_emails: usize,
    /// NOTE(review): presumably a per-file cap on reported URLs — confirm in `process`.
    pub max_urls: usize,
    /// NOTE(review): presumably a per-file time budget in seconds — confirm in `process`.
    pub timeout_seconds: f64,
}

impl Default for TextDetectionOptions {
    /// Returns the baseline configuration: only copyright detection is enabled, the
    /// email and URL limits are 50 each, and the timeout is 120 seconds.
    fn default() -> Self {
        Self {
            collect_info: false,
            detect_packages: false,
            detect_application_packages: false,
            detect_system_packages: false,
            detect_packages_in_compiled: false,
            detect_copyrights: true,
            detect_generated: false,
            detect_emails: false,
            detect_urls: false,
            max_emails: 50,
            max_urls: 50,
            timeout_seconds: 120.0,
        }
    }
}
58
59pub fn scan_options_fingerprint(
60    text_options: &TextDetectionOptions,
61    license_options: LicenseScanOptions,
62    license_engine: Option<&LicenseDetectionEngine>,
63) -> String {
64    let (license_enabled, rules_count, first_rule_id, last_rule_id) = match license_engine {
65        Some(engine) => {
66            let rules = &engine.index().rules_by_rid;
67            (
68                true,
69                rules.len(),
70                rules
71                    .first()
72                    .map(|rule| rule.identifier.as_str())
73                    .unwrap_or(""),
74                rules
75                    .last()
76                    .map(|rule| rule.identifier.as_str())
77                    .unwrap_or(""),
78            )
79        }
80        None => (false, 0, "", ""),
81    };
82
83    format!(
84        "tool_version={};info={};packages={};app_packages={};system_packages={};compiled_packages={};copyrights={};generated={};emails={};urls={};max_emails={};max_urls={};timeout={:.6};license_enabled={};rules_count={};first_rule_id={};last_rule_id={};license_text={};license_text_diagnostics={};license_diagnostics={};unknown_licenses={};license_score={}",
85        crate::version::BUILD_VERSION,
86        text_options.collect_info,
87        text_options.detect_packages,
88        text_options.detect_application_packages,
89        text_options.detect_system_packages,
90        text_options.detect_packages_in_compiled,
91        text_options.detect_copyrights,
92        text_options.detect_generated,
93        text_options.detect_emails,
94        text_options.detect_urls,
95        text_options.max_emails,
96        text_options.max_urls,
97        text_options.timeout_seconds,
98        license_enabled,
99        rules_count,
100        first_rule_id,
101        last_rule_id,
102        license_options.include_text,
103        license_options.include_text_diagnostics,
104        license_options.include_diagnostics,
105        license_options.unknown_licenses,
106        license_options.min_score,
107    )
108}
109
110pub use self::collect::{CollectedPaths, collect_paths};
111#[allow(unused_imports)]
112pub use self::process::{
113    MemoryMode, process_collected, process_collected_sequential,
114    process_collected_with_memory_limit, process_collected_with_memory_limit_sequential,
115};
116
117#[cfg(test)]
118mod tests {
119    use std::fs;
120    use std::path::PathBuf;
121    use std::sync::Arc;
122
123    use tempfile::TempDir;
124
125    use crate::license_detection::LicenseDetectionEngine;
126    use crate::models::{DatasourceId, FileType, PackageType as FilePackageType};
127    use crate::progress::{ProgressMode, ScanProgress};
128
129    use super::{
130        LicenseScanOptions, MemoryMode, TextDetectionOptions, collect_paths, process_collected,
131        process_collected_with_memory_limit, scan_options_fingerprint,
132    };
133
134    fn build_sparse_oversized_rpm_with_filename(
135        temp_dir: &TempDir,
136        package_name: &str,
137        filename: &str,
138    ) -> PathBuf {
139        let file_path = temp_dir.path().join(filename);
140        rpm::PackageBuilder::new(package_name, "1.0", "MIT", "x86_64", "Demo RPM package")
141            .release("1")
142            .build()
143            .expect("build rpm fixture")
144            .write_file(&file_path)
145            .expect("write rpm fixture");
146        fs::OpenOptions::new()
147            .write(true)
148            .open(&file_path)
149            .expect("open rpm fixture for sparse extension")
150            .set_len(100 * 1024 * 1024 + 1_048_576)
151            .expect("extend rpm fixture");
152        file_path
153    }
154
155    fn build_sparse_oversized_rpm(temp_dir: &TempDir, name: &str) -> PathBuf {
156        build_sparse_oversized_rpm_with_filename(
157            temp_dir,
158            name,
159            &format!("{name}-1.0-1.x86_64.rpm"),
160        )
161    }
162
163    fn build_sparse_oversized_pack_rpm(temp_dir: &TempDir, name: &str) -> PathBuf {
164        build_sparse_oversized_rpm_with_filename(
165            temp_dir,
166            name,
167            &format!("{name}-1.0-1.x86_64.pack"),
168        )
169    }
170
171    #[test]
172    fn default_options_keep_copyright_detection_enabled() {
173        let options = TextDetectionOptions::default();
174        assert!(!options.detect_packages);
175        assert!(options.detect_copyrights);
176    }
177
178    #[test]
179    fn test_scan_options_fingerprint_changes_with_license_score() {
180        let text_options = TextDetectionOptions::default();
181        let default_fingerprint = scan_options_fingerprint(
182            &text_options,
183            LicenseScanOptions {
184                min_score: 0,
185                ..LicenseScanOptions::default()
186            },
187            None,
188        );
189        let filtered_fingerprint = scan_options_fingerprint(
190            &text_options,
191            LicenseScanOptions {
192                min_score: 70,
193                ..LicenseScanOptions::default()
194            },
195            None,
196        );
197
198        assert_ne!(default_fingerprint, filtered_fingerprint);
199    }
200
201    fn scan_single_file(
202        file_name: &str,
203        content: &str,
204        options: &TextDetectionOptions,
205    ) -> crate::models::FileInfo {
206        let temp_dir = TempDir::new().expect("create temp dir");
207        let file_path = temp_dir.path().join(file_name);
208        fs::write(&file_path, content).expect("write test file");
209
210        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
211        let collected = collect_paths(temp_dir.path(), 0, &[]);
212        let result = process_collected(
213            &collected,
214            progress,
215            None,
216            LicenseScanOptions::default(),
217            options,
218        );
219
220        result
221            .files
222            .into_iter()
223            .find(|entry| {
224                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
225            })
226            .expect("scanned file entry")
227    }
228
229    fn scan_file_at_relative_path(
230        relative_path: &str,
231        content: &[u8],
232        options: &TextDetectionOptions,
233    ) -> crate::models::FileInfo {
234        let temp_dir = TempDir::new().expect("create temp dir");
235        let file_path = temp_dir.path().join(relative_path);
236        if let Some(parent) = file_path.parent() {
237            fs::create_dir_all(parent).expect("create parent dirs");
238        }
239        fs::write(&file_path, content).expect("write test file");
240
241        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
242        let collected = collect_paths(temp_dir.path(), 0, &[]);
243        let result = process_collected(
244            &collected,
245            progress,
246            None,
247            LicenseScanOptions::default(),
248            options,
249        );
250
251        result
252            .files
253            .into_iter()
254            .find(|entry| {
255                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
256            })
257            .expect("scanned file entry")
258    }
259
260    fn scan_single_file_with_license_engine(
261        file_name: &str,
262        content: &str,
263        options: &TextDetectionOptions,
264    ) -> crate::models::FileInfo {
265        let temp_dir = TempDir::new().expect("create temp dir");
266        let file_path = temp_dir.path().join(file_name);
267        fs::write(&file_path, content).expect("write test file");
268
269        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
270        let collected = collect_paths(temp_dir.path(), 0, &[]);
271        let engine =
272            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
273        let result = process_collected(
274            &collected,
275            progress,
276            Some(engine),
277            LicenseScanOptions::default(),
278            options,
279        );
280
281        result
282            .files
283            .into_iter()
284            .find(|entry| {
285                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
286            })
287            .expect("scanned file entry")
288    }
289
290    #[test]
291    fn scanner_reports_repeated_email_occurrences() {
292        let options = TextDetectionOptions {
293            collect_info: false,
294            detect_packages: false,
295            detect_application_packages: false,
296            detect_system_packages: false,
297            detect_packages_in_compiled: false,
298            detect_copyrights: false,
299            detect_generated: false,
300            detect_emails: true,
301            detect_urls: false,
302            max_emails: 50,
303            max_urls: 50,
304            timeout_seconds: 120.0,
305        };
306        let scanned = scan_single_file(
307            "contacts.txt",
308            "linux@3ware.com\nlinux@3ware.com\nandre@suse.com\nlinux@3ware.com\n",
309            &options,
310        );
311
312        let emails: Vec<(&str, usize)> = scanned
313            .emails
314            .iter()
315            .map(|email| (email.email.as_str(), email.start_line.get()))
316            .collect();
317
318        assert_eq!(emails.len(), 4, "emails: {emails:#?}");
319        assert_eq!(
320            emails,
321            vec![
322                ("linux@3ware.com", 1),
323                ("linux@3ware.com", 2),
324                ("andre@suse.com", 3),
325                ("linux@3ware.com", 4),
326            ]
327        );
328    }
329
330    #[test]
331    fn scanner_skips_pem_certificate_text_detection() {
332        let options = TextDetectionOptions {
333            collect_info: false,
334            detect_packages: false,
335            detect_application_packages: false,
336            detect_system_packages: false,
337            detect_packages_in_compiled: false,
338            detect_copyrights: true,
339            detect_generated: false,
340            detect_emails: true,
341            detect_urls: true,
342            max_emails: 50,
343            max_urls: 50,
344            timeout_seconds: 120.0,
345        };
346        let pem_fixture = concat!(
347            "-----BEGIN CERTIFICATE-----\n",
348            "MIID8TCCAtmgAwIBAgIQQT1yx/RrH4FDffHSKFTfmjANBgkqhkiG9w0BAQUFADCB\n",
349            "ijELMAkGA1UEBhMCQ0gxEDAOBgNVBAoTB1dJU2VLZXkxGzAZBgNVBAsTEkNvcHly\n",
350            "-----END CERTIFICATE-----\n",
351            "Certificate:\n",
352            "    Data:\n",
353            "        Signature Algorithm: sha1WithRSAEncryption\n",
354            "        Issuer: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
355            "        Subject: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
356            "        Contact: cert-owner@example.com\n",
357        );
358        let scanned = scan_single_file("cert.pem", pem_fixture, &options);
359
360        assert!(
361            scanned.copyrights.is_empty(),
362            "copyrights: {:#?}",
363            scanned.copyrights
364        );
365        assert!(
366            scanned.holders.is_empty(),
367            "holders: {:#?}",
368            scanned.holders
369        );
370        assert!(
371            scanned.authors.is_empty(),
372            "authors: {:#?}",
373            scanned.authors
374        );
375        assert!(scanned.emails.is_empty(), "emails: {:#?}", scanned.emails);
376        assert!(scanned.urls.is_empty(), "urls: {:#?}", scanned.urls);
377        assert!(
378            scanned.license_detections.is_empty(),
379            "licenses: {:#?}",
380            scanned.license_detections
381        );
382        assert!(
383            scanned.license_clues.is_empty(),
384            "license clues: {:#?}",
385            scanned.license_clues
386        );
387    }
388
389    #[test]
390    fn scanner_keeps_source_headers_when_pem_blocks_are_embedded() {
391        let options = TextDetectionOptions {
392            collect_info: false,
393            detect_packages: false,
394            detect_application_packages: false,
395            detect_system_packages: false,
396            detect_packages_in_compiled: false,
397            detect_copyrights: true,
398            detect_generated: false,
399            detect_emails: false,
400            detect_urls: true,
401            max_emails: 50,
402            max_urls: 50,
403            timeout_seconds: 120.0,
404        };
405        let fixture = concat!(
406            "/*\n",
407            "Copyright 2022 The Kubernetes Authors.\n\n",
408            "Licensed under the Apache License, Version 2.0 (the \"License\");\n",
409            "you may not use this file except in compliance with the License.\n",
410            "You may obtain a copy of the License at\n\n",
411            "    http://www.apache.org/licenses/LICENSE-2.0\n",
412            "*/\n\n",
413            "package storage\n\n",
414            "const validCert = `\n",
415            "-----BEGIN CERTIFICATE-----\n",
416            "MIIDmTCCAoGgAwIBAgIUWQ==\n",
417            "-----END CERTIFICATE-----\n",
418            "`\n",
419        );
420        let temp_dir = TempDir::new().expect("create temp dir");
421        let file_path = temp_dir.path().join("storage_test.go");
422        fs::write(&file_path, fixture).expect("write fixture");
423
424        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
425        let collected = collect_paths(temp_dir.path(), 0, &[]);
426        let engine =
427            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
428        let result = process_collected(
429            &collected,
430            progress,
431            Some(engine),
432            LicenseScanOptions::default(),
433            &options,
434        );
435        let scanned = result
436            .files
437            .into_iter()
438            .find(|entry| {
439                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
440            })
441            .expect("scanned file entry");
442
443        assert!(
444            scanned
445                .copyrights
446                .iter()
447                .any(|c| c.copyright == "Copyright 2022 The Kubernetes Authors"),
448            "copyrights: {:#?}",
449            scanned.copyrights
450        );
451        assert!(
452            scanned
453                .holders
454                .iter()
455                .any(|h| h.holder == "The Kubernetes Authors"),
456            "holders: {:#?}",
457            scanned.holders
458        );
459        assert!(
460            scanned
461                .urls
462                .iter()
463                .any(|u| u.url == "http://www.apache.org/licenses/LICENSE-2.0"),
464            "urls: {:#?}",
465            scanned.urls
466        );
467        assert_eq!(scanned.license_expression.as_deref(), Some("Apache-2.0"));
468    }
469
470    #[test]
471    fn scanner_detects_structured_credits_authors() {
472        let options = TextDetectionOptions {
473            collect_info: false,
474            detect_packages: false,
475            detect_application_packages: false,
476            detect_system_packages: false,
477            detect_packages_in_compiled: false,
478            detect_copyrights: true,
479            detect_generated: false,
480            detect_emails: false,
481            detect_urls: false,
482            max_emails: 50,
483            max_urls: 50,
484            timeout_seconds: 120.0,
485        };
486        let credits_fixture = concat!(
487            "N: Jack Lloyd\n",
488            "E: lloyd@randombit.net\n",
489            "W: http://www.randombit.net/\n",
490        );
491        let scanned = scan_single_file("CREDITS", credits_fixture, &options);
492
493        let authors: Vec<(&str, usize, usize)> = scanned
494            .authors
495            .iter()
496            .map(|author| {
497                (
498                    author.author.as_str(),
499                    author.start_line.get(),
500                    author.end_line.get(),
501                )
502            })
503            .collect();
504
505        assert_eq!(
506            authors,
507            vec![(
508                "Jack Lloyd lloyd@randombit.net http://www.randombit.net/",
509                1,
510                3,
511            )]
512        );
513        assert!(scanned.copyrights.is_empty());
514        assert!(scanned.holders.is_empty());
515    }
516
517    #[test]
518    fn scanner_uses_or_for_alternative_license_header() {
519        let fixture =
520            include_str!("../../testdata/license-golden/datadriven/external/boost-json-d2s.ipp");
521        let temp_dir = TempDir::new().expect("create temp dir");
522        let file_path = temp_dir.path().join("d2s.ipp");
523        fs::write(&file_path, fixture).expect("write fixture");
524
525        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
526        let collected = collect_paths(temp_dir.path(), 0, &[]);
527        let engine =
528            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
529        let result = process_collected(
530            &collected,
531            progress,
532            Some(engine),
533            LicenseScanOptions::default(),
534            &TextDetectionOptions::default(),
535        );
536        let scanned = result
537            .files
538            .into_iter()
539            .find(|entry| {
540                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
541            })
542            .expect("scanned file entry");
543
544        assert_eq!(
545            scanned.license_expression.as_deref(),
546            Some("Apache-2.0 OR BSL-1.0")
547        );
548        assert!(
549            scanned.license_clues.is_empty(),
550            "license clues: {:#?}",
551            scanned.license_clues
552        );
553        assert_eq!(
554            scanned.license_detections.len(),
555            1,
556            "detections: {:#?}",
557            scanned.license_detections
558        );
559
560        let detection = &scanned.license_detections[0];
561        assert_eq!(detection.license_expression_spdx, "Apache-2.0 OR BSL-1.0");
562
563        let match_expressions: Vec<_> = detection
564            .matches
565            .iter()
566            .map(|m| m.license_expression_spdx.as_str())
567            .collect();
568        assert_eq!(match_expressions, vec!["Apache-2.0", "BSL-1.0"]);
569    }
570
571    #[test]
572    fn scanner_sets_generated_flag_when_enabled() {
573        let options = TextDetectionOptions {
574            collect_info: false,
575            detect_packages: false,
576            detect_application_packages: false,
577            detect_system_packages: false,
578            detect_packages_in_compiled: false,
579            detect_copyrights: false,
580            detect_generated: true,
581            detect_emails: false,
582            detect_urls: false,
583            max_emails: 50,
584            max_urls: 50,
585            timeout_seconds: 120.0,
586        };
587        let scanned = scan_single_file(
588            "generated.c",
589            "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
590            &options,
591        );
592
593        assert_eq!(scanned.is_generated, Some(true));
594    }
595
596    #[test]
597    fn scanner_leaves_generated_flag_unset_when_disabled() {
598        let options = TextDetectionOptions {
599            collect_info: false,
600            detect_packages: false,
601            detect_application_packages: false,
602            detect_system_packages: false,
603            detect_packages_in_compiled: false,
604            detect_copyrights: false,
605            detect_generated: false,
606            detect_emails: false,
607            detect_urls: false,
608            max_emails: 50,
609            max_urls: 50,
610            timeout_seconds: 120.0,
611        };
612        let scanned = scan_single_file(
613            "generated.c",
614            "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
615            &options,
616        );
617
618        assert_eq!(scanned.is_generated, None);
619    }
620
621    #[test]
622    fn scanner_populates_info_surface_when_enabled() {
623        let options = TextDetectionOptions {
624            collect_info: true,
625            detect_packages: false,
626            detect_application_packages: false,
627            detect_system_packages: false,
628            detect_packages_in_compiled: false,
629            detect_copyrights: false,
630            detect_generated: false,
631            detect_emails: false,
632            detect_urls: false,
633            max_emails: 50,
634            max_urls: 50,
635            timeout_seconds: 120.0,
636        };
637        let scanned = scan_single_file(
638            "script.py",
639            "#!/usr/bin/env python3\nprint(\"hello\")\n",
640            &options,
641        );
642
643        assert!(scanned.sha1.is_some());
644        assert!(scanned.md5.is_some());
645        assert!(scanned.sha256.is_some());
646        assert!(scanned.sha1_git.is_some());
647        assert!(scanned.mime_type.is_some());
648        assert!(scanned.date.is_some());
649        assert_eq!(scanned.programming_language.as_deref(), Some("Python"));
650        assert_eq!(scanned.is_text, Some(true));
651        assert_eq!(scanned.is_script, Some(true));
652        assert_eq!(scanned.is_source, Some(true));
653    }
654
655    #[test]
656    fn scanner_treats_latin1_python_sources_as_textual_scripts() {
657        let options = TextDetectionOptions {
658            collect_info: true,
659            detect_packages: false,
660            detect_application_packages: false,
661            detect_system_packages: false,
662            detect_packages_in_compiled: false,
663            detect_copyrights: false,
664            detect_generated: false,
665            detect_emails: false,
666            detect_urls: false,
667            max_emails: 50,
668            max_urls: 50,
669            timeout_seconds: 120.0,
670        };
671        let latin1_python = b"# coding: latin-1\nprint(\"caf\xe9\")\n# comment padding\n";
672        let scanned = scan_file_at_relative_path("script.py", latin1_python, &options);
673
674        assert_eq!(scanned.programming_language.as_deref(), Some("Python"));
675        assert_eq!(
676            scanned.file_type_label.as_deref(),
677            Some("python script, text executable")
678        );
679        assert_eq!(scanned.is_binary, Some(false));
680        assert_eq!(scanned.is_text, Some(true));
681        assert_eq!(scanned.is_script, Some(true));
682        assert_eq!(scanned.is_source, Some(true));
683    }
684
685    #[test]
686    fn scanner_skips_findings_for_zip_like_archives() {
687        let options = TextDetectionOptions {
688            collect_info: true,
689            detect_packages: false,
690            detect_application_packages: false,
691            detect_system_packages: false,
692            detect_packages_in_compiled: false,
693            detect_copyrights: true,
694            detect_generated: false,
695            detect_emails: true,
696            detect_urls: true,
697            max_emails: 50,
698            max_urls: 50,
699            timeout_seconds: 120.0,
700        };
701        let archive_like = b"PK\x03\x04\x14\x00\x00\x00\x08\x00MIT License\ncontact@example.com\nhttps://example.com\n";
702        let scanned = scan_file_at_relative_path("demo.whl", archive_like, &options);
703
704        assert_eq!(scanned.mime_type.as_deref(), Some("application/zip"));
705        assert_eq!(scanned.is_archive, Some(true));
706        assert!(scanned.license_detections.is_empty());
707        assert!(scanned.copyrights.is_empty());
708        assert!(scanned.emails.is_empty());
709        assert!(scanned.urls.is_empty());
710    }
711
712    #[test]
713    fn scanner_treats_typescript_sources_as_text_not_video_media() {
714        let options = TextDetectionOptions {
715            collect_info: true,
716            detect_packages: false,
717            detect_application_packages: false,
718            detect_system_packages: false,
719            detect_packages_in_compiled: false,
720            detect_copyrights: false,
721            detect_generated: false,
722            detect_emails: false,
723            detect_urls: false,
724            max_emails: 50,
725            max_urls: 50,
726            timeout_seconds: 120.0,
727        };
728        let scanned = scan_single_file("main.ts", "export const answer: number = 42;\n", &options);
729
730        assert_eq!(scanned.programming_language.as_deref(), Some("TypeScript"));
731        assert_eq!(scanned.mime_type.as_deref(), Some("text/plain"));
732        assert_eq!(
733            scanned.file_type_label.as_deref(),
734            Some("UTF-8 Unicode text")
735        );
736        assert_eq!(scanned.is_text, Some(true));
737        assert_eq!(scanned.is_media, Some(false));
738        assert_eq!(scanned.is_script, Some(false));
739        assert_eq!(scanned.is_source, Some(true));
740    }
741
742    #[test]
743    fn scanner_normalizes_sparse_ts_files_away_from_video_mime() {
744        let options = TextDetectionOptions {
745            collect_info: true,
746            detect_packages: false,
747            detect_application_packages: false,
748            detect_system_packages: false,
749            detect_packages_in_compiled: false,
750            detect_copyrights: false,
751            detect_generated: false,
752            detect_emails: false,
753            detect_urls: false,
754            max_emails: 50,
755            max_urls: 50,
756            timeout_seconds: 120.0,
757        };
758        let scanned = scan_single_file("main.ts", "// comment-only TypeScript fixture\n", &options);
759
760        assert_eq!(scanned.mime_type.as_deref(), Some("text/plain"));
761        assert_eq!(
762            scanned.file_type_label.as_deref(),
763            Some("UTF-8 Unicode text")
764        );
765        assert_eq!(scanned.is_text, Some(true));
766        assert_eq!(scanned.is_media, Some(false));
767        assert_eq!(scanned.is_script, Some(false));
768        assert_eq!(scanned.is_source, Some(true));
769    }
770
771    #[test]
772    fn scanner_treats_empty_files_like_scancode_info_surface() {
773        let options = TextDetectionOptions {
774            collect_info: true,
775            detect_packages: false,
776            detect_application_packages: false,
777            detect_system_packages: false,
778            detect_packages_in_compiled: false,
779            detect_copyrights: false,
780            detect_generated: false,
781            detect_emails: false,
782            detect_urls: false,
783            max_emails: 50,
784            max_urls: 50,
785            timeout_seconds: 120.0,
786        };
787        let scanned = scan_single_file("test.txt", "", &options);
788
789        assert_eq!(scanned.mime_type.as_deref(), Some("inode/x-empty"));
790        assert_eq!(scanned.file_type_label.as_deref(), Some("empty"));
791        assert_eq!(scanned.programming_language, None);
792        assert_eq!(scanned.is_binary, Some(false));
793        assert_eq!(scanned.is_text, Some(true));
794        assert_eq!(scanned.is_archive, Some(false));
795        assert_eq!(scanned.is_media, Some(false));
796        assert_eq!(scanned.is_source, Some(false));
797        assert_eq!(scanned.is_script, Some(false));
798    }
799
800    #[test]
801    fn scanner_treats_package_json_as_text_not_source() {
802        let options = TextDetectionOptions {
803            collect_info: true,
804            detect_packages: false,
805            detect_application_packages: false,
806            detect_system_packages: false,
807            detect_packages_in_compiled: false,
808            detect_copyrights: false,
809            detect_generated: false,
810            detect_emails: false,
811            detect_urls: false,
812            max_emails: 50,
813            max_urls: 50,
814            timeout_seconds: 120.0,
815        };
816        let scanned = scan_single_file("package.json", r#"{"name":"demo"}"#, &options);
817
818        assert_eq!(scanned.mime_type.as_deref(), Some("application/json"));
819        assert_eq!(scanned.file_type_label.as_deref(), Some("JSON text data"));
820        assert_eq!(scanned.programming_language, None);
821        assert_eq!(scanned.is_text, Some(true));
822        assert_eq!(scanned.is_source, Some(false));
823        assert_eq!(scanned.is_script, Some(false));
824    }
825
826    #[test]
827    fn scanner_classifies_gradle_and_nix_manifests_as_source() {
828        let options = TextDetectionOptions {
829            collect_info: true,
830            detect_packages: false,
831            detect_application_packages: false,
832            detect_system_packages: false,
833            detect_packages_in_compiled: false,
834            detect_copyrights: false,
835            detect_generated: false,
836            detect_emails: false,
837            detect_urls: false,
838            max_emails: 50,
839            max_urls: 50,
840            timeout_seconds: 120.0,
841        };
842
843        let gradle = scan_single_file("build.gradle", "plugins { id 'java' }\n", &options);
844        let nix = scan_single_file("flake.nix", "{ inputs, ... }: {}\n", &options);
845
846        assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
847        assert_eq!(gradle.mime_type.as_deref(), Some("text/plain"));
848        assert_eq!(gradle.is_source, Some(true));
849        assert_eq!(gradle.is_script, Some(false));
850
851        assert_eq!(nix.programming_language.as_deref(), Some("Nix"));
852        assert_eq!(nix.mime_type.as_deref(), Some("text/plain"));
853        assert_eq!(nix.is_source, Some(true));
854        assert_eq!(nix.is_script, Some(false));
855    }
856
857    #[test]
858    fn scanner_treats_gitmodules_as_text_not_source() {
859        let options = TextDetectionOptions {
860            collect_info: true,
861            detect_packages: false,
862            detect_application_packages: false,
863            detect_system_packages: false,
864            detect_packages_in_compiled: false,
865            detect_copyrights: false,
866            detect_generated: false,
867            detect_emails: false,
868            detect_urls: false,
869            max_emails: 50,
870            max_urls: 50,
871            timeout_seconds: 120.0,
872        };
873        let scanned = scan_file_at_relative_path(
874            ".gitmodules",
875            b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
876            &options,
877        );
878
879        assert_eq!(scanned.programming_language, None);
880        assert_eq!(
881            scanned.file_type_label.as_deref(),
882            Some("Git configuration text")
883        );
884        assert_eq!(scanned.is_text, Some(true));
885        assert_eq!(scanned.is_source, Some(false));
886        assert_eq!(scanned.is_script, Some(false));
887    }
888
889    #[test]
890    fn scanner_treats_javascript_shebang_files_as_scripts() {
891        let options = TextDetectionOptions {
892            collect_info: true,
893            detect_packages: false,
894            detect_application_packages: false,
895            detect_system_packages: false,
896            detect_packages_in_compiled: false,
897            detect_copyrights: false,
898            detect_generated: false,
899            detect_emails: false,
900            detect_urls: false,
901            max_emails: 50,
902            max_urls: 50,
903            timeout_seconds: 120.0,
904        };
905        let scanned = scan_file_at_relative_path(
906            "bin/run",
907            b"#!/usr/bin/env node\nconsole.log('hello');\n",
908            &options,
909        );
910
911        assert_eq!(scanned.programming_language.as_deref(), Some("JavaScript"));
912        assert_eq!(
913            scanned.file_type_label.as_deref(),
914            Some("javascript script, UTF-8 Unicode text executable")
915        );
916        assert_eq!(scanned.is_script, Some(true));
917        assert_eq!(scanned.is_source, Some(true));
918    }
919
920    #[test]
921    fn scanner_treats_dockerfile_as_source() {
922        let options = TextDetectionOptions {
923            collect_info: true,
924            detect_packages: false,
925            detect_application_packages: false,
926            detect_system_packages: false,
927            detect_packages_in_compiled: false,
928            detect_copyrights: false,
929            detect_generated: false,
930            detect_emails: false,
931            detect_urls: false,
932            max_emails: 50,
933            max_urls: 50,
934            timeout_seconds: 120.0,
935        };
936        let scanned = scan_single_file("Dockerfile", "FROM scratch\n", &options);
937
938        assert_eq!(scanned.programming_language.as_deref(), Some("Dockerfile"));
939        assert_eq!(
940            scanned.file_type_label.as_deref(),
941            Some("UTF-8 Unicode text")
942        );
943        assert_eq!(scanned.is_source, Some(true));
944        assert_eq!(scanned.is_script, Some(false));
945    }
946
947    #[test]
948    fn scanner_treats_makefile_as_text_not_source() {
949        let options = TextDetectionOptions {
950            collect_info: true,
951            detect_packages: false,
952            detect_application_packages: false,
953            detect_system_packages: false,
954            detect_packages_in_compiled: false,
955            detect_copyrights: false,
956            detect_generated: false,
957            detect_emails: false,
958            detect_urls: false,
959            max_emails: 50,
960            max_urls: 50,
961            timeout_seconds: 120.0,
962        };
963        let scanned = scan_single_file("Makefile", "all:\n\techo hi\n", &options);
964
965        assert_eq!(scanned.programming_language, None);
966        assert_eq!(
967            scanned.file_type_label.as_deref(),
968            Some("UTF-8 Unicode text")
969        );
970        assert_eq!(scanned.is_text, Some(true));
971        assert_eq!(scanned.is_source, Some(false));
972        assert_eq!(scanned.is_script, Some(false));
973    }
974
975    #[test]
976    fn scanner_omits_info_surface_when_disabled() {
977        let options = TextDetectionOptions {
978            collect_info: false,
979            detect_packages: false,
980            detect_application_packages: false,
981            detect_system_packages: false,
982            detect_packages_in_compiled: false,
983            detect_copyrights: false,
984            detect_generated: false,
985            detect_emails: false,
986            detect_urls: false,
987            max_emails: 50,
988            max_urls: 50,
989            timeout_seconds: 120.0,
990        };
991        let scanned = scan_single_file(
992            "script.py",
993            "#!/usr/bin/env python3\nprint(\"hello\")\n",
994            &options,
995        );
996
997        assert!(scanned.sha1.is_none());
998        assert!(scanned.md5.is_none());
999        assert!(scanned.sha256.is_none());
1000        assert!(scanned.sha1_git.is_none());
1001        assert!(scanned.mime_type.is_none());
1002        assert!(scanned.date.is_none());
1003        assert!(scanned.programming_language.is_none());
1004        assert!(scanned.is_binary.is_none());
1005        assert!(scanned.is_text.is_none());
1006        assert!(scanned.is_archive.is_none());
1007        assert!(scanned.is_media.is_none());
1008        assert!(scanned.is_script.is_none());
1009        assert!(scanned.is_source.is_none());
1010    }
1011
1012    #[test]
1013    fn scanner_skips_package_parsing_when_disabled() {
1014        let options = TextDetectionOptions {
1015            collect_info: false,
1016            detect_packages: false,
1017            detect_application_packages: false,
1018            detect_system_packages: false,
1019            detect_packages_in_compiled: false,
1020            detect_copyrights: false,
1021            detect_generated: false,
1022            detect_emails: false,
1023            detect_urls: false,
1024            max_emails: 50,
1025            max_urls: 50,
1026            timeout_seconds: 120.0,
1027        };
1028        let scanned = scan_single_file(
1029            "package.json",
1030            r#"{"name":"demo","version":"1.0.0"}"#,
1031            &options,
1032        );
1033
1034        assert!(
1035            scanned.package_data.is_empty(),
1036            "package_data: {:#?}",
1037            scanned.package_data
1038        );
1039    }
1040
1041    #[test]
1042    fn scanner_parses_package_manifests_when_enabled() {
1043        let options = TextDetectionOptions {
1044            collect_info: false,
1045            detect_packages: true,
1046            detect_application_packages: true,
1047            detect_system_packages: false,
1048            detect_packages_in_compiled: false,
1049            detect_copyrights: false,
1050            detect_generated: false,
1051            detect_emails: false,
1052            detect_urls: false,
1053            max_emails: 50,
1054            max_urls: 50,
1055            timeout_seconds: 120.0,
1056        };
1057        let scanned = scan_single_file(
1058            "package.json",
1059            r#"{"name":"demo","version":"1.0.0"}"#,
1060            &options,
1061        );
1062
1063        assert_eq!(
1064            scanned.package_data.len(),
1065            1,
1066            "package_data: {:#?}",
1067            scanned.package_data
1068        );
1069    }
1070
1071    #[test]
1072    fn scanner_parses_oversized_rpm_in_package_only_mode_without_size_warning() {
1073        let temp_dir = TempDir::new().expect("create temp dir");
1074        let file_path = build_sparse_oversized_rpm(&temp_dir, "oversized-demo");
1075
1076        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1077        let collected = collect_paths(temp_dir.path(), 0, &[]);
1078        let result = process_collected(
1079            &collected,
1080            progress,
1081            None,
1082            LicenseScanOptions::default(),
1083            &TextDetectionOptions {
1084                collect_info: false,
1085                detect_packages: true,
1086                detect_application_packages: true,
1087                detect_system_packages: false,
1088                detect_packages_in_compiled: false,
1089                detect_copyrights: false,
1090                detect_generated: false,
1091                detect_emails: false,
1092                detect_urls: false,
1093                max_emails: 50,
1094                max_urls: 50,
1095                timeout_seconds: 120.0,
1096            },
1097        );
1098
1099        let scanned = result
1100            .files
1101            .into_iter()
1102            .find(|entry| {
1103                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1104            })
1105            .expect("scanned file entry");
1106
1107        assert!(
1108            scanned.scan_errors.is_empty(),
1109            "scan_errors: {:#?}",
1110            scanned.scan_errors
1111        );
1112        assert_eq!(
1113            scanned.package_data.len(),
1114            1,
1115            "package_data: {:#?}",
1116            scanned.package_data
1117        );
1118        assert_eq!(
1119            scanned.package_data[0].datasource_id,
1120            Some(DatasourceId::RpmArchive)
1121        );
1122        assert_eq!(
1123            scanned.package_data[0].name.as_deref(),
1124            Some("oversized-demo")
1125        );
1126        assert_eq!(scanned.package_data[0].version.as_deref(), Some("1.0-1"));
1127    }
1128
1129    #[test]
1130    fn scanner_parses_oversized_rpm_with_info_without_timeout_or_size_warning() {
1131        let temp_dir = TempDir::new().expect("create temp dir");
1132        let file_path = build_sparse_oversized_rpm(&temp_dir, "oversized-info-demo");
1133
1134        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1135        let collected = collect_paths(temp_dir.path(), 0, &[]);
1136        let result = process_collected(
1137            &collected,
1138            progress,
1139            None,
1140            LicenseScanOptions::default(),
1141            &TextDetectionOptions {
1142                collect_info: true,
1143                detect_packages: true,
1144                detect_application_packages: true,
1145                detect_system_packages: false,
1146                detect_packages_in_compiled: false,
1147                detect_copyrights: false,
1148                detect_generated: false,
1149                detect_emails: false,
1150                detect_urls: false,
1151                max_emails: 50,
1152                max_urls: 50,
1153                timeout_seconds: 120.0,
1154            },
1155        );
1156
1157        let scanned = result
1158            .files
1159            .into_iter()
1160            .find(|entry| {
1161                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1162            })
1163            .expect("scanned file entry");
1164
1165        assert!(
1166            scanned.scan_errors.is_empty(),
1167            "scan_errors: {:#?}",
1168            scanned.scan_errors
1169        );
1170        assert_eq!(
1171            scanned.package_data.len(),
1172            1,
1173            "package_data: {:#?}",
1174            scanned.package_data
1175        );
1176        assert_eq!(
1177            scanned.package_data[0].datasource_id,
1178            Some(DatasourceId::RpmArchive)
1179        );
1180        assert_eq!(
1181            scanned.package_data[0].name.as_deref(),
1182            Some("oversized-info-demo")
1183        );
1184        assert!(scanned.sha1.is_some());
1185        assert!(scanned.md5.is_some());
1186        assert!(scanned.sha256.is_some());
1187        assert!(scanned.sha1_git.is_some());
1188        assert_eq!(scanned.mime_type.as_deref(), Some("application/x-rpm"));
1189        assert_eq!(scanned.file_type_label.as_deref(), Some("RPM package"));
1190        assert_eq!(scanned.is_binary, Some(true));
1191        assert_eq!(scanned.is_text, Some(false));
1192        assert_eq!(scanned.is_archive, Some(true));
1193    }
1194
1195    #[test]
1196    fn scanner_parses_oversized_pack_rpm_in_package_only_mode_without_size_warning() {
1197        let temp_dir = TempDir::new().expect("create temp dir");
1198        let file_path = build_sparse_oversized_pack_rpm(&temp_dir, "oversized-pack-demo");
1199
1200        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1201        let collected = collect_paths(temp_dir.path(), 0, &[]);
1202        let result = process_collected(
1203            &collected,
1204            progress,
1205            None,
1206            LicenseScanOptions::default(),
1207            &TextDetectionOptions {
1208                collect_info: false,
1209                detect_packages: true,
1210                detect_application_packages: true,
1211                detect_system_packages: false,
1212                detect_packages_in_compiled: false,
1213                detect_copyrights: false,
1214                detect_generated: false,
1215                detect_emails: false,
1216                detect_urls: false,
1217                max_emails: 50,
1218                max_urls: 50,
1219                timeout_seconds: 120.0,
1220            },
1221        );
1222
1223        let scanned = result
1224            .files
1225            .into_iter()
1226            .find(|entry| {
1227                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1228            })
1229            .expect("scanned file entry");
1230
1231        assert!(
1232            scanned.scan_errors.is_empty(),
1233            "scan_errors: {:#?}",
1234            scanned.scan_errors
1235        );
1236        assert_eq!(
1237            scanned.package_data.len(),
1238            1,
1239            "package_data: {:#?}",
1240            scanned.package_data
1241        );
1242        assert_eq!(
1243            scanned.package_data[0].datasource_id,
1244            Some(DatasourceId::RpmArchive)
1245        );
1246        assert_eq!(
1247            scanned.package_data[0].name.as_deref(),
1248            Some("oversized-pack-demo")
1249        );
1250    }
1251
1252    #[test]
1253    fn scanner_parses_oversized_pack_rpm_with_info_without_timeout_or_size_warning() {
1254        let temp_dir = TempDir::new().expect("create temp dir");
1255        let file_path = build_sparse_oversized_pack_rpm(&temp_dir, "oversized-pack-info-demo");
1256
1257        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1258        let collected = collect_paths(temp_dir.path(), 0, &[]);
1259        let result = process_collected(
1260            &collected,
1261            progress,
1262            None,
1263            LicenseScanOptions::default(),
1264            &TextDetectionOptions {
1265                collect_info: true,
1266                detect_packages: true,
1267                detect_application_packages: true,
1268                detect_system_packages: false,
1269                detect_packages_in_compiled: false,
1270                detect_copyrights: false,
1271                detect_generated: false,
1272                detect_emails: false,
1273                detect_urls: false,
1274                max_emails: 50,
1275                max_urls: 50,
1276                timeout_seconds: 120.0,
1277            },
1278        );
1279
1280        let scanned = result
1281            .files
1282            .into_iter()
1283            .find(|entry| {
1284                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1285            })
1286            .expect("scanned file entry");
1287
1288        assert!(
1289            scanned.scan_errors.is_empty(),
1290            "scan_errors: {:#?}",
1291            scanned.scan_errors
1292        );
1293        assert_eq!(
1294            scanned.package_data.len(),
1295            1,
1296            "package_data: {:#?}",
1297            scanned.package_data
1298        );
1299        assert_eq!(
1300            scanned.package_data[0].datasource_id,
1301            Some(DatasourceId::RpmArchive)
1302        );
1303        assert_eq!(
1304            scanned.package_data[0].name.as_deref(),
1305            Some("oversized-pack-info-demo")
1306        );
1307        assert!(scanned.sha1.is_some());
1308        assert!(scanned.md5.is_some());
1309        assert!(scanned.sha256.is_some());
1310        assert!(scanned.sha1_git.is_some());
1311        assert_eq!(scanned.mime_type.as_deref(), Some("application/x-rpm"));
1312        assert_eq!(scanned.file_type_label.as_deref(), Some("RPM package"));
1313        assert_eq!(scanned.is_binary, Some(true));
1314        assert_eq!(scanned.is_text, Some(false));
1315        assert_eq!(scanned.is_archive, Some(true));
1316    }
1317
1318    #[test]
1319    fn scanner_skips_application_packages_when_only_system_packages_enabled() {
1320        let options = TextDetectionOptions {
1321            collect_info: false,
1322            detect_packages: true,
1323            detect_application_packages: false,
1324            detect_system_packages: true,
1325            detect_packages_in_compiled: false,
1326            detect_copyrights: false,
1327            detect_generated: false,
1328            detect_emails: false,
1329            detect_urls: false,
1330            max_emails: 50,
1331            max_urls: 50,
1332            timeout_seconds: 120.0,
1333        };
1334        let scanned = scan_single_file(
1335            "package.json",
1336            r#"{"name":"demo","version":"1.0.0"}"#,
1337            &options,
1338        );
1339
1340        assert!(
1341            scanned.package_data.is_empty(),
1342            "package_data: {:#?}",
1343            scanned.package_data
1344        );
1345    }
1346
1347    #[test]
1348    fn scanner_parses_system_package_files_when_enabled() {
1349        let options = TextDetectionOptions {
1350            collect_info: false,
1351            detect_packages: true,
1352            detect_application_packages: false,
1353            detect_system_packages: true,
1354            detect_packages_in_compiled: false,
1355            detect_copyrights: false,
1356            detect_generated: false,
1357            detect_emails: false,
1358            detect_urls: false,
1359            max_emails: 50,
1360            max_urls: 50,
1361            timeout_seconds: 120.0,
1362        };
1363        let scanned = scan_file_at_relative_path(
1364            "var/lib/dpkg/status",
1365            b"Package: demo\nVersion: 1.0\nArchitecture: all\nDescription: demo package\n\n",
1366            &options,
1367        );
1368
1369        assert!(
1370            !scanned.package_data.is_empty(),
1371            "package_data: {:#?}",
1372            scanned.package_data
1373        );
1374    }
1375
1376    #[test]
1377    fn scanner_only_parses_compiled_packages_when_package_in_compiled_is_enabled() {
1378        if std::process::Command::new("go")
1379            .arg("version")
1380            .status()
1381            .is_err()
1382        {
1383            return;
1384        }
1385
1386        let temp_dir = TempDir::new().expect("create temp dir");
1387        fs::write(
1388            temp_dir.path().join("go.mod"),
1389            "module example.com/demo\n\ngo 1.23.0\n",
1390        )
1391        .expect("write go.mod");
1392        fs::write(
1393            temp_dir.path().join("main.go"),
1394            "package main\nfunc main() {}\n",
1395        )
1396        .expect("write main.go");
1397        let file_path = temp_dir.path().join("demo");
1398        let status = std::process::Command::new("go")
1399            .current_dir(temp_dir.path())
1400            .args(["build", "-o"])
1401            .arg(&file_path)
1402            .status()
1403            .expect("run go build");
1404        assert!(status.success());
1405
1406        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1407        let collected = collect_paths(temp_dir.path(), 0, &[]);
1408
1409        let without_compiled = process_collected(
1410            &collected,
1411            Arc::clone(&progress),
1412            None,
1413            LicenseScanOptions::default(),
1414            &TextDetectionOptions {
1415                collect_info: false,
1416                detect_packages: true,
1417                detect_application_packages: true,
1418                detect_system_packages: false,
1419                detect_packages_in_compiled: false,
1420                detect_copyrights: false,
1421                detect_generated: false,
1422                detect_emails: false,
1423                detect_urls: false,
1424                max_emails: 50,
1425                max_urls: 50,
1426                timeout_seconds: 120.0,
1427            },
1428        );
1429        let with_compiled = process_collected(
1430            &collected,
1431            progress,
1432            None,
1433            LicenseScanOptions::default(),
1434            &TextDetectionOptions {
1435                collect_info: false,
1436                detect_packages: true,
1437                detect_application_packages: true,
1438                detect_system_packages: false,
1439                detect_packages_in_compiled: true,
1440                detect_copyrights: false,
1441                detect_generated: false,
1442                detect_emails: false,
1443                detect_urls: false,
1444                max_emails: 50,
1445                max_urls: 50,
1446                timeout_seconds: 120.0,
1447            },
1448        );
1449
1450        let without_compiled = without_compiled
1451            .files
1452            .into_iter()
1453            .find(|entry| entry.file_type == FileType::File && entry.path.ends_with("/demo"))
1454            .expect("compiled artifact present");
1455        let with_compiled = with_compiled
1456            .files
1457            .into_iter()
1458            .find(|entry| entry.file_type == FileType::File && entry.path.ends_with("/demo"))
1459            .expect("compiled artifact present");
1460
1461        assert!(
1462            without_compiled.package_data.is_empty(),
1463            "package_data: {:#?}",
1464            without_compiled.package_data
1465        );
1466        assert!(!with_compiled.package_data.is_empty());
1467    }
1468
1469    #[test]
1470    fn scanner_parses_windows_executable_packages_under_normal_package_scan() {
1471        let temp_dir = TempDir::new().expect("create temp dir");
1472        let file_path = temp_dir.path().join("libiconv2.dll");
1473        let fixture = fs::read("testdata/compiled-binary-golden/win_pe/libiconv2.dll")
1474            .expect("read PE fixture");
1475        fs::write(&file_path, fixture).expect("write PE fixture");
1476
1477        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1478        let collected = collect_paths(temp_dir.path(), 0, &[]);
1479
1480        let without_package = process_collected(
1481            &collected,
1482            Arc::clone(&progress),
1483            None,
1484            LicenseScanOptions::default(),
1485            &TextDetectionOptions {
1486                collect_info: false,
1487                detect_packages: false,
1488                detect_application_packages: false,
1489                detect_system_packages: false,
1490                detect_packages_in_compiled: false,
1491                detect_copyrights: false,
1492                detect_generated: false,
1493                detect_emails: false,
1494                detect_urls: false,
1495                max_emails: 50,
1496                max_urls: 50,
1497                timeout_seconds: 120.0,
1498            },
1499        );
1500        let with_package = process_collected(
1501            &collected,
1502            progress,
1503            None,
1504            LicenseScanOptions::default(),
1505            &TextDetectionOptions {
1506                collect_info: false,
1507                detect_packages: true,
1508                detect_application_packages: true,
1509                detect_system_packages: false,
1510                detect_packages_in_compiled: false,
1511                detect_copyrights: false,
1512                detect_generated: false,
1513                detect_emails: false,
1514                detect_urls: false,
1515                max_emails: 50,
1516                max_urls: 50,
1517                timeout_seconds: 120.0,
1518            },
1519        );
1520
1521        let without_package = without_package
1522            .files
1523            .into_iter()
1524            .find(|entry| {
1525                entry.file_type == FileType::File && entry.path.ends_with("/libiconv2.dll")
1526            })
1527            .expect("compiled artifact present");
1528        let with_package = with_package
1529            .files
1530            .into_iter()
1531            .find(|entry| {
1532                entry.file_type == FileType::File && entry.path.ends_with("/libiconv2.dll")
1533            })
1534            .expect("compiled artifact present");
1535
1536        assert!(without_package.package_data.is_empty());
1537        assert_eq!(with_package.package_data.len(), 1);
1538        assert_eq!(
1539            with_package.package_data[0].package_type,
1540            Some(FilePackageType::Winexe)
1541        );
1542        assert_eq!(
1543            with_package.package_data[0].datasource_id,
1544            Some(DatasourceId::WindowsExecutable)
1545        );
1546    }
1547
1548    #[test]
1549    fn scanner_detects_license_from_font_metadata() {
1550        let temp_dir = TempDir::new().expect("create temp dir");
1551        let file_path = temp_dir.path().join("Lato-Bold.ttf");
1552        let fixture = fs::read("testdata/font-fixtures/Lato-Bold.ttf").expect("read font fixture");
1553        fs::write(&file_path, fixture).expect("write font fixture");
1554
1555        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1556        let collected = collect_paths(temp_dir.path(), 0, &[]);
1557        let engine =
1558            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
1559        let result = process_collected(
1560            &collected,
1561            progress,
1562            Some(engine),
1563            LicenseScanOptions::default(),
1564            &TextDetectionOptions::default(),
1565        );
1566        let scanned = result
1567            .files
1568            .into_iter()
1569            .find(|entry| {
1570                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1571            })
1572            .expect("scanned file entry");
1573
1574        assert!(
1575            scanned.license_expression.is_some(),
1576            "license detections: {:#?}",
1577            scanned.license_detections
1578        );
1579        assert!(
1580            scanned
1581                .license_expression
1582                .as_deref()
1583                .is_some_and(
1584                    |expression| expression.contains("OFL-1.1") || expression.contains("ofl-1.1")
1585                ),
1586            "license expression: {:?}",
1587            scanned.license_expression
1588        );
1589    }
1590
1591    #[test]
1592    fn scanner_detects_license_from_windows_executable_metadata() {
1593        let temp_dir = TempDir::new().expect("create temp dir");
1594        let file_path = temp_dir.path().join("libiconv2.dll");
1595        let fixture = fs::read("testdata/compiled-binary-golden/win_pe/libiconv2.dll")
1596            .expect("read PE fixture");
1597        fs::write(&file_path, fixture).expect("write PE fixture");
1598
1599        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1600        let collected = collect_paths(temp_dir.path(), 0, &[]);
1601        let engine =
1602            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
1603        let result = process_collected(
1604            &collected,
1605            progress,
1606            Some(engine),
1607            LicenseScanOptions::default(),
1608            &TextDetectionOptions::default(),
1609        );
1610        let scanned = result
1611            .files
1612            .into_iter()
1613            .find(|entry| {
1614                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1615            })
1616            .expect("scanned file entry");
1617
1618        assert!(
1619            scanned.license_expression.is_some(),
1620            "license detections: {:#?}",
1621            scanned.license_detections
1622        );
1623        assert!(
1624            scanned
1625                .license_expression
1626                .as_deref()
1627                .is_some_and(|expression| {
1628                    expression.contains("lgpl") || expression.contains("LGPL")
1629                }),
1630            "license expression: {:?}",
1631            scanned.license_expression
1632        );
1633    }
1634
1635    #[test]
1636    fn scanner_detects_cc_by_license_from_markdown_comment_banner() {
1637        let scanned = scan_single_file_with_license_engine(
1638            "navbar.md",
1639            "<!-- Documentation licensed under CC BY 4.0 -->\n<!-- License available at https://creativecommons.org/licenses/by/4.0/ -->\n",
1640            &TextDetectionOptions::default(),
1641        );
1642
1643        assert!(
1644            scanned
1645                .license_expression
1646                .as_deref()
1647                .is_some_and(|expression| {
1648                    expression.contains("cc-by-4.0") || expression.contains("CC-BY-4.0")
1649                }),
1650            "license expression: {:?}",
1651            scanned.license_expression
1652        );
1653    }
1654
1655    #[test]
1656    fn scanner_detects_mit_license_from_shields_badge_markdown() {
1657        let scanned = scan_single_file_with_license_engine(
1658            "README.md",
1659            "[![](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)\n",
1660            &TextDetectionOptions::default(),
1661        );
1662
1663        assert!(
1664            scanned
1665                .license_expression
1666                .as_deref()
1667                .is_some_and(|expression| {
1668                    expression.contains("mit") || expression.contains("MIT")
1669                }),
1670            "license expression: {:?}",
1671            scanned.license_expression
1672        );
1673    }
1674
1675    #[test]
1676    fn scanner_detects_apache_license_from_markdown_readme_phrase() {
1677        let scanned = scan_single_file_with_license_engine(
1678            "README.md",
1679            "This crate is distributed under the terms of the Apache License (Version 2.0).\n",
1680            &TextDetectionOptions::default(),
1681        );
1682
1683        assert!(
1684            scanned
1685                .license_expression
1686                .as_deref()
1687                .is_some_and(|expression| {
1688                    expression.contains("apache-2.0") || expression.contains("Apache-2.0")
1689                }),
1690            "license expression: {:?}",
1691            scanned.license_expression
1692        );
1693    }
1694
1695    #[test]
1696    fn scanner_sets_is_source_only_when_info_enabled() {
1697        let without_info = TextDetectionOptions {
1698            collect_info: false,
1699            detect_packages: false,
1700            detect_application_packages: false,
1701            detect_system_packages: false,
1702            detect_packages_in_compiled: false,
1703            detect_copyrights: false,
1704            detect_generated: false,
1705            detect_emails: false,
1706            detect_urls: false,
1707            max_emails: 50,
1708            max_urls: 50,
1709            timeout_seconds: 120.0,
1710        };
1711        let with_info = TextDetectionOptions {
1712            collect_info: true,
1713            ..without_info.clone()
1714        };
1715
1716        let scanned_without_info = scan_single_file("main.rs", "fn main() {}\n", &without_info);
1717        let scanned_with_info = scan_single_file("main.rs", "fn main() {}\n", &with_info);
1718
1719        assert_eq!(scanned_without_info.is_source, None);
1720        assert_eq!(scanned_with_info.is_source, Some(true));
1721    }
1722
1723    #[test]
1724    fn directory_omits_info_fields_when_info_disabled() {
1725        let temp_dir = TempDir::new().expect("create temp dir");
1726        fs::create_dir_all(temp_dir.path().join("nested")).expect("create nested dir");
1727
1728        let collected = collect_paths(temp_dir.path(), 0, &[]);
1729        let result = process_collected(
1730            &collected,
1731            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1732            None,
1733            LicenseScanOptions::default(),
1734            &TextDetectionOptions {
1735                collect_info: false,
1736                detect_packages: false,
1737                detect_application_packages: false,
1738                detect_system_packages: false,
1739                detect_packages_in_compiled: false,
1740                detect_copyrights: false,
1741                detect_generated: false,
1742                detect_emails: false,
1743                detect_urls: false,
1744                max_emails: 50,
1745                max_urls: 50,
1746                timeout_seconds: 120.0,
1747            },
1748        );
1749
1750        let directory = result
1751            .files
1752            .into_iter()
1753            .find(|entry| entry.file_type == FileType::Directory && entry.path.ends_with("nested"))
1754            .expect("directory entry");
1755
1756        assert!(directory.date.is_none());
1757        assert!(directory.file_type_label.is_none());
1758        assert!(directory.is_binary.is_none());
1759        assert!(directory.is_text.is_none());
1760        assert!(directory.is_archive.is_none());
1761        assert!(directory.is_media.is_none());
1762        assert!(directory.is_source.is_none());
1763        assert!(directory.is_script.is_none());
1764    }
1765
1766    #[test]
1767    fn directory_includes_info_fields_when_info_enabled() {
1768        let temp_dir = TempDir::new().expect("create temp dir");
1769        fs::create_dir_all(temp_dir.path().join("nested")).expect("create nested dir");
1770
1771        let collected = collect_paths(temp_dir.path(), 0, &[]);
1772        let result = process_collected(
1773            &collected,
1774            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1775            None,
1776            LicenseScanOptions::default(),
1777            &TextDetectionOptions {
1778                collect_info: true,
1779                detect_packages: false,
1780                detect_application_packages: false,
1781                detect_system_packages: false,
1782                detect_packages_in_compiled: false,
1783                detect_copyrights: false,
1784                detect_generated: false,
1785                detect_emails: false,
1786                detect_urls: false,
1787                max_emails: 50,
1788                max_urls: 50,
1789                timeout_seconds: 120.0,
1790            },
1791        );
1792
1793        let directory = result
1794            .files
1795            .into_iter()
1796            .find(|entry| entry.file_type == FileType::Directory && entry.path.ends_with("nested"))
1797            .expect("directory entry");
1798
1799        assert!(directory.date.is_none());
1800        assert!(directory.file_type_label.is_none());
1801        assert_eq!(directory.is_binary, Some(false));
1802        assert_eq!(directory.is_text, Some(false));
1803        assert_eq!(directory.is_archive, Some(false));
1804        assert_eq!(directory.is_media, Some(false));
1805        assert_eq!(directory.is_source, Some(false));
1806        assert_eq!(directory.is_script, Some(false));
1807        assert_eq!(directory.files_count, Some(0));
1808        assert_eq!(directory.dirs_count, Some(0));
1809        assert_eq!(directory.size_count, Some(0));
1810    }
1811
1812    #[test]
1813    fn collect_paths_includes_root_directory_entry() {
1814        let temp_dir = TempDir::new().expect("create temp dir");
1815        fs::create_dir_all(temp_dir.path().join("src")).expect("create nested dir");
1816        fs::write(temp_dir.path().join("src").join("main.rs"), "fn main() {}")
1817            .expect("write nested file");
1818
1819        let collected = collect_paths(temp_dir.path(), 0, &[]);
1820
1821        assert!(
1822            collected
1823                .directories
1824                .iter()
1825                .any(|(path, _)| path == temp_dir.path())
1826        );
1827    }
1828
1829    #[test]
1830    fn collect_paths_supports_single_file_input() {
1831        let temp_dir = TempDir::new().expect("create temp dir");
1832        let file_path = temp_dir.path().join("main.rs");
1833        fs::write(&file_path, "fn main() {}\n").expect("write file");
1834
1835        let collected = collect_paths(&file_path, 0, &[]);
1836
1837        assert_eq!(collected.files.len(), 1);
1838        assert!(collected.directories.is_empty());
1839        assert_eq!(collected.files[0].0, file_path);
1840    }
1841
1842    #[test]
1843    fn process_collected_with_memory_limit_preserves_results_when_spilling() {
1844        let temp_dir = TempDir::new().expect("create temp dir");
1845        fs::write(temp_dir.path().join("a.txt"), "hello").expect("write first file");
1846        fs::write(temp_dir.path().join("b.txt"), "world").expect("write second file");
1847
1848        let collected = collect_paths(temp_dir.path(), 0, &[]);
1849        let result = process_collected_with_memory_limit(
1850            &collected,
1851            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1852            None,
1853            LicenseScanOptions::default(),
1854            &TextDetectionOptions {
1855                collect_info: false,
1856                detect_packages: false,
1857                detect_application_packages: false,
1858                detect_system_packages: false,
1859                detect_packages_in_compiled: false,
1860                detect_copyrights: false,
1861                detect_generated: false,
1862                detect_emails: false,
1863                detect_urls: false,
1864                max_emails: 50,
1865                max_urls: 50,
1866                timeout_seconds: 120.0,
1867            },
1868            MemoryMode::Limit(1),
1869        );
1870
1871        assert_eq!(result.files.len(), 3);
1872    }
1873
1874    #[test]
1875    fn process_collected_with_negative_one_uses_disk_only_mode() {
1876        let temp_dir = TempDir::new().expect("create temp dir");
1877        fs::write(temp_dir.path().join("a.txt"), "hello").expect("write first file");
1878
1879        let collected = collect_paths(temp_dir.path(), 0, &[]);
1880        let result = process_collected_with_memory_limit(
1881            &collected,
1882            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1883            None,
1884            LicenseScanOptions::default(),
1885            &TextDetectionOptions {
1886                collect_info: false,
1887                detect_packages: false,
1888                detect_application_packages: false,
1889                detect_system_packages: false,
1890                detect_packages_in_compiled: false,
1891                detect_copyrights: false,
1892                detect_generated: false,
1893                detect_emails: false,
1894                detect_urls: false,
1895                max_emails: 50,
1896                max_urls: 50,
1897                timeout_seconds: 120.0,
1898            },
1899            MemoryMode::StreamUnlimited,
1900        );
1901
1902        assert_eq!(result.files.len(), 2);
1903    }
1904}