Skip to main content

provenant/scanner/
mod.rs

1mod collect;
2mod process;
3
4use crate::license_detection::LicenseDetectionEngine;
5use crate::models::FileInfo;
6
7pub struct ProcessResult {
8    pub files: Vec<FileInfo>,
9    pub excluded_count: usize,
10}
11
/// Switches that shape license detection output.
///
/// The derived `Default` leaves every flag `false` and `min_score` at `0`. Every field is folded
/// into the string produced by `scan_options_fingerprint`.
#[derive(Debug, Clone, Copy, Default)]
pub struct LicenseScanOptions {
    /// NOTE(review): presumably attaches the matched text to license results — confirm.
    pub include_text: bool,
    /// NOTE(review): presumably attaches diagnostics about the matched text — confirm.
    pub include_text_diagnostics: bool,
    /// NOTE(review): presumably attaches general match diagnostics — confirm.
    pub include_diagnostics: bool,
    /// NOTE(review): presumably enables reporting of unknown licenses — confirm.
    pub unknown_licenses: bool,
    /// Score setting, emitted as `license_score` in the fingerprint.
    /// NOTE(review): presumably a minimum match score threshold — confirm the scale.
    pub min_score: u8,
}
20
/// Per-scan switches and limits for the text-based detectors.
///
/// Every field is folded into the string produced by `scan_options_fingerprint`.
#[derive(Debug, Clone)]
pub struct TextDetectionOptions {
    /// Populate file information; the tests in this module show it filling checksums, MIME type,
    /// date, programming language and the `is_*` classification flags.
    pub collect_info: bool,
    /// NOTE(review): presumably the master switch for package detection — confirm.
    pub detect_packages: bool,
    /// NOTE(review): presumably restricts/enables application package detection — confirm.
    pub detect_application_packages: bool,
    /// NOTE(review): presumably restricts/enables system package detection — confirm.
    pub detect_system_packages: bool,
    /// NOTE(review): presumably enables package detection inside compiled files — confirm.
    pub detect_packages_in_compiled: bool,
    /// Detect copyrights; the only detector enabled by `Default`.
    pub detect_copyrights: bool,
    /// Populate `is_generated`; the tests in this module show it staying `None` when disabled.
    pub detect_generated: bool,
    /// Detect email addresses.
    pub detect_emails: bool,
    /// Detect URLs.
    pub detect_urls: bool,
    /// NOTE(review): presumably the per-file cap on reported emails — confirm.
    pub max_emails: usize,
    /// NOTE(review): presumably the per-file cap on reported URLs — confirm.
    pub max_urls: usize,
    /// Timeout in seconds.
    /// NOTE(review): whether it applies per file or per scan is decided elsewhere — confirm.
    pub timeout_seconds: f64,
}

impl Default for TextDetectionOptions {
    /// Returns the baseline configuration: copyright detection on, every other detector off,
    /// email/URL limits of 50 each, and a 120 second timeout.
    fn default() -> Self {
        Self {
            // The single detector that is on out of the box.
            detect_copyrights: true,

            // Everything else is opt-in.
            collect_info: false,
            detect_packages: false,
            detect_application_packages: false,
            detect_system_packages: false,
            detect_packages_in_compiled: false,
            detect_generated: false,
            detect_emails: false,
            detect_urls: false,

            // Limits.
            max_emails: 50,
            max_urls: 50,
            timeout_seconds: 120.0,
        }
    }
}
55
56pub fn scan_options_fingerprint(
57    text_options: &TextDetectionOptions,
58    license_options: LicenseScanOptions,
59    license_engine: Option<&LicenseDetectionEngine>,
60) -> String {
61    let (license_enabled, rules_count, first_rule_id, last_rule_id) = match license_engine {
62        Some(engine) => {
63            let rules = &engine.index().rules_by_rid;
64            (
65                true,
66                rules.len(),
67                rules
68                    .first()
69                    .map(|rule| rule.identifier.as_str())
70                    .unwrap_or(""),
71                rules
72                    .last()
73                    .map(|rule| rule.identifier.as_str())
74                    .unwrap_or(""),
75            )
76        }
77        None => (false, 0, "", ""),
78    };
79
80    format!(
81        "tool_version={};info={};packages={};app_packages={};system_packages={};compiled_packages={};copyrights={};generated={};emails={};urls={};max_emails={};max_urls={};timeout={:.6};license_enabled={};rules_count={};first_rule_id={};last_rule_id={};license_text={};license_text_diagnostics={};license_diagnostics={};unknown_licenses={};license_score={}",
82        crate::version::BUILD_VERSION,
83        text_options.collect_info,
84        text_options.detect_packages,
85        text_options.detect_application_packages,
86        text_options.detect_system_packages,
87        text_options.detect_packages_in_compiled,
88        text_options.detect_copyrights,
89        text_options.detect_generated,
90        text_options.detect_emails,
91        text_options.detect_urls,
92        text_options.max_emails,
93        text_options.max_urls,
94        text_options.timeout_seconds,
95        license_enabled,
96        rules_count,
97        first_rule_id,
98        last_rule_id,
99        license_options.include_text,
100        license_options.include_text_diagnostics,
101        license_options.include_diagnostics,
102        license_options.unknown_licenses,
103        license_options.min_score,
104    )
105}
106
107pub use self::collect::{CollectedPaths, collect_paths};
108#[allow(unused_imports)]
109pub use self::process::{
110    MemoryMode, process_collected, process_collected_sequential,
111    process_collected_with_memory_limit, process_collected_with_memory_limit_sequential,
112};
113
114#[cfg(test)]
115mod tests {
116    use std::fs;
117    use std::sync::Arc;
118
119    use tempfile::TempDir;
120
121    use crate::license_detection::LicenseDetectionEngine;
122    use crate::models::{DatasourceId, FileType, PackageType as FilePackageType};
123    use crate::progress::{ProgressMode, ScanProgress};
124
125    use super::{
126        LicenseScanOptions, MemoryMode, TextDetectionOptions, collect_paths, process_collected,
127        process_collected_with_memory_limit,
128    };
129
130    #[test]
131    fn default_options_keep_copyright_detection_enabled() {
132        let options = TextDetectionOptions::default();
133        assert!(!options.detect_packages);
134        assert!(options.detect_copyrights);
135    }
136
137    fn scan_single_file(
138        file_name: &str,
139        content: &str,
140        options: &TextDetectionOptions,
141    ) -> crate::models::FileInfo {
142        let temp_dir = TempDir::new().expect("create temp dir");
143        let file_path = temp_dir.path().join(file_name);
144        fs::write(&file_path, content).expect("write test file");
145
146        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
147        let collected = collect_paths(temp_dir.path(), 0, &[]);
148        let result = process_collected(
149            &collected,
150            progress,
151            None,
152            LicenseScanOptions::default(),
153            options,
154        );
155
156        result
157            .files
158            .into_iter()
159            .find(|entry| {
160                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
161            })
162            .expect("scanned file entry")
163    }
164
165    fn scan_file_at_relative_path(
166        relative_path: &str,
167        content: &[u8],
168        options: &TextDetectionOptions,
169    ) -> crate::models::FileInfo {
170        let temp_dir = TempDir::new().expect("create temp dir");
171        let file_path = temp_dir.path().join(relative_path);
172        if let Some(parent) = file_path.parent() {
173            fs::create_dir_all(parent).expect("create parent dirs");
174        }
175        fs::write(&file_path, content).expect("write test file");
176
177        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
178        let collected = collect_paths(temp_dir.path(), 0, &[]);
179        let result = process_collected(
180            &collected,
181            progress,
182            None,
183            LicenseScanOptions::default(),
184            options,
185        );
186
187        result
188            .files
189            .into_iter()
190            .find(|entry| {
191                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
192            })
193            .expect("scanned file entry")
194    }
195
196    fn scan_single_file_with_license_engine(
197        file_name: &str,
198        content: &str,
199        options: &TextDetectionOptions,
200    ) -> crate::models::FileInfo {
201        let temp_dir = TempDir::new().expect("create temp dir");
202        let file_path = temp_dir.path().join(file_name);
203        fs::write(&file_path, content).expect("write test file");
204
205        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
206        let collected = collect_paths(temp_dir.path(), 0, &[]);
207        let engine =
208            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
209        let result = process_collected(
210            &collected,
211            progress,
212            Some(engine),
213            LicenseScanOptions::default(),
214            options,
215        );
216
217        result
218            .files
219            .into_iter()
220            .find(|entry| {
221                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
222            })
223            .expect("scanned file entry")
224    }
225
226    #[test]
227    fn scanner_reports_repeated_email_occurrences() {
228        let options = TextDetectionOptions {
229            collect_info: false,
230            detect_packages: false,
231            detect_application_packages: false,
232            detect_system_packages: false,
233            detect_packages_in_compiled: false,
234            detect_copyrights: false,
235            detect_generated: false,
236            detect_emails: true,
237            detect_urls: false,
238            max_emails: 50,
239            max_urls: 50,
240            timeout_seconds: 120.0,
241        };
242        let scanned = scan_single_file(
243            "contacts.txt",
244            "linux@3ware.com\nlinux@3ware.com\nandre@suse.com\nlinux@3ware.com\n",
245            &options,
246        );
247
248        let emails: Vec<(&str, usize)> = scanned
249            .emails
250            .iter()
251            .map(|email| (email.email.as_str(), email.start_line.get()))
252            .collect();
253
254        assert_eq!(emails.len(), 4, "emails: {emails:#?}");
255        assert_eq!(
256            emails,
257            vec![
258                ("linux@3ware.com", 1),
259                ("linux@3ware.com", 2),
260                ("andre@suse.com", 3),
261                ("linux@3ware.com", 4),
262            ]
263        );
264    }
265
266    #[test]
267    fn scanner_skips_pem_certificate_text_detection() {
268        let options = TextDetectionOptions {
269            collect_info: false,
270            detect_packages: false,
271            detect_application_packages: false,
272            detect_system_packages: false,
273            detect_packages_in_compiled: false,
274            detect_copyrights: true,
275            detect_generated: false,
276            detect_emails: true,
277            detect_urls: true,
278            max_emails: 50,
279            max_urls: 50,
280            timeout_seconds: 120.0,
281        };
282        let pem_fixture = concat!(
283            "-----BEGIN CERTIFICATE-----\n",
284            "MIID8TCCAtmgAwIBAgIQQT1yx/RrH4FDffHSKFTfmjANBgkqhkiG9w0BAQUFADCB\n",
285            "ijELMAkGA1UEBhMCQ0gxEDAOBgNVBAoTB1dJU2VLZXkxGzAZBgNVBAsTEkNvcHly\n",
286            "-----END CERTIFICATE-----\n",
287            "Certificate:\n",
288            "    Data:\n",
289            "        Signature Algorithm: sha1WithRSAEncryption\n",
290            "        Issuer: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
291            "        Subject: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
292            "        Contact: cert-owner@example.com\n",
293        );
294        let scanned = scan_single_file("cert.pem", pem_fixture, &options);
295
296        assert!(
297            scanned.copyrights.is_empty(),
298            "copyrights: {:#?}",
299            scanned.copyrights
300        );
301        assert!(
302            scanned.holders.is_empty(),
303            "holders: {:#?}",
304            scanned.holders
305        );
306        assert!(
307            scanned.authors.is_empty(),
308            "authors: {:#?}",
309            scanned.authors
310        );
311        assert!(scanned.emails.is_empty(), "emails: {:#?}", scanned.emails);
312        assert!(scanned.urls.is_empty(), "urls: {:#?}", scanned.urls);
313        assert!(
314            scanned.license_detections.is_empty(),
315            "licenses: {:#?}",
316            scanned.license_detections
317        );
318        assert!(
319            scanned.license_clues.is_empty(),
320            "license clues: {:#?}",
321            scanned.license_clues
322        );
323    }
324
325    #[test]
326    fn scanner_keeps_source_headers_when_pem_blocks_are_embedded() {
327        let options = TextDetectionOptions {
328            collect_info: false,
329            detect_packages: false,
330            detect_application_packages: false,
331            detect_system_packages: false,
332            detect_packages_in_compiled: false,
333            detect_copyrights: true,
334            detect_generated: false,
335            detect_emails: false,
336            detect_urls: true,
337            max_emails: 50,
338            max_urls: 50,
339            timeout_seconds: 120.0,
340        };
341        let fixture = concat!(
342            "/*\n",
343            "Copyright 2022 The Kubernetes Authors.\n\n",
344            "Licensed under the Apache License, Version 2.0 (the \"License\");\n",
345            "you may not use this file except in compliance with the License.\n",
346            "You may obtain a copy of the License at\n\n",
347            "    http://www.apache.org/licenses/LICENSE-2.0\n",
348            "*/\n\n",
349            "package storage\n\n",
350            "const validCert = `\n",
351            "-----BEGIN CERTIFICATE-----\n",
352            "MIIDmTCCAoGgAwIBAgIUWQ==\n",
353            "-----END CERTIFICATE-----\n",
354            "`\n",
355        );
356        let temp_dir = TempDir::new().expect("create temp dir");
357        let file_path = temp_dir.path().join("storage_test.go");
358        fs::write(&file_path, fixture).expect("write fixture");
359
360        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
361        let collected = collect_paths(temp_dir.path(), 0, &[]);
362        let engine =
363            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
364        let result = process_collected(
365            &collected,
366            progress,
367            Some(engine),
368            LicenseScanOptions::default(),
369            &options,
370        );
371        let scanned = result
372            .files
373            .into_iter()
374            .find(|entry| {
375                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
376            })
377            .expect("scanned file entry");
378
379        assert!(
380            scanned
381                .copyrights
382                .iter()
383                .any(|c| c.copyright == "Copyright 2022 The Kubernetes Authors"),
384            "copyrights: {:#?}",
385            scanned.copyrights
386        );
387        assert!(
388            scanned
389                .holders
390                .iter()
391                .any(|h| h.holder == "The Kubernetes Authors"),
392            "holders: {:#?}",
393            scanned.holders
394        );
395        assert!(
396            scanned
397                .urls
398                .iter()
399                .any(|u| u.url == "http://www.apache.org/licenses/LICENSE-2.0"),
400            "urls: {:#?}",
401            scanned.urls
402        );
403        assert_eq!(scanned.license_expression.as_deref(), Some("Apache-2.0"));
404    }
405
406    #[test]
407    fn scanner_detects_structured_credits_authors() {
408        let options = TextDetectionOptions {
409            collect_info: false,
410            detect_packages: false,
411            detect_application_packages: false,
412            detect_system_packages: false,
413            detect_packages_in_compiled: false,
414            detect_copyrights: true,
415            detect_generated: false,
416            detect_emails: false,
417            detect_urls: false,
418            max_emails: 50,
419            max_urls: 50,
420            timeout_seconds: 120.0,
421        };
422        let credits_fixture = concat!(
423            "N: Jack Lloyd\n",
424            "E: lloyd@randombit.net\n",
425            "W: http://www.randombit.net/\n",
426        );
427        let scanned = scan_single_file("CREDITS", credits_fixture, &options);
428
429        let authors: Vec<(&str, usize, usize)> = scanned
430            .authors
431            .iter()
432            .map(|author| {
433                (
434                    author.author.as_str(),
435                    author.start_line.get(),
436                    author.end_line.get(),
437                )
438            })
439            .collect();
440
441        assert_eq!(
442            authors,
443            vec![(
444                "Jack Lloyd lloyd@randombit.net http://www.randombit.net/",
445                1,
446                3,
447            )]
448        );
449        assert!(scanned.copyrights.is_empty());
450        assert!(scanned.holders.is_empty());
451    }
452
453    #[test]
454    fn scanner_uses_or_for_alternative_license_header() {
455        let fixture =
456            include_str!("../../testdata/license-golden/datadriven/external/boost-json-d2s.ipp");
457        let temp_dir = TempDir::new().expect("create temp dir");
458        let file_path = temp_dir.path().join("d2s.ipp");
459        fs::write(&file_path, fixture).expect("write fixture");
460
461        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
462        let collected = collect_paths(temp_dir.path(), 0, &[]);
463        let engine =
464            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
465        let result = process_collected(
466            &collected,
467            progress,
468            Some(engine),
469            LicenseScanOptions::default(),
470            &TextDetectionOptions::default(),
471        );
472        let scanned = result
473            .files
474            .into_iter()
475            .find(|entry| {
476                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
477            })
478            .expect("scanned file entry");
479
480        assert_eq!(
481            scanned.license_expression.as_deref(),
482            Some("Apache-2.0 OR BSL-1.0")
483        );
484        assert!(
485            scanned.license_clues.is_empty(),
486            "license clues: {:#?}",
487            scanned.license_clues
488        );
489        assert_eq!(
490            scanned.license_detections.len(),
491            1,
492            "detections: {:#?}",
493            scanned.license_detections
494        );
495
496        let detection = &scanned.license_detections[0];
497        assert_eq!(detection.license_expression_spdx, "Apache-2.0 OR BSL-1.0");
498
499        let match_expressions: Vec<_> = detection
500            .matches
501            .iter()
502            .map(|m| m.license_expression_spdx.as_str())
503            .collect();
504        assert_eq!(match_expressions, vec!["Apache-2.0", "BSL-1.0"]);
505    }
506
507    #[test]
508    fn scanner_sets_generated_flag_when_enabled() {
509        let options = TextDetectionOptions {
510            collect_info: false,
511            detect_packages: false,
512            detect_application_packages: false,
513            detect_system_packages: false,
514            detect_packages_in_compiled: false,
515            detect_copyrights: false,
516            detect_generated: true,
517            detect_emails: false,
518            detect_urls: false,
519            max_emails: 50,
520            max_urls: 50,
521            timeout_seconds: 120.0,
522        };
523        let scanned = scan_single_file(
524            "generated.c",
525            "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
526            &options,
527        );
528
529        assert_eq!(scanned.is_generated, Some(true));
530    }
531
532    #[test]
533    fn scanner_leaves_generated_flag_unset_when_disabled() {
534        let options = TextDetectionOptions {
535            collect_info: false,
536            detect_packages: false,
537            detect_application_packages: false,
538            detect_system_packages: false,
539            detect_packages_in_compiled: false,
540            detect_copyrights: false,
541            detect_generated: false,
542            detect_emails: false,
543            detect_urls: false,
544            max_emails: 50,
545            max_urls: 50,
546            timeout_seconds: 120.0,
547        };
548        let scanned = scan_single_file(
549            "generated.c",
550            "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
551            &options,
552        );
553
554        assert_eq!(scanned.is_generated, None);
555    }
556
557    #[test]
558    fn scanner_populates_info_surface_when_enabled() {
559        let options = TextDetectionOptions {
560            collect_info: true,
561            detect_packages: false,
562            detect_application_packages: false,
563            detect_system_packages: false,
564            detect_packages_in_compiled: false,
565            detect_copyrights: false,
566            detect_generated: false,
567            detect_emails: false,
568            detect_urls: false,
569            max_emails: 50,
570            max_urls: 50,
571            timeout_seconds: 120.0,
572        };
573        let scanned = scan_single_file(
574            "script.py",
575            "#!/usr/bin/env python3\nprint(\"hello\")\n",
576            &options,
577        );
578
579        assert!(scanned.sha1.is_some());
580        assert!(scanned.md5.is_some());
581        assert!(scanned.sha256.is_some());
582        assert!(scanned.sha1_git.is_some());
583        assert!(scanned.mime_type.is_some());
584        assert!(scanned.date.is_some());
585        assert_eq!(scanned.programming_language.as_deref(), Some("Python"));
586        assert_eq!(scanned.is_text, Some(true));
587        assert_eq!(scanned.is_script, Some(true));
588        assert_eq!(scanned.is_source, Some(true));
589    }
590
591    #[test]
592    fn scanner_treats_latin1_python_sources_as_textual_scripts() {
593        let options = TextDetectionOptions {
594            collect_info: true,
595            detect_packages: false,
596            detect_application_packages: false,
597            detect_system_packages: false,
598            detect_packages_in_compiled: false,
599            detect_copyrights: false,
600            detect_generated: false,
601            detect_emails: false,
602            detect_urls: false,
603            max_emails: 50,
604            max_urls: 50,
605            timeout_seconds: 120.0,
606        };
607        let latin1_python = b"# coding: latin-1\nprint(\"caf\xe9\")\n# comment padding\n";
608        let scanned = scan_file_at_relative_path("script.py", latin1_python, &options);
609
610        assert_eq!(scanned.programming_language.as_deref(), Some("Python"));
611        assert_eq!(
612            scanned.file_type_label.as_deref(),
613            Some("python script, text executable")
614        );
615        assert_eq!(scanned.is_binary, Some(false));
616        assert_eq!(scanned.is_text, Some(true));
617        assert_eq!(scanned.is_script, Some(true));
618        assert_eq!(scanned.is_source, Some(true));
619    }
620
621    #[test]
622    fn scanner_skips_findings_for_zip_like_archives() {
623        let options = TextDetectionOptions {
624            collect_info: true,
625            detect_packages: false,
626            detect_application_packages: false,
627            detect_system_packages: false,
628            detect_packages_in_compiled: false,
629            detect_copyrights: true,
630            detect_generated: false,
631            detect_emails: true,
632            detect_urls: true,
633            max_emails: 50,
634            max_urls: 50,
635            timeout_seconds: 120.0,
636        };
637        let archive_like = b"PK\x03\x04\x14\x00\x00\x00\x08\x00MIT License\ncontact@example.com\nhttps://example.com\n";
638        let scanned = scan_file_at_relative_path("demo.whl", archive_like, &options);
639
640        assert_eq!(scanned.mime_type.as_deref(), Some("application/zip"));
641        assert_eq!(scanned.is_archive, Some(true));
642        assert!(scanned.license_detections.is_empty());
643        assert!(scanned.copyrights.is_empty());
644        assert!(scanned.emails.is_empty());
645        assert!(scanned.urls.is_empty());
646    }
647
648    #[test]
649    fn scanner_treats_typescript_sources_as_text_not_video_media() {
650        let options = TextDetectionOptions {
651            collect_info: true,
652            detect_packages: false,
653            detect_application_packages: false,
654            detect_system_packages: false,
655            detect_packages_in_compiled: false,
656            detect_copyrights: false,
657            detect_generated: false,
658            detect_emails: false,
659            detect_urls: false,
660            max_emails: 50,
661            max_urls: 50,
662            timeout_seconds: 120.0,
663        };
664        let scanned = scan_single_file("main.ts", "export const answer: number = 42;\n", &options);
665
666        assert_eq!(scanned.programming_language.as_deref(), Some("TypeScript"));
667        assert_eq!(scanned.mime_type.as_deref(), Some("text/plain"));
668        assert_eq!(
669            scanned.file_type_label.as_deref(),
670            Some("UTF-8 Unicode text")
671        );
672        assert_eq!(scanned.is_text, Some(true));
673        assert_eq!(scanned.is_media, Some(false));
674        assert_eq!(scanned.is_script, Some(false));
675        assert_eq!(scanned.is_source, Some(true));
676    }
677
678    #[test]
679    fn scanner_normalizes_sparse_ts_files_away_from_video_mime() {
680        let options = TextDetectionOptions {
681            collect_info: true,
682            detect_packages: false,
683            detect_application_packages: false,
684            detect_system_packages: false,
685            detect_packages_in_compiled: false,
686            detect_copyrights: false,
687            detect_generated: false,
688            detect_emails: false,
689            detect_urls: false,
690            max_emails: 50,
691            max_urls: 50,
692            timeout_seconds: 120.0,
693        };
694        let scanned = scan_single_file("main.ts", "// comment-only TypeScript fixture\n", &options);
695
696        assert_eq!(scanned.mime_type.as_deref(), Some("text/plain"));
697        assert_eq!(
698            scanned.file_type_label.as_deref(),
699            Some("UTF-8 Unicode text")
700        );
701        assert_eq!(scanned.is_text, Some(true));
702        assert_eq!(scanned.is_media, Some(false));
703        assert_eq!(scanned.is_script, Some(false));
704        assert_eq!(scanned.is_source, Some(true));
705    }
706
707    #[test]
708    fn scanner_treats_empty_files_like_scancode_info_surface() {
709        let options = TextDetectionOptions {
710            collect_info: true,
711            detect_packages: false,
712            detect_application_packages: false,
713            detect_system_packages: false,
714            detect_packages_in_compiled: false,
715            detect_copyrights: false,
716            detect_generated: false,
717            detect_emails: false,
718            detect_urls: false,
719            max_emails: 50,
720            max_urls: 50,
721            timeout_seconds: 120.0,
722        };
723        let scanned = scan_single_file("test.txt", "", &options);
724
725        assert_eq!(scanned.mime_type.as_deref(), Some("inode/x-empty"));
726        assert_eq!(scanned.file_type_label.as_deref(), Some("empty"));
727        assert_eq!(scanned.programming_language, None);
728        assert_eq!(scanned.is_binary, Some(false));
729        assert_eq!(scanned.is_text, Some(true));
730        assert_eq!(scanned.is_archive, Some(false));
731        assert_eq!(scanned.is_media, Some(false));
732        assert_eq!(scanned.is_source, Some(false));
733        assert_eq!(scanned.is_script, Some(false));
734    }
735
736    #[test]
737    fn scanner_treats_package_json_as_text_not_source() {
738        let options = TextDetectionOptions {
739            collect_info: true,
740            detect_packages: false,
741            detect_application_packages: false,
742            detect_system_packages: false,
743            detect_packages_in_compiled: false,
744            detect_copyrights: false,
745            detect_generated: false,
746            detect_emails: false,
747            detect_urls: false,
748            max_emails: 50,
749            max_urls: 50,
750            timeout_seconds: 120.0,
751        };
752        let scanned = scan_single_file("package.json", r#"{"name":"demo"}"#, &options);
753
754        assert_eq!(scanned.mime_type.as_deref(), Some("application/json"));
755        assert_eq!(scanned.file_type_label.as_deref(), Some("JSON text data"));
756        assert_eq!(scanned.programming_language, None);
757        assert_eq!(scanned.is_text, Some(true));
758        assert_eq!(scanned.is_source, Some(false));
759        assert_eq!(scanned.is_script, Some(false));
760    }
761
762    #[test]
763    fn scanner_classifies_gradle_and_nix_manifests_as_source() {
764        let options = TextDetectionOptions {
765            collect_info: true,
766            detect_packages: false,
767            detect_application_packages: false,
768            detect_system_packages: false,
769            detect_packages_in_compiled: false,
770            detect_copyrights: false,
771            detect_generated: false,
772            detect_emails: false,
773            detect_urls: false,
774            max_emails: 50,
775            max_urls: 50,
776            timeout_seconds: 120.0,
777        };
778
779        let gradle = scan_single_file("build.gradle", "plugins { id 'java' }\n", &options);
780        let nix = scan_single_file("flake.nix", "{ inputs, ... }: {}\n", &options);
781
782        assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
783        assert_eq!(gradle.mime_type.as_deref(), Some("text/plain"));
784        assert_eq!(gradle.is_source, Some(true));
785        assert_eq!(gradle.is_script, Some(false));
786
787        assert_eq!(nix.programming_language.as_deref(), Some("Nix"));
788        assert_eq!(nix.mime_type.as_deref(), Some("text/plain"));
789        assert_eq!(nix.is_source, Some(true));
790        assert_eq!(nix.is_script, Some(false));
791    }
792
793    #[test]
794    fn scanner_treats_gitmodules_as_text_not_source() {
795        let options = TextDetectionOptions {
796            collect_info: true,
797            detect_packages: false,
798            detect_application_packages: false,
799            detect_system_packages: false,
800            detect_packages_in_compiled: false,
801            detect_copyrights: false,
802            detect_generated: false,
803            detect_emails: false,
804            detect_urls: false,
805            max_emails: 50,
806            max_urls: 50,
807            timeout_seconds: 120.0,
808        };
809        let scanned = scan_file_at_relative_path(
810            ".gitmodules",
811            b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
812            &options,
813        );
814
815        assert_eq!(scanned.programming_language, None);
816        assert_eq!(
817            scanned.file_type_label.as_deref(),
818            Some("Git configuration text")
819        );
820        assert_eq!(scanned.is_text, Some(true));
821        assert_eq!(scanned.is_source, Some(false));
822        assert_eq!(scanned.is_script, Some(false));
823    }
824
825    #[test]
826    fn scanner_treats_javascript_shebang_files_as_scripts() {
827        let options = TextDetectionOptions {
828            collect_info: true,
829            detect_packages: false,
830            detect_application_packages: false,
831            detect_system_packages: false,
832            detect_packages_in_compiled: false,
833            detect_copyrights: false,
834            detect_generated: false,
835            detect_emails: false,
836            detect_urls: false,
837            max_emails: 50,
838            max_urls: 50,
839            timeout_seconds: 120.0,
840        };
841        let scanned = scan_file_at_relative_path(
842            "bin/run",
843            b"#!/usr/bin/env node\nconsole.log('hello');\n",
844            &options,
845        );
846
847        assert_eq!(scanned.programming_language.as_deref(), Some("JavaScript"));
848        assert_eq!(
849            scanned.file_type_label.as_deref(),
850            Some("javascript script, UTF-8 Unicode text executable")
851        );
852        assert_eq!(scanned.is_script, Some(true));
853        assert_eq!(scanned.is_source, Some(true));
854    }
855
856    #[test]
857    fn scanner_treats_dockerfile_as_source() {
858        let options = TextDetectionOptions {
859            collect_info: true,
860            detect_packages: false,
861            detect_application_packages: false,
862            detect_system_packages: false,
863            detect_packages_in_compiled: false,
864            detect_copyrights: false,
865            detect_generated: false,
866            detect_emails: false,
867            detect_urls: false,
868            max_emails: 50,
869            max_urls: 50,
870            timeout_seconds: 120.0,
871        };
872        let scanned = scan_single_file("Dockerfile", "FROM scratch\n", &options);
873
874        assert_eq!(scanned.programming_language.as_deref(), Some("Dockerfile"));
875        assert_eq!(
876            scanned.file_type_label.as_deref(),
877            Some("UTF-8 Unicode text")
878        );
879        assert_eq!(scanned.is_source, Some(true));
880        assert_eq!(scanned.is_script, Some(false));
881    }
882
883    #[test]
884    fn scanner_treats_makefile_as_text_not_source() {
885        let options = TextDetectionOptions {
886            collect_info: true,
887            detect_packages: false,
888            detect_application_packages: false,
889            detect_system_packages: false,
890            detect_packages_in_compiled: false,
891            detect_copyrights: false,
892            detect_generated: false,
893            detect_emails: false,
894            detect_urls: false,
895            max_emails: 50,
896            max_urls: 50,
897            timeout_seconds: 120.0,
898        };
899        let scanned = scan_single_file("Makefile", "all:\n\techo hi\n", &options);
900
901        assert_eq!(scanned.programming_language, None);
902        assert_eq!(
903            scanned.file_type_label.as_deref(),
904            Some("UTF-8 Unicode text")
905        );
906        assert_eq!(scanned.is_text, Some(true));
907        assert_eq!(scanned.is_source, Some(false));
908        assert_eq!(scanned.is_script, Some(false));
909    }
910
911    #[test]
912    fn scanner_omits_info_surface_when_disabled() {
913        let options = TextDetectionOptions {
914            collect_info: false,
915            detect_packages: false,
916            detect_application_packages: false,
917            detect_system_packages: false,
918            detect_packages_in_compiled: false,
919            detect_copyrights: false,
920            detect_generated: false,
921            detect_emails: false,
922            detect_urls: false,
923            max_emails: 50,
924            max_urls: 50,
925            timeout_seconds: 120.0,
926        };
927        let scanned = scan_single_file(
928            "script.py",
929            "#!/usr/bin/env python3\nprint(\"hello\")\n",
930            &options,
931        );
932
933        assert!(scanned.sha1.is_none());
934        assert!(scanned.md5.is_none());
935        assert!(scanned.sha256.is_none());
936        assert!(scanned.sha1_git.is_none());
937        assert!(scanned.mime_type.is_none());
938        assert!(scanned.date.is_none());
939        assert!(scanned.programming_language.is_none());
940        assert!(scanned.is_binary.is_none());
941        assert!(scanned.is_text.is_none());
942        assert!(scanned.is_archive.is_none());
943        assert!(scanned.is_media.is_none());
944        assert!(scanned.is_script.is_none());
945        assert!(scanned.is_source.is_none());
946    }
947
948    #[test]
949    fn scanner_skips_package_parsing_when_disabled() {
950        let options = TextDetectionOptions {
951            collect_info: false,
952            detect_packages: false,
953            detect_application_packages: false,
954            detect_system_packages: false,
955            detect_packages_in_compiled: false,
956            detect_copyrights: false,
957            detect_generated: false,
958            detect_emails: false,
959            detect_urls: false,
960            max_emails: 50,
961            max_urls: 50,
962            timeout_seconds: 120.0,
963        };
964        let scanned = scan_single_file(
965            "package.json",
966            r#"{"name":"demo","version":"1.0.0"}"#,
967            &options,
968        );
969
970        assert!(
971            scanned.package_data.is_empty(),
972            "package_data: {:#?}",
973            scanned.package_data
974        );
975    }
976
977    #[test]
978    fn scanner_parses_package_manifests_when_enabled() {
979        let options = TextDetectionOptions {
980            collect_info: false,
981            detect_packages: true,
982            detect_application_packages: true,
983            detect_system_packages: false,
984            detect_packages_in_compiled: false,
985            detect_copyrights: false,
986            detect_generated: false,
987            detect_emails: false,
988            detect_urls: false,
989            max_emails: 50,
990            max_urls: 50,
991            timeout_seconds: 120.0,
992        };
993        let scanned = scan_single_file(
994            "package.json",
995            r#"{"name":"demo","version":"1.0.0"}"#,
996            &options,
997        );
998
999        assert_eq!(
1000            scanned.package_data.len(),
1001            1,
1002            "package_data: {:#?}",
1003            scanned.package_data
1004        );
1005    }
1006
1007    #[test]
1008    fn scanner_skips_application_packages_when_only_system_packages_enabled() {
1009        let options = TextDetectionOptions {
1010            collect_info: false,
1011            detect_packages: true,
1012            detect_application_packages: false,
1013            detect_system_packages: true,
1014            detect_packages_in_compiled: false,
1015            detect_copyrights: false,
1016            detect_generated: false,
1017            detect_emails: false,
1018            detect_urls: false,
1019            max_emails: 50,
1020            max_urls: 50,
1021            timeout_seconds: 120.0,
1022        };
1023        let scanned = scan_single_file(
1024            "package.json",
1025            r#"{"name":"demo","version":"1.0.0"}"#,
1026            &options,
1027        );
1028
1029        assert!(
1030            scanned.package_data.is_empty(),
1031            "package_data: {:#?}",
1032            scanned.package_data
1033        );
1034    }
1035
1036    #[test]
1037    fn scanner_parses_system_package_files_when_enabled() {
1038        let options = TextDetectionOptions {
1039            collect_info: false,
1040            detect_packages: true,
1041            detect_application_packages: false,
1042            detect_system_packages: true,
1043            detect_packages_in_compiled: false,
1044            detect_copyrights: false,
1045            detect_generated: false,
1046            detect_emails: false,
1047            detect_urls: false,
1048            max_emails: 50,
1049            max_urls: 50,
1050            timeout_seconds: 120.0,
1051        };
1052        let scanned = scan_file_at_relative_path(
1053            "var/lib/dpkg/status",
1054            b"Package: demo\nVersion: 1.0\nArchitecture: all\nDescription: demo package\n\n",
1055            &options,
1056        );
1057
1058        assert!(
1059            !scanned.package_data.is_empty(),
1060            "package_data: {:#?}",
1061            scanned.package_data
1062        );
1063    }
1064
1065    #[test]
1066    fn scanner_only_parses_compiled_packages_when_package_in_compiled_is_enabled() {
1067        if std::process::Command::new("go")
1068            .arg("version")
1069            .status()
1070            .is_err()
1071        {
1072            return;
1073        }
1074
1075        let temp_dir = TempDir::new().expect("create temp dir");
1076        fs::write(
1077            temp_dir.path().join("go.mod"),
1078            "module example.com/demo\n\ngo 1.23.0\n",
1079        )
1080        .expect("write go.mod");
1081        fs::write(
1082            temp_dir.path().join("main.go"),
1083            "package main\nfunc main() {}\n",
1084        )
1085        .expect("write main.go");
1086        let file_path = temp_dir.path().join("demo");
1087        let status = std::process::Command::new("go")
1088            .current_dir(temp_dir.path())
1089            .args(["build", "-o"])
1090            .arg(&file_path)
1091            .status()
1092            .expect("run go build");
1093        assert!(status.success());
1094
1095        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1096        let collected = collect_paths(temp_dir.path(), 0, &[]);
1097
1098        let without_compiled = process_collected(
1099            &collected,
1100            Arc::clone(&progress),
1101            None,
1102            LicenseScanOptions::default(),
1103            &TextDetectionOptions {
1104                collect_info: false,
1105                detect_packages: true,
1106                detect_application_packages: true,
1107                detect_system_packages: false,
1108                detect_packages_in_compiled: false,
1109                detect_copyrights: false,
1110                detect_generated: false,
1111                detect_emails: false,
1112                detect_urls: false,
1113                max_emails: 50,
1114                max_urls: 50,
1115                timeout_seconds: 120.0,
1116            },
1117        );
1118        let with_compiled = process_collected(
1119            &collected,
1120            progress,
1121            None,
1122            LicenseScanOptions::default(),
1123            &TextDetectionOptions {
1124                collect_info: false,
1125                detect_packages: true,
1126                detect_application_packages: true,
1127                detect_system_packages: false,
1128                detect_packages_in_compiled: true,
1129                detect_copyrights: false,
1130                detect_generated: false,
1131                detect_emails: false,
1132                detect_urls: false,
1133                max_emails: 50,
1134                max_urls: 50,
1135                timeout_seconds: 120.0,
1136            },
1137        );
1138
1139        let without_compiled = without_compiled
1140            .files
1141            .into_iter()
1142            .find(|entry| entry.file_type == FileType::File && entry.path.ends_with("/demo"))
1143            .expect("compiled artifact present");
1144        let with_compiled = with_compiled
1145            .files
1146            .into_iter()
1147            .find(|entry| entry.file_type == FileType::File && entry.path.ends_with("/demo"))
1148            .expect("compiled artifact present");
1149
1150        assert!(
1151            without_compiled.package_data.is_empty(),
1152            "package_data: {:#?}",
1153            without_compiled.package_data
1154        );
1155        assert!(!with_compiled.package_data.is_empty());
1156    }
1157
1158    #[test]
1159    fn scanner_parses_windows_executable_packages_under_normal_package_scan() {
1160        let temp_dir = TempDir::new().expect("create temp dir");
1161        let file_path = temp_dir.path().join("libiconv2.dll");
1162        let fixture = fs::read("testdata/compiled-binary-golden/win_pe/libiconv2.dll")
1163            .expect("read PE fixture");
1164        fs::write(&file_path, fixture).expect("write PE fixture");
1165
1166        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1167        let collected = collect_paths(temp_dir.path(), 0, &[]);
1168
1169        let without_package = process_collected(
1170            &collected,
1171            Arc::clone(&progress),
1172            None,
1173            LicenseScanOptions::default(),
1174            &TextDetectionOptions {
1175                collect_info: false,
1176                detect_packages: false,
1177                detect_application_packages: false,
1178                detect_system_packages: false,
1179                detect_packages_in_compiled: false,
1180                detect_copyrights: false,
1181                detect_generated: false,
1182                detect_emails: false,
1183                detect_urls: false,
1184                max_emails: 50,
1185                max_urls: 50,
1186                timeout_seconds: 120.0,
1187            },
1188        );
1189        let with_package = process_collected(
1190            &collected,
1191            progress,
1192            None,
1193            LicenseScanOptions::default(),
1194            &TextDetectionOptions {
1195                collect_info: false,
1196                detect_packages: true,
1197                detect_application_packages: true,
1198                detect_system_packages: false,
1199                detect_packages_in_compiled: false,
1200                detect_copyrights: false,
1201                detect_generated: false,
1202                detect_emails: false,
1203                detect_urls: false,
1204                max_emails: 50,
1205                max_urls: 50,
1206                timeout_seconds: 120.0,
1207            },
1208        );
1209
1210        let without_package = without_package
1211            .files
1212            .into_iter()
1213            .find(|entry| {
1214                entry.file_type == FileType::File && entry.path.ends_with("/libiconv2.dll")
1215            })
1216            .expect("compiled artifact present");
1217        let with_package = with_package
1218            .files
1219            .into_iter()
1220            .find(|entry| {
1221                entry.file_type == FileType::File && entry.path.ends_with("/libiconv2.dll")
1222            })
1223            .expect("compiled artifact present");
1224
1225        assert!(without_package.package_data.is_empty());
1226        assert_eq!(with_package.package_data.len(), 1);
1227        assert_eq!(
1228            with_package.package_data[0].package_type,
1229            Some(FilePackageType::Winexe)
1230        );
1231        assert_eq!(
1232            with_package.package_data[0].datasource_id,
1233            Some(DatasourceId::WindowsExecutable)
1234        );
1235    }
1236
1237    #[test]
1238    fn scanner_detects_license_from_font_metadata() {
1239        let temp_dir = TempDir::new().expect("create temp dir");
1240        let file_path = temp_dir.path().join("Lato-Bold.ttf");
1241        let fixture = fs::read("testdata/font-fixtures/Lato-Bold.ttf").expect("read font fixture");
1242        fs::write(&file_path, fixture).expect("write font fixture");
1243
1244        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1245        let collected = collect_paths(temp_dir.path(), 0, &[]);
1246        let engine =
1247            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
1248        let result = process_collected(
1249            &collected,
1250            progress,
1251            Some(engine),
1252            LicenseScanOptions::default(),
1253            &TextDetectionOptions::default(),
1254        );
1255        let scanned = result
1256            .files
1257            .into_iter()
1258            .find(|entry| {
1259                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1260            })
1261            .expect("scanned file entry");
1262
1263        assert!(
1264            scanned.license_expression.is_some(),
1265            "license detections: {:#?}",
1266            scanned.license_detections
1267        );
1268        assert!(
1269            scanned
1270                .license_expression
1271                .as_deref()
1272                .is_some_and(
1273                    |expression| expression.contains("OFL-1.1") || expression.contains("ofl-1.1")
1274                ),
1275            "license expression: {:?}",
1276            scanned.license_expression
1277        );
1278    }
1279
1280    #[test]
1281    fn scanner_detects_license_from_windows_executable_metadata() {
1282        let temp_dir = TempDir::new().expect("create temp dir");
1283        let file_path = temp_dir.path().join("libiconv2.dll");
1284        let fixture = fs::read("testdata/compiled-binary-golden/win_pe/libiconv2.dll")
1285            .expect("read PE fixture");
1286        fs::write(&file_path, fixture).expect("write PE fixture");
1287
1288        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1289        let collected = collect_paths(temp_dir.path(), 0, &[]);
1290        let engine =
1291            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
1292        let result = process_collected(
1293            &collected,
1294            progress,
1295            Some(engine),
1296            LicenseScanOptions::default(),
1297            &TextDetectionOptions::default(),
1298        );
1299        let scanned = result
1300            .files
1301            .into_iter()
1302            .find(|entry| {
1303                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1304            })
1305            .expect("scanned file entry");
1306
1307        assert!(
1308            scanned.license_expression.is_some(),
1309            "license detections: {:#?}",
1310            scanned.license_detections
1311        );
1312        assert!(
1313            scanned
1314                .license_expression
1315                .as_deref()
1316                .is_some_and(|expression| {
1317                    expression.contains("lgpl") || expression.contains("LGPL")
1318                }),
1319            "license expression: {:?}",
1320            scanned.license_expression
1321        );
1322    }
1323
1324    #[test]
1325    fn scanner_detects_cc_by_license_from_markdown_comment_banner() {
1326        let scanned = scan_single_file_with_license_engine(
1327            "navbar.md",
1328            "<!-- Documentation licensed under CC BY 4.0 -->\n<!-- License available at https://creativecommons.org/licenses/by/4.0/ -->\n",
1329            &TextDetectionOptions::default(),
1330        );
1331
1332        assert!(
1333            scanned
1334                .license_expression
1335                .as_deref()
1336                .is_some_and(|expression| {
1337                    expression.contains("cc-by-4.0") || expression.contains("CC-BY-4.0")
1338                }),
1339            "license expression: {:?}",
1340            scanned.license_expression
1341        );
1342    }
1343
1344    #[test]
1345    fn scanner_detects_mit_license_from_shields_badge_markdown() {
1346        let scanned = scan_single_file_with_license_engine(
1347            "README.md",
1348            "[![](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)\n",
1349            &TextDetectionOptions::default(),
1350        );
1351
1352        assert!(
1353            scanned
1354                .license_expression
1355                .as_deref()
1356                .is_some_and(|expression| {
1357                    expression.contains("mit") || expression.contains("MIT")
1358                }),
1359            "license expression: {:?}",
1360            scanned.license_expression
1361        );
1362    }
1363
1364    #[test]
1365    fn scanner_detects_apache_license_from_markdown_readme_phrase() {
1366        let scanned = scan_single_file_with_license_engine(
1367            "README.md",
1368            "This crate is distributed under the terms of the Apache License (Version 2.0).\n",
1369            &TextDetectionOptions::default(),
1370        );
1371
1372        assert!(
1373            scanned
1374                .license_expression
1375                .as_deref()
1376                .is_some_and(|expression| {
1377                    expression.contains("apache-2.0") || expression.contains("Apache-2.0")
1378                }),
1379            "license expression: {:?}",
1380            scanned.license_expression
1381        );
1382    }
1383
1384    #[test]
1385    fn scanner_sets_is_source_only_when_info_enabled() {
1386        let without_info = TextDetectionOptions {
1387            collect_info: false,
1388            detect_packages: false,
1389            detect_application_packages: false,
1390            detect_system_packages: false,
1391            detect_packages_in_compiled: false,
1392            detect_copyrights: false,
1393            detect_generated: false,
1394            detect_emails: false,
1395            detect_urls: false,
1396            max_emails: 50,
1397            max_urls: 50,
1398            timeout_seconds: 120.0,
1399        };
1400        let with_info = TextDetectionOptions {
1401            collect_info: true,
1402            ..without_info.clone()
1403        };
1404
1405        let scanned_without_info = scan_single_file("main.rs", "fn main() {}\n", &without_info);
1406        let scanned_with_info = scan_single_file("main.rs", "fn main() {}\n", &with_info);
1407
1408        assert_eq!(scanned_without_info.is_source, None);
1409        assert_eq!(scanned_with_info.is_source, Some(true));
1410    }
1411
1412    #[test]
1413    fn directory_omits_info_fields_when_info_disabled() {
1414        let temp_dir = TempDir::new().expect("create temp dir");
1415        fs::create_dir_all(temp_dir.path().join("nested")).expect("create nested dir");
1416
1417        let collected = collect_paths(temp_dir.path(), 0, &[]);
1418        let result = process_collected(
1419            &collected,
1420            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1421            None,
1422            LicenseScanOptions::default(),
1423            &TextDetectionOptions {
1424                collect_info: false,
1425                detect_packages: false,
1426                detect_application_packages: false,
1427                detect_system_packages: false,
1428                detect_packages_in_compiled: false,
1429                detect_copyrights: false,
1430                detect_generated: false,
1431                detect_emails: false,
1432                detect_urls: false,
1433                max_emails: 50,
1434                max_urls: 50,
1435                timeout_seconds: 120.0,
1436            },
1437        );
1438
1439        let directory = result
1440            .files
1441            .into_iter()
1442            .find(|entry| entry.file_type == FileType::Directory && entry.path.ends_with("nested"))
1443            .expect("directory entry");
1444
1445        assert!(directory.date.is_none());
1446        assert!(directory.file_type_label.is_none());
1447        assert!(directory.is_binary.is_none());
1448        assert!(directory.is_text.is_none());
1449        assert!(directory.is_archive.is_none());
1450        assert!(directory.is_media.is_none());
1451        assert!(directory.is_source.is_none());
1452        assert!(directory.is_script.is_none());
1453    }
1454
1455    #[test]
1456    fn directory_includes_info_fields_when_info_enabled() {
1457        let temp_dir = TempDir::new().expect("create temp dir");
1458        fs::create_dir_all(temp_dir.path().join("nested")).expect("create nested dir");
1459
1460        let collected = collect_paths(temp_dir.path(), 0, &[]);
1461        let result = process_collected(
1462            &collected,
1463            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1464            None,
1465            LicenseScanOptions::default(),
1466            &TextDetectionOptions {
1467                collect_info: true,
1468                detect_packages: false,
1469                detect_application_packages: false,
1470                detect_system_packages: false,
1471                detect_packages_in_compiled: false,
1472                detect_copyrights: false,
1473                detect_generated: false,
1474                detect_emails: false,
1475                detect_urls: false,
1476                max_emails: 50,
1477                max_urls: 50,
1478                timeout_seconds: 120.0,
1479            },
1480        );
1481
1482        let directory = result
1483            .files
1484            .into_iter()
1485            .find(|entry| entry.file_type == FileType::Directory && entry.path.ends_with("nested"))
1486            .expect("directory entry");
1487
1488        assert!(directory.date.is_none());
1489        assert!(directory.file_type_label.is_none());
1490        assert_eq!(directory.is_binary, Some(false));
1491        assert_eq!(directory.is_text, Some(false));
1492        assert_eq!(directory.is_archive, Some(false));
1493        assert_eq!(directory.is_media, Some(false));
1494        assert_eq!(directory.is_source, Some(false));
1495        assert_eq!(directory.is_script, Some(false));
1496        assert_eq!(directory.files_count, Some(0));
1497        assert_eq!(directory.dirs_count, Some(0));
1498        assert_eq!(directory.size_count, Some(0));
1499    }
1500
1501    #[test]
1502    fn collect_paths_includes_root_directory_entry() {
1503        let temp_dir = TempDir::new().expect("create temp dir");
1504        fs::create_dir_all(temp_dir.path().join("src")).expect("create nested dir");
1505        fs::write(temp_dir.path().join("src").join("main.rs"), "fn main() {}")
1506            .expect("write nested file");
1507
1508        let collected = collect_paths(temp_dir.path(), 0, &[]);
1509
1510        assert!(
1511            collected
1512                .directories
1513                .iter()
1514                .any(|(path, _)| path == temp_dir.path())
1515        );
1516    }
1517
1518    #[test]
1519    fn collect_paths_supports_single_file_input() {
1520        let temp_dir = TempDir::new().expect("create temp dir");
1521        let file_path = temp_dir.path().join("main.rs");
1522        fs::write(&file_path, "fn main() {}\n").expect("write file");
1523
1524        let collected = collect_paths(&file_path, 0, &[]);
1525
1526        assert_eq!(collected.files.len(), 1);
1527        assert!(collected.directories.is_empty());
1528        assert_eq!(collected.files[0].0, file_path);
1529    }
1530
1531    #[test]
1532    fn process_collected_with_memory_limit_preserves_results_when_spilling() {
1533        let temp_dir = TempDir::new().expect("create temp dir");
1534        fs::write(temp_dir.path().join("a.txt"), "hello").expect("write first file");
1535        fs::write(temp_dir.path().join("b.txt"), "world").expect("write second file");
1536
1537        let collected = collect_paths(temp_dir.path(), 0, &[]);
1538        let result = process_collected_with_memory_limit(
1539            &collected,
1540            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1541            None,
1542            LicenseScanOptions::default(),
1543            &TextDetectionOptions {
1544                collect_info: false,
1545                detect_packages: false,
1546                detect_application_packages: false,
1547                detect_system_packages: false,
1548                detect_packages_in_compiled: false,
1549                detect_copyrights: false,
1550                detect_generated: false,
1551                detect_emails: false,
1552                detect_urls: false,
1553                max_emails: 50,
1554                max_urls: 50,
1555                timeout_seconds: 120.0,
1556            },
1557            MemoryMode::Limit(1),
1558        );
1559
1560        assert_eq!(result.files.len(), 3);
1561    }
1562
1563    #[test]
1564    fn process_collected_with_negative_one_uses_disk_only_mode() {
1565        let temp_dir = TempDir::new().expect("create temp dir");
1566        fs::write(temp_dir.path().join("a.txt"), "hello").expect("write first file");
1567
1568        let collected = collect_paths(temp_dir.path(), 0, &[]);
1569        let result = process_collected_with_memory_limit(
1570            &collected,
1571            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1572            None,
1573            LicenseScanOptions::default(),
1574            &TextDetectionOptions {
1575                collect_info: false,
1576                detect_packages: false,
1577                detect_application_packages: false,
1578                detect_system_packages: false,
1579                detect_packages_in_compiled: false,
1580                detect_copyrights: false,
1581                detect_generated: false,
1582                detect_emails: false,
1583                detect_urls: false,
1584                max_emails: 50,
1585                max_urls: 50,
1586                timeout_seconds: 120.0,
1587            },
1588            MemoryMode::StreamUnlimited,
1589        );
1590
1591        assert_eq!(result.files.len(), 2);
1592    }
1593}