Skip to main content

provenant/scanner/
mod.rs

1mod collect;
2mod process;
3
4use crate::license_detection::LicenseDetectionEngine;
5use crate::models::FileInfo;
6
7pub struct ProcessResult {
8    pub files: Vec<FileInfo>,
9    pub excluded_count: usize,
10}
11
/// Switches that tune license scanning.
///
/// The derived `Default` leaves every flag `false` and `min_score` at `0`.
/// `PartialEq`/`Eq` are derived so option sets can be compared directly
/// (for example with `assert_eq!`) instead of field by field.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct LicenseScanOptions {
    /// NOTE(review): presumably includes the matched license text in results — confirm.
    pub include_text: bool,
    /// NOTE(review): presumably adds diagnostics about the matched text — confirm.
    pub include_text_diagnostics: bool,
    /// NOTE(review): presumably adds license-detection diagnostics — confirm.
    pub include_diagnostics: bool,
    /// NOTE(review): presumably enables reporting of unknown licenses — confirm.
    pub unknown_licenses: bool,
    /// Minimum score threshold; emitted as `license_score` in the scan fingerprint.
    /// NOTE(review): range and exact filtering semantics are not visible here.
    pub min_score: u8,
}
20
/// Switches and limits that control the non-license detectors run on each file.
///
/// `PartialEq` is derived so two option sets can be compared directly; `Eq` is
/// not available because `timeout_seconds` is an `f64`.
#[derive(Debug, Clone, PartialEq)]
pub struct TextDetectionOptions {
    /// Populate the file "info" surface (checksums, MIME type, language, type flags).
    pub collect_info: bool,
    /// NOTE(review): presumably the master switch for package detection — confirm.
    pub detect_packages: bool,
    /// NOTE(review): presumably limits/enables application package detection — confirm.
    pub detect_application_packages: bool,
    /// NOTE(review): presumably limits/enables system package detection — confirm.
    pub detect_system_packages: bool,
    /// NOTE(review): presumably enables package detection inside compiled files — confirm.
    pub detect_packages_in_compiled: bool,
    /// Detect copyrights; the only detector enabled by `Default`.
    pub detect_copyrights: bool,
    /// Set the `is_generated` flag on scanned files (left as `None` when disabled).
    pub detect_generated: bool,
    /// Detect email addresses.
    pub detect_emails: bool,
    /// Detect URLs.
    pub detect_urls: bool,
    /// NOTE(review): presumably the maximum number of emails reported per file — confirm.
    pub max_emails: usize,
    /// NOTE(review): presumably the maximum number of URLs reported per file — confirm.
    pub max_urls: usize,
    /// Timeout in seconds.
    /// NOTE(review): presumably applied per file — confirm in `process`.
    pub timeout_seconds: f64,
}
36
37impl Default for TextDetectionOptions {
38    fn default() -> Self {
39        Self {
40            collect_info: false,
41            detect_packages: false,
42            detect_application_packages: false,
43            detect_system_packages: false,
44            detect_packages_in_compiled: false,
45            detect_copyrights: true,
46            detect_generated: false,
47            detect_emails: false,
48            detect_urls: false,
49            max_emails: 50,
50            max_urls: 50,
51            timeout_seconds: 120.0,
52        }
53    }
54}
55
56pub fn scan_options_fingerprint(
57    text_options: &TextDetectionOptions,
58    license_options: LicenseScanOptions,
59    license_engine: Option<&LicenseDetectionEngine>,
60) -> String {
61    let (license_enabled, rules_count, first_rule_id, last_rule_id) = match license_engine {
62        Some(engine) => {
63            let rules = &engine.index().rules_by_rid;
64            (
65                true,
66                rules.len(),
67                rules
68                    .first()
69                    .map(|rule| rule.identifier.as_str())
70                    .unwrap_or(""),
71                rules
72                    .last()
73                    .map(|rule| rule.identifier.as_str())
74                    .unwrap_or(""),
75            )
76        }
77        None => (false, 0, "", ""),
78    };
79
80    format!(
81        "tool_version={};info={};packages={};app_packages={};system_packages={};compiled_packages={};copyrights={};generated={};emails={};urls={};max_emails={};max_urls={};timeout={:.6};license_enabled={};rules_count={};first_rule_id={};last_rule_id={};license_text={};license_text_diagnostics={};license_diagnostics={};unknown_licenses={};license_score={}",
82        env!("CARGO_PKG_VERSION"),
83        text_options.collect_info,
84        text_options.detect_packages,
85        text_options.detect_application_packages,
86        text_options.detect_system_packages,
87        text_options.detect_packages_in_compiled,
88        text_options.detect_copyrights,
89        text_options.detect_generated,
90        text_options.detect_emails,
91        text_options.detect_urls,
92        text_options.max_emails,
93        text_options.max_urls,
94        text_options.timeout_seconds,
95        license_enabled,
96        rules_count,
97        first_rule_id,
98        last_rule_id,
99        license_options.include_text,
100        license_options.include_text_diagnostics,
101        license_options.include_diagnostics,
102        license_options.unknown_licenses,
103        license_options.min_score,
104    )
105}
106
107pub use self::collect::{CollectedPaths, collect_paths};
108#[allow(unused_imports)]
109pub use self::process::{process_collected, process_collected_with_memory_limit};
110
111#[cfg(test)]
112mod tests {
113    use std::fs;
114    use std::sync::Arc;
115
116    use tempfile::TempDir;
117
118    use crate::license_detection::LicenseDetectionEngine;
119    use crate::models::{DatasourceId, FileType, PackageType as FilePackageType};
120    use crate::progress::{ProgressMode, ScanProgress};
121
122    use super::{
123        LicenseScanOptions, TextDetectionOptions, collect_paths, process_collected,
124        process_collected_with_memory_limit,
125    };
126
127    #[test]
128    fn default_options_keep_copyright_detection_enabled() {
129        let options = TextDetectionOptions::default();
130        assert!(!options.detect_packages);
131        assert!(options.detect_copyrights);
132    }
133
134    fn scan_single_file(
135        file_name: &str,
136        content: &str,
137        options: &TextDetectionOptions,
138    ) -> crate::models::FileInfo {
139        let temp_dir = TempDir::new().expect("create temp dir");
140        let file_path = temp_dir.path().join(file_name);
141        fs::write(&file_path, content).expect("write test file");
142
143        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
144        let collected = collect_paths(temp_dir.path(), 0, &[]);
145        let result = process_collected(
146            &collected,
147            progress,
148            None,
149            LicenseScanOptions::default(),
150            options,
151        );
152
153        result
154            .files
155            .into_iter()
156            .find(|entry| {
157                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
158            })
159            .expect("scanned file entry")
160    }
161
162    fn scan_file_at_relative_path(
163        relative_path: &str,
164        content: &[u8],
165        options: &TextDetectionOptions,
166    ) -> crate::models::FileInfo {
167        let temp_dir = TempDir::new().expect("create temp dir");
168        let file_path = temp_dir.path().join(relative_path);
169        if let Some(parent) = file_path.parent() {
170            fs::create_dir_all(parent).expect("create parent dirs");
171        }
172        fs::write(&file_path, content).expect("write test file");
173
174        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
175        let collected = collect_paths(temp_dir.path(), 0, &[]);
176        let result = process_collected(
177            &collected,
178            progress,
179            None,
180            LicenseScanOptions::default(),
181            options,
182        );
183
184        result
185            .files
186            .into_iter()
187            .find(|entry| {
188                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
189            })
190            .expect("scanned file entry")
191    }
192
193    fn scan_single_file_with_license_engine(
194        file_name: &str,
195        content: &str,
196        options: &TextDetectionOptions,
197    ) -> crate::models::FileInfo {
198        let temp_dir = TempDir::new().expect("create temp dir");
199        let file_path = temp_dir.path().join(file_name);
200        fs::write(&file_path, content).expect("write test file");
201
202        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
203        let collected = collect_paths(temp_dir.path(), 0, &[]);
204        let engine =
205            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
206        let result = process_collected(
207            &collected,
208            progress,
209            Some(engine),
210            LicenseScanOptions::default(),
211            options,
212        );
213
214        result
215            .files
216            .into_iter()
217            .find(|entry| {
218                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
219            })
220            .expect("scanned file entry")
221    }
222
223    #[test]
224    fn scanner_reports_repeated_email_occurrences() {
225        let options = TextDetectionOptions {
226            collect_info: false,
227            detect_packages: false,
228            detect_application_packages: false,
229            detect_system_packages: false,
230            detect_packages_in_compiled: false,
231            detect_copyrights: false,
232            detect_generated: false,
233            detect_emails: true,
234            detect_urls: false,
235            max_emails: 50,
236            max_urls: 50,
237            timeout_seconds: 120.0,
238        };
239        let scanned = scan_single_file(
240            "contacts.txt",
241            "linux@3ware.com\nlinux@3ware.com\nandre@suse.com\nlinux@3ware.com\n",
242            &options,
243        );
244
245        let emails: Vec<(&str, usize)> = scanned
246            .emails
247            .iter()
248            .map(|email| (email.email.as_str(), email.start_line))
249            .collect();
250
251        assert_eq!(emails.len(), 4, "emails: {emails:#?}");
252        assert_eq!(
253            emails,
254            vec![
255                ("linux@3ware.com", 1),
256                ("linux@3ware.com", 2),
257                ("andre@suse.com", 3),
258                ("linux@3ware.com", 4),
259            ]
260        );
261    }
262
263    #[test]
264    fn scanner_skips_pem_certificate_text_detection() {
265        let options = TextDetectionOptions {
266            collect_info: false,
267            detect_packages: false,
268            detect_application_packages: false,
269            detect_system_packages: false,
270            detect_packages_in_compiled: false,
271            detect_copyrights: true,
272            detect_generated: false,
273            detect_emails: true,
274            detect_urls: true,
275            max_emails: 50,
276            max_urls: 50,
277            timeout_seconds: 120.0,
278        };
279        let pem_fixture = concat!(
280            "-----BEGIN CERTIFICATE-----\n",
281            "MIID8TCCAtmgAwIBAgIQQT1yx/RrH4FDffHSKFTfmjANBgkqhkiG9w0BAQUFADCB\n",
282            "ijELMAkGA1UEBhMCQ0gxEDAOBgNVBAoTB1dJU2VLZXkxGzAZBgNVBAsTEkNvcHly\n",
283            "-----END CERTIFICATE-----\n",
284            "Certificate:\n",
285            "    Data:\n",
286            "        Signature Algorithm: sha1WithRSAEncryption\n",
287            "        Issuer: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
288            "        Subject: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
289            "        Contact: cert-owner@example.com\n",
290        );
291        let scanned = scan_single_file("cert.pem", pem_fixture, &options);
292
293        assert!(
294            scanned.copyrights.is_empty(),
295            "copyrights: {:#?}",
296            scanned.copyrights
297        );
298        assert!(
299            scanned.holders.is_empty(),
300            "holders: {:#?}",
301            scanned.holders
302        );
303        assert!(
304            scanned.authors.is_empty(),
305            "authors: {:#?}",
306            scanned.authors
307        );
308        assert!(scanned.emails.is_empty(), "emails: {:#?}", scanned.emails);
309        assert!(scanned.urls.is_empty(), "urls: {:#?}", scanned.urls);
310        assert!(
311            scanned.license_detections.is_empty(),
312            "licenses: {:#?}",
313            scanned.license_detections
314        );
315        assert!(
316            scanned.license_clues.is_empty(),
317            "license clues: {:#?}",
318            scanned.license_clues
319        );
320    }
321
322    #[test]
323    fn scanner_keeps_source_headers_when_pem_blocks_are_embedded() {
324        let options = TextDetectionOptions {
325            collect_info: false,
326            detect_packages: false,
327            detect_application_packages: false,
328            detect_system_packages: false,
329            detect_packages_in_compiled: false,
330            detect_copyrights: true,
331            detect_generated: false,
332            detect_emails: false,
333            detect_urls: true,
334            max_emails: 50,
335            max_urls: 50,
336            timeout_seconds: 120.0,
337        };
338        let fixture = concat!(
339            "/*\n",
340            "Copyright 2022 The Kubernetes Authors.\n\n",
341            "Licensed under the Apache License, Version 2.0 (the \"License\");\n",
342            "you may not use this file except in compliance with the License.\n",
343            "You may obtain a copy of the License at\n\n",
344            "    http://www.apache.org/licenses/LICENSE-2.0\n",
345            "*/\n\n",
346            "package storage\n\n",
347            "const validCert = `\n",
348            "-----BEGIN CERTIFICATE-----\n",
349            "MIIDmTCCAoGgAwIBAgIUWQ==\n",
350            "-----END CERTIFICATE-----\n",
351            "`\n",
352        );
353        let temp_dir = TempDir::new().expect("create temp dir");
354        let file_path = temp_dir.path().join("storage_test.go");
355        fs::write(&file_path, fixture).expect("write fixture");
356
357        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
358        let collected = collect_paths(temp_dir.path(), 0, &[]);
359        let engine =
360            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
361        let result = process_collected(
362            &collected,
363            progress,
364            Some(engine),
365            LicenseScanOptions::default(),
366            &options,
367        );
368        let scanned = result
369            .files
370            .into_iter()
371            .find(|entry| {
372                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
373            })
374            .expect("scanned file entry");
375
376        assert!(
377            scanned
378                .copyrights
379                .iter()
380                .any(|c| c.copyright == "Copyright 2022 The Kubernetes Authors"),
381            "copyrights: {:#?}",
382            scanned.copyrights
383        );
384        assert!(
385            scanned
386                .holders
387                .iter()
388                .any(|h| h.holder == "The Kubernetes Authors"),
389            "holders: {:#?}",
390            scanned.holders
391        );
392        assert!(
393            scanned
394                .urls
395                .iter()
396                .any(|u| u.url == "http://www.apache.org/licenses/LICENSE-2.0"),
397            "urls: {:#?}",
398            scanned.urls
399        );
400        assert_eq!(scanned.license_expression.as_deref(), Some("Apache-2.0"));
401    }
402
403    #[test]
404    fn scanner_detects_structured_credits_authors() {
405        let options = TextDetectionOptions {
406            collect_info: false,
407            detect_packages: false,
408            detect_application_packages: false,
409            detect_system_packages: false,
410            detect_packages_in_compiled: false,
411            detect_copyrights: true,
412            detect_generated: false,
413            detect_emails: false,
414            detect_urls: false,
415            max_emails: 50,
416            max_urls: 50,
417            timeout_seconds: 120.0,
418        };
419        let credits_fixture = concat!(
420            "N: Jack Lloyd\n",
421            "E: lloyd@randombit.net\n",
422            "W: http://www.randombit.net/\n",
423        );
424        let scanned = scan_single_file("CREDITS", credits_fixture, &options);
425
426        let authors: Vec<(&str, usize, usize)> = scanned
427            .authors
428            .iter()
429            .map(|author| (author.author.as_str(), author.start_line, author.end_line))
430            .collect();
431
432        assert_eq!(
433            authors,
434            vec![(
435                "Jack Lloyd lloyd@randombit.net http://www.randombit.net/",
436                1,
437                3,
438            )]
439        );
440        assert!(scanned.copyrights.is_empty());
441        assert!(scanned.holders.is_empty());
442    }
443
444    #[test]
445    fn scanner_uses_or_for_alternative_license_header() {
446        let fixture =
447            include_str!("../../testdata/license-golden/datadriven/external/boost-json-d2s.ipp");
448        let temp_dir = TempDir::new().expect("create temp dir");
449        let file_path = temp_dir.path().join("d2s.ipp");
450        fs::write(&file_path, fixture).expect("write fixture");
451
452        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
453        let collected = collect_paths(temp_dir.path(), 0, &[]);
454        let engine =
455            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
456        let result = process_collected(
457            &collected,
458            progress,
459            Some(engine),
460            LicenseScanOptions::default(),
461            &TextDetectionOptions::default(),
462        );
463        let scanned = result
464            .files
465            .into_iter()
466            .find(|entry| {
467                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
468            })
469            .expect("scanned file entry");
470
471        assert_eq!(
472            scanned.license_expression.as_deref(),
473            Some("Apache-2.0 OR BSL-1.0")
474        );
475        assert!(
476            scanned.license_clues.is_empty(),
477            "license clues: {:#?}",
478            scanned.license_clues
479        );
480        assert_eq!(
481            scanned.license_detections.len(),
482            1,
483            "detections: {:#?}",
484            scanned.license_detections
485        );
486
487        let detection = &scanned.license_detections[0];
488        assert_eq!(detection.license_expression_spdx, "Apache-2.0 OR BSL-1.0");
489
490        let match_expressions: Vec<_> = detection
491            .matches
492            .iter()
493            .map(|m| m.license_expression_spdx.as_str())
494            .collect();
495        assert_eq!(match_expressions, vec!["Apache-2.0", "BSL-1.0"]);
496    }
497
498    #[test]
499    fn scanner_sets_generated_flag_when_enabled() {
500        let options = TextDetectionOptions {
501            collect_info: false,
502            detect_packages: false,
503            detect_application_packages: false,
504            detect_system_packages: false,
505            detect_packages_in_compiled: false,
506            detect_copyrights: false,
507            detect_generated: true,
508            detect_emails: false,
509            detect_urls: false,
510            max_emails: 50,
511            max_urls: 50,
512            timeout_seconds: 120.0,
513        };
514        let scanned = scan_single_file(
515            "generated.c",
516            "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
517            &options,
518        );
519
520        assert_eq!(scanned.is_generated, Some(true));
521    }
522
523    #[test]
524    fn scanner_leaves_generated_flag_unset_when_disabled() {
525        let options = TextDetectionOptions {
526            collect_info: false,
527            detect_packages: false,
528            detect_application_packages: false,
529            detect_system_packages: false,
530            detect_packages_in_compiled: false,
531            detect_copyrights: false,
532            detect_generated: false,
533            detect_emails: false,
534            detect_urls: false,
535            max_emails: 50,
536            max_urls: 50,
537            timeout_seconds: 120.0,
538        };
539        let scanned = scan_single_file(
540            "generated.c",
541            "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
542            &options,
543        );
544
545        assert_eq!(scanned.is_generated, None);
546    }
547
548    #[test]
549    fn scanner_populates_info_surface_when_enabled() {
550        let options = TextDetectionOptions {
551            collect_info: true,
552            detect_packages: false,
553            detect_application_packages: false,
554            detect_system_packages: false,
555            detect_packages_in_compiled: false,
556            detect_copyrights: false,
557            detect_generated: false,
558            detect_emails: false,
559            detect_urls: false,
560            max_emails: 50,
561            max_urls: 50,
562            timeout_seconds: 120.0,
563        };
564        let scanned = scan_single_file(
565            "script.py",
566            "#!/usr/bin/env python3\nprint(\"hello\")\n",
567            &options,
568        );
569
570        assert!(scanned.sha1.is_some());
571        assert!(scanned.md5.is_some());
572        assert!(scanned.sha256.is_some());
573        assert!(scanned.sha1_git.is_some());
574        assert!(scanned.mime_type.is_some());
575        assert!(scanned.date.is_some());
576        assert_eq!(scanned.programming_language.as_deref(), Some("Python"));
577        assert_eq!(scanned.is_text, Some(true));
578        assert_eq!(scanned.is_script, Some(true));
579        assert_eq!(scanned.is_source, Some(true));
580    }
581
582    #[test]
583    fn scanner_treats_latin1_python_sources_as_textual_scripts() {
584        let options = TextDetectionOptions {
585            collect_info: true,
586            detect_packages: false,
587            detect_application_packages: false,
588            detect_system_packages: false,
589            detect_packages_in_compiled: false,
590            detect_copyrights: false,
591            detect_generated: false,
592            detect_emails: false,
593            detect_urls: false,
594            max_emails: 50,
595            max_urls: 50,
596            timeout_seconds: 120.0,
597        };
598        let latin1_python = b"# coding: latin-1\nprint(\"caf\xe9\")\n# comment padding\n";
599        let scanned = scan_file_at_relative_path("script.py", latin1_python, &options);
600
601        assert_eq!(scanned.programming_language.as_deref(), Some("Python"));
602        assert_eq!(
603            scanned.file_type_label.as_deref(),
604            Some("python script, text executable")
605        );
606        assert_eq!(scanned.is_binary, Some(false));
607        assert_eq!(scanned.is_text, Some(true));
608        assert_eq!(scanned.is_script, Some(true));
609        assert_eq!(scanned.is_source, Some(true));
610    }
611
612    #[test]
613    fn scanner_skips_findings_for_zip_like_archives() {
614        let options = TextDetectionOptions {
615            collect_info: true,
616            detect_packages: false,
617            detect_application_packages: false,
618            detect_system_packages: false,
619            detect_packages_in_compiled: false,
620            detect_copyrights: true,
621            detect_generated: false,
622            detect_emails: true,
623            detect_urls: true,
624            max_emails: 50,
625            max_urls: 50,
626            timeout_seconds: 120.0,
627        };
628        let archive_like = b"PK\x03\x04\x14\x00\x00\x00\x08\x00MIT License\ncontact@example.com\nhttps://example.com\n";
629        let scanned = scan_file_at_relative_path("demo.whl", archive_like, &options);
630
631        assert_eq!(scanned.mime_type.as_deref(), Some("application/zip"));
632        assert_eq!(scanned.is_archive, Some(true));
633        assert!(scanned.license_detections.is_empty());
634        assert!(scanned.copyrights.is_empty());
635        assert!(scanned.emails.is_empty());
636        assert!(scanned.urls.is_empty());
637    }
638
639    #[test]
640    fn scanner_treats_typescript_sources_as_text_not_video_media() {
641        let options = TextDetectionOptions {
642            collect_info: true,
643            detect_packages: false,
644            detect_application_packages: false,
645            detect_system_packages: false,
646            detect_packages_in_compiled: false,
647            detect_copyrights: false,
648            detect_generated: false,
649            detect_emails: false,
650            detect_urls: false,
651            max_emails: 50,
652            max_urls: 50,
653            timeout_seconds: 120.0,
654        };
655        let scanned = scan_single_file("main.ts", "export const answer: number = 42;\n", &options);
656
657        assert_eq!(scanned.programming_language.as_deref(), Some("TypeScript"));
658        assert_eq!(scanned.mime_type.as_deref(), Some("text/plain"));
659        assert_eq!(
660            scanned.file_type_label.as_deref(),
661            Some("UTF-8 Unicode text")
662        );
663        assert_eq!(scanned.is_text, Some(true));
664        assert_eq!(scanned.is_media, Some(false));
665        assert_eq!(scanned.is_script, Some(false));
666        assert_eq!(scanned.is_source, Some(true));
667    }
668
669    #[test]
670    fn scanner_normalizes_sparse_ts_files_away_from_video_mime() {
671        let options = TextDetectionOptions {
672            collect_info: true,
673            detect_packages: false,
674            detect_application_packages: false,
675            detect_system_packages: false,
676            detect_packages_in_compiled: false,
677            detect_copyrights: false,
678            detect_generated: false,
679            detect_emails: false,
680            detect_urls: false,
681            max_emails: 50,
682            max_urls: 50,
683            timeout_seconds: 120.0,
684        };
685        let scanned = scan_single_file("main.ts", "// comment-only TypeScript fixture\n", &options);
686
687        assert_eq!(scanned.mime_type.as_deref(), Some("text/plain"));
688        assert_eq!(
689            scanned.file_type_label.as_deref(),
690            Some("UTF-8 Unicode text")
691        );
692        assert_eq!(scanned.is_text, Some(true));
693        assert_eq!(scanned.is_media, Some(false));
694        assert_eq!(scanned.is_script, Some(false));
695        assert_eq!(scanned.is_source, Some(true));
696    }
697
698    #[test]
699    fn scanner_treats_empty_files_like_scancode_info_surface() {
700        let options = TextDetectionOptions {
701            collect_info: true,
702            detect_packages: false,
703            detect_application_packages: false,
704            detect_system_packages: false,
705            detect_packages_in_compiled: false,
706            detect_copyrights: false,
707            detect_generated: false,
708            detect_emails: false,
709            detect_urls: false,
710            max_emails: 50,
711            max_urls: 50,
712            timeout_seconds: 120.0,
713        };
714        let scanned = scan_single_file("test.txt", "", &options);
715
716        assert_eq!(scanned.mime_type.as_deref(), Some("inode/x-empty"));
717        assert_eq!(scanned.file_type_label.as_deref(), Some("empty"));
718        assert_eq!(scanned.programming_language, None);
719        assert_eq!(scanned.is_binary, Some(false));
720        assert_eq!(scanned.is_text, Some(true));
721        assert_eq!(scanned.is_archive, Some(false));
722        assert_eq!(scanned.is_media, Some(false));
723        assert_eq!(scanned.is_source, Some(false));
724        assert_eq!(scanned.is_script, Some(false));
725    }
726
727    #[test]
728    fn scanner_treats_package_json_as_text_not_source() {
729        let options = TextDetectionOptions {
730            collect_info: true,
731            detect_packages: false,
732            detect_application_packages: false,
733            detect_system_packages: false,
734            detect_packages_in_compiled: false,
735            detect_copyrights: false,
736            detect_generated: false,
737            detect_emails: false,
738            detect_urls: false,
739            max_emails: 50,
740            max_urls: 50,
741            timeout_seconds: 120.0,
742        };
743        let scanned = scan_single_file("package.json", r#"{"name":"demo"}"#, &options);
744
745        assert_eq!(scanned.mime_type.as_deref(), Some("application/json"));
746        assert_eq!(scanned.file_type_label.as_deref(), Some("JSON text data"));
747        assert_eq!(scanned.programming_language, None);
748        assert_eq!(scanned.is_text, Some(true));
749        assert_eq!(scanned.is_source, Some(false));
750        assert_eq!(scanned.is_script, Some(false));
751    }
752
753    #[test]
754    fn scanner_classifies_gradle_and_nix_manifests_as_source() {
755        let options = TextDetectionOptions {
756            collect_info: true,
757            detect_packages: false,
758            detect_application_packages: false,
759            detect_system_packages: false,
760            detect_packages_in_compiled: false,
761            detect_copyrights: false,
762            detect_generated: false,
763            detect_emails: false,
764            detect_urls: false,
765            max_emails: 50,
766            max_urls: 50,
767            timeout_seconds: 120.0,
768        };
769
770        let gradle = scan_single_file("build.gradle", "plugins { id 'java' }\n", &options);
771        let nix = scan_single_file("flake.nix", "{ inputs, ... }: {}\n", &options);
772
773        assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
774        assert_eq!(gradle.mime_type.as_deref(), Some("text/plain"));
775        assert_eq!(gradle.is_source, Some(true));
776        assert_eq!(gradle.is_script, Some(false));
777
778        assert_eq!(nix.programming_language.as_deref(), Some("Nix"));
779        assert_eq!(nix.mime_type.as_deref(), Some("text/plain"));
780        assert_eq!(nix.is_source, Some(true));
781        assert_eq!(nix.is_script, Some(false));
782    }
783
784    #[test]
785    fn scanner_treats_gitmodules_as_text_not_source() {
786        let options = TextDetectionOptions {
787            collect_info: true,
788            detect_packages: false,
789            detect_application_packages: false,
790            detect_system_packages: false,
791            detect_packages_in_compiled: false,
792            detect_copyrights: false,
793            detect_generated: false,
794            detect_emails: false,
795            detect_urls: false,
796            max_emails: 50,
797            max_urls: 50,
798            timeout_seconds: 120.0,
799        };
800        let scanned = scan_file_at_relative_path(
801            ".gitmodules",
802            b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
803            &options,
804        );
805
806        assert_eq!(scanned.programming_language, None);
807        assert_eq!(
808            scanned.file_type_label.as_deref(),
809            Some("Git configuration text")
810        );
811        assert_eq!(scanned.is_text, Some(true));
812        assert_eq!(scanned.is_source, Some(false));
813        assert_eq!(scanned.is_script, Some(false));
814    }
815
816    #[test]
817    fn scanner_treats_javascript_shebang_files_as_scripts() {
818        let options = TextDetectionOptions {
819            collect_info: true,
820            detect_packages: false,
821            detect_application_packages: false,
822            detect_system_packages: false,
823            detect_packages_in_compiled: false,
824            detect_copyrights: false,
825            detect_generated: false,
826            detect_emails: false,
827            detect_urls: false,
828            max_emails: 50,
829            max_urls: 50,
830            timeout_seconds: 120.0,
831        };
832        let scanned = scan_file_at_relative_path(
833            "bin/run",
834            b"#!/usr/bin/env node\nconsole.log('hello');\n",
835            &options,
836        );
837
838        assert_eq!(scanned.programming_language.as_deref(), Some("JavaScript"));
839        assert_eq!(
840            scanned.file_type_label.as_deref(),
841            Some("javascript script, UTF-8 Unicode text executable")
842        );
843        assert_eq!(scanned.is_script, Some(true));
844        assert_eq!(scanned.is_source, Some(true));
845    }
846
847    #[test]
848    fn scanner_treats_dockerfile_as_source() {
849        let options = TextDetectionOptions {
850            collect_info: true,
851            detect_packages: false,
852            detect_application_packages: false,
853            detect_system_packages: false,
854            detect_packages_in_compiled: false,
855            detect_copyrights: false,
856            detect_generated: false,
857            detect_emails: false,
858            detect_urls: false,
859            max_emails: 50,
860            max_urls: 50,
861            timeout_seconds: 120.0,
862        };
863        let scanned = scan_single_file("Dockerfile", "FROM scratch\n", &options);
864
865        assert_eq!(scanned.programming_language.as_deref(), Some("Dockerfile"));
866        assert_eq!(
867            scanned.file_type_label.as_deref(),
868            Some("UTF-8 Unicode text")
869        );
870        assert_eq!(scanned.is_source, Some(true));
871        assert_eq!(scanned.is_script, Some(false));
872    }
873
874    #[test]
875    fn scanner_treats_makefile_as_text_not_source() {
876        let options = TextDetectionOptions {
877            collect_info: true,
878            detect_packages: false,
879            detect_application_packages: false,
880            detect_system_packages: false,
881            detect_packages_in_compiled: false,
882            detect_copyrights: false,
883            detect_generated: false,
884            detect_emails: false,
885            detect_urls: false,
886            max_emails: 50,
887            max_urls: 50,
888            timeout_seconds: 120.0,
889        };
890        let scanned = scan_single_file("Makefile", "all:\n\techo hi\n", &options);
891
892        assert_eq!(scanned.programming_language, None);
893        assert_eq!(
894            scanned.file_type_label.as_deref(),
895            Some("UTF-8 Unicode text")
896        );
897        assert_eq!(scanned.is_text, Some(true));
898        assert_eq!(scanned.is_source, Some(false));
899        assert_eq!(scanned.is_script, Some(false));
900    }
901
902    #[test]
903    fn scanner_omits_info_surface_when_disabled() {
904        let options = TextDetectionOptions {
905            collect_info: false,
906            detect_packages: false,
907            detect_application_packages: false,
908            detect_system_packages: false,
909            detect_packages_in_compiled: false,
910            detect_copyrights: false,
911            detect_generated: false,
912            detect_emails: false,
913            detect_urls: false,
914            max_emails: 50,
915            max_urls: 50,
916            timeout_seconds: 120.0,
917        };
918        let scanned = scan_single_file(
919            "script.py",
920            "#!/usr/bin/env python3\nprint(\"hello\")\n",
921            &options,
922        );
923
924        assert!(scanned.sha1.is_none());
925        assert!(scanned.md5.is_none());
926        assert!(scanned.sha256.is_none());
927        assert!(scanned.sha1_git.is_none());
928        assert!(scanned.mime_type.is_none());
929        assert!(scanned.date.is_none());
930        assert!(scanned.programming_language.is_none());
931        assert!(scanned.is_binary.is_none());
932        assert!(scanned.is_text.is_none());
933        assert!(scanned.is_archive.is_none());
934        assert!(scanned.is_media.is_none());
935        assert!(scanned.is_script.is_none());
936        assert!(scanned.is_source.is_none());
937    }
938
939    #[test]
940    fn scanner_skips_package_parsing_when_disabled() {
941        let options = TextDetectionOptions {
942            collect_info: false,
943            detect_packages: false,
944            detect_application_packages: false,
945            detect_system_packages: false,
946            detect_packages_in_compiled: false,
947            detect_copyrights: false,
948            detect_generated: false,
949            detect_emails: false,
950            detect_urls: false,
951            max_emails: 50,
952            max_urls: 50,
953            timeout_seconds: 120.0,
954        };
955        let scanned = scan_single_file(
956            "package.json",
957            r#"{"name":"demo","version":"1.0.0"}"#,
958            &options,
959        );
960
961        assert!(
962            scanned.package_data.is_empty(),
963            "package_data: {:#?}",
964            scanned.package_data
965        );
966    }
967
968    #[test]
969    fn scanner_parses_package_manifests_when_enabled() {
970        let options = TextDetectionOptions {
971            collect_info: false,
972            detect_packages: true,
973            detect_application_packages: true,
974            detect_system_packages: false,
975            detect_packages_in_compiled: false,
976            detect_copyrights: false,
977            detect_generated: false,
978            detect_emails: false,
979            detect_urls: false,
980            max_emails: 50,
981            max_urls: 50,
982            timeout_seconds: 120.0,
983        };
984        let scanned = scan_single_file(
985            "package.json",
986            r#"{"name":"demo","version":"1.0.0"}"#,
987            &options,
988        );
989
990        assert_eq!(
991            scanned.package_data.len(),
992            1,
993            "package_data: {:#?}",
994            scanned.package_data
995        );
996    }
997
998    #[test]
999    fn scanner_skips_application_packages_when_only_system_packages_enabled() {
1000        let options = TextDetectionOptions {
1001            collect_info: false,
1002            detect_packages: true,
1003            detect_application_packages: false,
1004            detect_system_packages: true,
1005            detect_packages_in_compiled: false,
1006            detect_copyrights: false,
1007            detect_generated: false,
1008            detect_emails: false,
1009            detect_urls: false,
1010            max_emails: 50,
1011            max_urls: 50,
1012            timeout_seconds: 120.0,
1013        };
1014        let scanned = scan_single_file(
1015            "package.json",
1016            r#"{"name":"demo","version":"1.0.0"}"#,
1017            &options,
1018        );
1019
1020        assert!(
1021            scanned.package_data.is_empty(),
1022            "package_data: {:#?}",
1023            scanned.package_data
1024        );
1025    }
1026
1027    #[test]
1028    fn scanner_parses_system_package_files_when_enabled() {
1029        let options = TextDetectionOptions {
1030            collect_info: false,
1031            detect_packages: true,
1032            detect_application_packages: false,
1033            detect_system_packages: true,
1034            detect_packages_in_compiled: false,
1035            detect_copyrights: false,
1036            detect_generated: false,
1037            detect_emails: false,
1038            detect_urls: false,
1039            max_emails: 50,
1040            max_urls: 50,
1041            timeout_seconds: 120.0,
1042        };
1043        let scanned = scan_file_at_relative_path(
1044            "var/lib/dpkg/status",
1045            b"Package: demo\nVersion: 1.0\nArchitecture: all\nDescription: demo package\n\n",
1046            &options,
1047        );
1048
1049        assert!(
1050            !scanned.package_data.is_empty(),
1051            "package_data: {:#?}",
1052            scanned.package_data
1053        );
1054    }
1055
1056    #[test]
1057    fn scanner_only_parses_compiled_packages_when_package_in_compiled_is_enabled() {
1058        if std::process::Command::new("go")
1059            .arg("version")
1060            .status()
1061            .is_err()
1062        {
1063            return;
1064        }
1065
1066        let temp_dir = TempDir::new().expect("create temp dir");
1067        fs::write(
1068            temp_dir.path().join("go.mod"),
1069            "module example.com/demo\n\ngo 1.23.0\n",
1070        )
1071        .expect("write go.mod");
1072        fs::write(
1073            temp_dir.path().join("main.go"),
1074            "package main\nfunc main() {}\n",
1075        )
1076        .expect("write main.go");
1077        let file_path = temp_dir.path().join("demo");
1078        let status = std::process::Command::new("go")
1079            .current_dir(temp_dir.path())
1080            .args(["build", "-o"])
1081            .arg(&file_path)
1082            .status()
1083            .expect("run go build");
1084        assert!(status.success());
1085
1086        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1087        let collected = collect_paths(temp_dir.path(), 0, &[]);
1088
1089        let without_compiled = process_collected(
1090            &collected,
1091            Arc::clone(&progress),
1092            None,
1093            LicenseScanOptions::default(),
1094            &TextDetectionOptions {
1095                collect_info: false,
1096                detect_packages: true,
1097                detect_application_packages: true,
1098                detect_system_packages: false,
1099                detect_packages_in_compiled: false,
1100                detect_copyrights: false,
1101                detect_generated: false,
1102                detect_emails: false,
1103                detect_urls: false,
1104                max_emails: 50,
1105                max_urls: 50,
1106                timeout_seconds: 120.0,
1107            },
1108        );
1109        let with_compiled = process_collected(
1110            &collected,
1111            progress,
1112            None,
1113            LicenseScanOptions::default(),
1114            &TextDetectionOptions {
1115                collect_info: false,
1116                detect_packages: true,
1117                detect_application_packages: true,
1118                detect_system_packages: false,
1119                detect_packages_in_compiled: true,
1120                detect_copyrights: false,
1121                detect_generated: false,
1122                detect_emails: false,
1123                detect_urls: false,
1124                max_emails: 50,
1125                max_urls: 50,
1126                timeout_seconds: 120.0,
1127            },
1128        );
1129
1130        let without_compiled = without_compiled
1131            .files
1132            .into_iter()
1133            .find(|entry| entry.file_type == FileType::File && entry.path.ends_with("/demo"))
1134            .expect("compiled artifact present");
1135        let with_compiled = with_compiled
1136            .files
1137            .into_iter()
1138            .find(|entry| entry.file_type == FileType::File && entry.path.ends_with("/demo"))
1139            .expect("compiled artifact present");
1140
1141        assert!(
1142            without_compiled.package_data.is_empty(),
1143            "package_data: {:#?}",
1144            without_compiled.package_data
1145        );
1146        assert!(!with_compiled.package_data.is_empty());
1147    }
1148
1149    #[test]
1150    fn scanner_parses_windows_executable_packages_under_normal_package_scan() {
1151        let temp_dir = TempDir::new().expect("create temp dir");
1152        let file_path = temp_dir.path().join("libiconv2.dll");
1153        let fixture = fs::read("testdata/compiled-binary-golden/win_pe/libiconv2.dll")
1154            .expect("read PE fixture");
1155        fs::write(&file_path, fixture).expect("write PE fixture");
1156
1157        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1158        let collected = collect_paths(temp_dir.path(), 0, &[]);
1159
1160        let without_package = process_collected(
1161            &collected,
1162            Arc::clone(&progress),
1163            None,
1164            LicenseScanOptions::default(),
1165            &TextDetectionOptions {
1166                collect_info: false,
1167                detect_packages: false,
1168                detect_application_packages: false,
1169                detect_system_packages: false,
1170                detect_packages_in_compiled: false,
1171                detect_copyrights: false,
1172                detect_generated: false,
1173                detect_emails: false,
1174                detect_urls: false,
1175                max_emails: 50,
1176                max_urls: 50,
1177                timeout_seconds: 120.0,
1178            },
1179        );
1180        let with_package = process_collected(
1181            &collected,
1182            progress,
1183            None,
1184            LicenseScanOptions::default(),
1185            &TextDetectionOptions {
1186                collect_info: false,
1187                detect_packages: true,
1188                detect_application_packages: true,
1189                detect_system_packages: false,
1190                detect_packages_in_compiled: false,
1191                detect_copyrights: false,
1192                detect_generated: false,
1193                detect_emails: false,
1194                detect_urls: false,
1195                max_emails: 50,
1196                max_urls: 50,
1197                timeout_seconds: 120.0,
1198            },
1199        );
1200
1201        let without_package = without_package
1202            .files
1203            .into_iter()
1204            .find(|entry| {
1205                entry.file_type == FileType::File && entry.path.ends_with("/libiconv2.dll")
1206            })
1207            .expect("compiled artifact present");
1208        let with_package = with_package
1209            .files
1210            .into_iter()
1211            .find(|entry| {
1212                entry.file_type == FileType::File && entry.path.ends_with("/libiconv2.dll")
1213            })
1214            .expect("compiled artifact present");
1215
1216        assert!(without_package.package_data.is_empty());
1217        assert_eq!(with_package.package_data.len(), 1);
1218        assert_eq!(
1219            with_package.package_data[0].package_type,
1220            Some(FilePackageType::Winexe)
1221        );
1222        assert_eq!(
1223            with_package.package_data[0].datasource_id,
1224            Some(DatasourceId::WindowsExecutable)
1225        );
1226    }
1227
1228    #[test]
1229    fn scanner_detects_license_from_font_metadata() {
1230        let temp_dir = TempDir::new().expect("create temp dir");
1231        let file_path = temp_dir.path().join("Lato-Bold.ttf");
1232        let fixture = fs::read("testdata/font-fixtures/Lato-Bold.ttf").expect("read font fixture");
1233        fs::write(&file_path, fixture).expect("write font fixture");
1234
1235        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1236        let collected = collect_paths(temp_dir.path(), 0, &[]);
1237        let engine =
1238            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
1239        let result = process_collected(
1240            &collected,
1241            progress,
1242            Some(engine),
1243            LicenseScanOptions::default(),
1244            &TextDetectionOptions::default(),
1245        );
1246        let scanned = result
1247            .files
1248            .into_iter()
1249            .find(|entry| {
1250                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1251            })
1252            .expect("scanned file entry");
1253
1254        assert!(
1255            scanned.license_expression.is_some(),
1256            "license detections: {:#?}",
1257            scanned.license_detections
1258        );
1259        assert!(
1260            scanned
1261                .license_expression
1262                .as_deref()
1263                .is_some_and(
1264                    |expression| expression.contains("OFL-1.1") || expression.contains("ofl-1.1")
1265                ),
1266            "license expression: {:?}",
1267            scanned.license_expression
1268        );
1269    }
1270
1271    #[test]
1272    fn scanner_detects_license_from_windows_executable_metadata() {
1273        let temp_dir = TempDir::new().expect("create temp dir");
1274        let file_path = temp_dir.path().join("libiconv2.dll");
1275        let fixture = fs::read("testdata/compiled-binary-golden/win_pe/libiconv2.dll")
1276            .expect("read PE fixture");
1277        fs::write(&file_path, fixture).expect("write PE fixture");
1278
1279        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1280        let collected = collect_paths(temp_dir.path(), 0, &[]);
1281        let engine =
1282            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
1283        let result = process_collected(
1284            &collected,
1285            progress,
1286            Some(engine),
1287            LicenseScanOptions::default(),
1288            &TextDetectionOptions::default(),
1289        );
1290        let scanned = result
1291            .files
1292            .into_iter()
1293            .find(|entry| {
1294                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1295            })
1296            .expect("scanned file entry");
1297
1298        assert!(
1299            scanned.license_expression.is_some(),
1300            "license detections: {:#?}",
1301            scanned.license_detections
1302        );
1303        assert!(
1304            scanned
1305                .license_expression
1306                .as_deref()
1307                .is_some_and(|expression| {
1308                    expression.contains("lgpl") || expression.contains("LGPL")
1309                }),
1310            "license expression: {:?}",
1311            scanned.license_expression
1312        );
1313    }
1314
1315    #[test]
1316    fn scanner_detects_cc_by_license_from_markdown_comment_banner() {
1317        let scanned = scan_single_file_with_license_engine(
1318            "navbar.md",
1319            "<!-- Documentation licensed under CC BY 4.0 -->\n<!-- License available at https://creativecommons.org/licenses/by/4.0/ -->\n",
1320            &TextDetectionOptions::default(),
1321        );
1322
1323        assert!(
1324            scanned
1325                .license_expression
1326                .as_deref()
1327                .is_some_and(|expression| {
1328                    expression.contains("cc-by-4.0") || expression.contains("CC-BY-4.0")
1329                }),
1330            "license expression: {:?}",
1331            scanned.license_expression
1332        );
1333    }
1334
1335    #[test]
1336    fn scanner_detects_mit_license_from_shields_badge_markdown() {
1337        let scanned = scan_single_file_with_license_engine(
1338            "README.md",
1339            "[![](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)\n",
1340            &TextDetectionOptions::default(),
1341        );
1342
1343        assert!(
1344            scanned
1345                .license_expression
1346                .as_deref()
1347                .is_some_and(|expression| {
1348                    expression.contains("mit") || expression.contains("MIT")
1349                }),
1350            "license expression: {:?}",
1351            scanned.license_expression
1352        );
1353    }
1354
1355    #[test]
1356    fn scanner_detects_apache_license_from_markdown_readme_phrase() {
1357        let scanned = scan_single_file_with_license_engine(
1358            "README.md",
1359            "This crate is distributed under the terms of the Apache License (Version 2.0).\n",
1360            &TextDetectionOptions::default(),
1361        );
1362
1363        assert!(
1364            scanned
1365                .license_expression
1366                .as_deref()
1367                .is_some_and(|expression| {
1368                    expression.contains("apache-2.0") || expression.contains("Apache-2.0")
1369                }),
1370            "license expression: {:?}",
1371            scanned.license_expression
1372        );
1373    }
1374
1375    #[test]
1376    fn scanner_sets_is_source_only_when_info_enabled() {
1377        let without_info = TextDetectionOptions {
1378            collect_info: false,
1379            detect_packages: false,
1380            detect_application_packages: false,
1381            detect_system_packages: false,
1382            detect_packages_in_compiled: false,
1383            detect_copyrights: false,
1384            detect_generated: false,
1385            detect_emails: false,
1386            detect_urls: false,
1387            max_emails: 50,
1388            max_urls: 50,
1389            timeout_seconds: 120.0,
1390        };
1391        let with_info = TextDetectionOptions {
1392            collect_info: true,
1393            ..without_info.clone()
1394        };
1395
1396        let scanned_without_info = scan_single_file("main.rs", "fn main() {}\n", &without_info);
1397        let scanned_with_info = scan_single_file("main.rs", "fn main() {}\n", &with_info);
1398
1399        assert_eq!(scanned_without_info.is_source, None);
1400        assert_eq!(scanned_with_info.is_source, Some(true));
1401    }
1402
1403    #[test]
1404    fn directory_omits_info_fields_when_info_disabled() {
1405        let temp_dir = TempDir::new().expect("create temp dir");
1406        fs::create_dir_all(temp_dir.path().join("nested")).expect("create nested dir");
1407
1408        let collected = collect_paths(temp_dir.path(), 0, &[]);
1409        let result = process_collected(
1410            &collected,
1411            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1412            None,
1413            LicenseScanOptions::default(),
1414            &TextDetectionOptions {
1415                collect_info: false,
1416                detect_packages: false,
1417                detect_application_packages: false,
1418                detect_system_packages: false,
1419                detect_packages_in_compiled: false,
1420                detect_copyrights: false,
1421                detect_generated: false,
1422                detect_emails: false,
1423                detect_urls: false,
1424                max_emails: 50,
1425                max_urls: 50,
1426                timeout_seconds: 120.0,
1427            },
1428        );
1429
1430        let directory = result
1431            .files
1432            .into_iter()
1433            .find(|entry| entry.file_type == FileType::Directory && entry.path.ends_with("nested"))
1434            .expect("directory entry");
1435
1436        assert!(directory.date.is_none());
1437        assert!(directory.file_type_label.is_none());
1438        assert!(directory.is_binary.is_none());
1439        assert!(directory.is_text.is_none());
1440        assert!(directory.is_archive.is_none());
1441        assert!(directory.is_media.is_none());
1442        assert!(directory.is_source.is_none());
1443        assert!(directory.is_script.is_none());
1444    }
1445
1446    #[test]
1447    fn directory_includes_info_fields_when_info_enabled() {
1448        let temp_dir = TempDir::new().expect("create temp dir");
1449        fs::create_dir_all(temp_dir.path().join("nested")).expect("create nested dir");
1450
1451        let collected = collect_paths(temp_dir.path(), 0, &[]);
1452        let result = process_collected(
1453            &collected,
1454            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1455            None,
1456            LicenseScanOptions::default(),
1457            &TextDetectionOptions {
1458                collect_info: true,
1459                detect_packages: false,
1460                detect_application_packages: false,
1461                detect_system_packages: false,
1462                detect_packages_in_compiled: false,
1463                detect_copyrights: false,
1464                detect_generated: false,
1465                detect_emails: false,
1466                detect_urls: false,
1467                max_emails: 50,
1468                max_urls: 50,
1469                timeout_seconds: 120.0,
1470            },
1471        );
1472
1473        let directory = result
1474            .files
1475            .into_iter()
1476            .find(|entry| entry.file_type == FileType::Directory && entry.path.ends_with("nested"))
1477            .expect("directory entry");
1478
1479        assert!(directory.date.is_none());
1480        assert!(directory.file_type_label.is_none());
1481        assert_eq!(directory.is_binary, Some(false));
1482        assert_eq!(directory.is_text, Some(false));
1483        assert_eq!(directory.is_archive, Some(false));
1484        assert_eq!(directory.is_media, Some(false));
1485        assert_eq!(directory.is_source, Some(false));
1486        assert_eq!(directory.is_script, Some(false));
1487        assert_eq!(directory.files_count, Some(0));
1488        assert_eq!(directory.dirs_count, Some(0));
1489        assert_eq!(directory.size_count, Some(0));
1490    }
1491
1492    #[test]
1493    fn collect_paths_includes_root_directory_entry() {
1494        let temp_dir = TempDir::new().expect("create temp dir");
1495        fs::create_dir_all(temp_dir.path().join("src")).expect("create nested dir");
1496        fs::write(temp_dir.path().join("src").join("main.rs"), "fn main() {}")
1497            .expect("write nested file");
1498
1499        let collected = collect_paths(temp_dir.path(), 0, &[]);
1500
1501        assert!(
1502            collected
1503                .directories
1504                .iter()
1505                .any(|(path, _)| path == temp_dir.path())
1506        );
1507    }
1508
1509    #[test]
1510    fn collect_paths_supports_single_file_input() {
1511        let temp_dir = TempDir::new().expect("create temp dir");
1512        let file_path = temp_dir.path().join("main.rs");
1513        fs::write(&file_path, "fn main() {}\n").expect("write file");
1514
1515        let collected = collect_paths(&file_path, 0, &[]);
1516
1517        assert_eq!(collected.files.len(), 1);
1518        assert!(collected.directories.is_empty());
1519        assert_eq!(collected.files[0].0, file_path);
1520    }
1521
1522    #[test]
1523    fn process_collected_with_memory_limit_preserves_results_when_spilling() {
1524        let temp_dir = TempDir::new().expect("create temp dir");
1525        fs::write(temp_dir.path().join("a.txt"), "hello").expect("write first file");
1526        fs::write(temp_dir.path().join("b.txt"), "world").expect("write second file");
1527
1528        let collected = collect_paths(temp_dir.path(), 0, &[]);
1529        let result = process_collected_with_memory_limit(
1530            &collected,
1531            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1532            None,
1533            LicenseScanOptions::default(),
1534            &TextDetectionOptions {
1535                collect_info: false,
1536                detect_packages: false,
1537                detect_application_packages: false,
1538                detect_system_packages: false,
1539                detect_packages_in_compiled: false,
1540                detect_copyrights: false,
1541                detect_generated: false,
1542                detect_emails: false,
1543                detect_urls: false,
1544                max_emails: 50,
1545                max_urls: 50,
1546                timeout_seconds: 120.0,
1547            },
1548            1,
1549        );
1550
1551        assert_eq!(result.files.len(), 3);
1552    }
1553
1554    #[test]
1555    fn process_collected_with_negative_one_uses_disk_only_mode() {
1556        let temp_dir = TempDir::new().expect("create temp dir");
1557        fs::write(temp_dir.path().join("a.txt"), "hello").expect("write first file");
1558
1559        let collected = collect_paths(temp_dir.path(), 0, &[]);
1560        let result = process_collected_with_memory_limit(
1561            &collected,
1562            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1563            None,
1564            LicenseScanOptions::default(),
1565            &TextDetectionOptions {
1566                collect_info: false,
1567                detect_packages: false,
1568                detect_application_packages: false,
1569                detect_system_packages: false,
1570                detect_packages_in_compiled: false,
1571                detect_copyrights: false,
1572                detect_generated: false,
1573                detect_emails: false,
1574                detect_urls: false,
1575                max_emails: 50,
1576                max_urls: 50,
1577                timeout_seconds: 120.0,
1578            },
1579            -1,
1580        );
1581
1582        assert_eq!(result.files.len(), 2);
1583    }
1584}