Skip to main content

provenant/scanner/
mod.rs

1mod collect;
2mod process;
3
4use crate::license_detection::LicenseDetectionEngine;
5use crate::models::FileInfo;
6
7pub struct ProcessResult {
8    pub files: Vec<FileInfo>,
9    pub excluded_count: usize,
10}
11
/// Flags and threshold that shape license-scan output.
///
/// The derived `Default` leaves every flag `false` and `min_score` at `0`.
#[derive(Clone, Copy, Debug, Default)]
pub struct LicenseScanOptions {
    /// NOTE(review): presumably includes matched license text in results — confirm.
    pub include_text: bool,
    /// NOTE(review): presumably adds diagnostics about the matched text — confirm.
    pub include_text_diagnostics: bool,
    /// NOTE(review): presumably adds match-level diagnostics — confirm.
    pub include_diagnostics: bool,
    /// NOTE(review): presumably enables detection of unknown licenses — confirm.
    pub unknown_licenses: bool,
    /// NOTE(review): presumably the minimum score a match needs to be kept — confirm.
    pub min_score: u8,
}
20
/// Switches and limits for the per-file text detectors.
#[derive(Clone, Debug)]
pub struct TextDetectionOptions {
    /// When set, scanned entries carry checksums, MIME type, date,
    /// programming language and the `is_*` classification flags; when unset
    /// the tests in this module expect those fields to stay `None`.
    pub collect_info: bool,
    /// NOTE(review): presumably the master switch for package detection — confirm.
    pub detect_packages: bool,
    /// NOTE(review): presumably limits/extends detection to application packages — confirm.
    pub detect_application_packages: bool,
    /// NOTE(review): presumably limits/extends detection to system packages — confirm.
    pub detect_system_packages: bool,
    /// NOTE(review): presumably looks for packages inside compiled artifacts — confirm.
    pub detect_packages_in_compiled: bool,
    /// Enables copyright detection; the tests here show it also yields
    /// holders and authors.
    pub detect_copyrights: bool,
    /// When set, `is_generated` is populated on scanned entries; when unset
    /// it stays `None`.
    pub detect_generated: bool,
    /// Enables email detection.
    pub detect_emails: bool,
    /// Enables URL detection.
    pub detect_urls: bool,
    /// NOTE(review): presumably a per-file cap on reported emails — confirm.
    pub max_emails: usize,
    /// NOTE(review): presumably a per-file cap on reported URLs — confirm.
    pub max_urls: usize,
    /// NOTE(review): presumably a per-file processing timeout in seconds — confirm.
    pub timeout_seconds: f64,
}

impl Default for TextDetectionOptions {
    /// Only copyright detection is switched on; the email/URL limits are 50
    /// each and the timeout is 120 seconds.
    fn default() -> Self {
        Self {
            // File info and package detection: all off.
            collect_info: false,
            detect_packages: false,
            detect_application_packages: false,
            detect_system_packages: false,
            detect_packages_in_compiled: false,

            // Text detectors: copyright is the only one enabled.
            detect_copyrights: true,
            detect_generated: false,
            detect_emails: false,
            detect_urls: false,

            // Limits.
            max_emails: 50,
            max_urls: 50,
            timeout_seconds: 120.0,
        }
    }
}
55
56pub fn scan_options_fingerprint(
57    text_options: &TextDetectionOptions,
58    license_options: LicenseScanOptions,
59    license_engine: Option<&LicenseDetectionEngine>,
60) -> String {
61    let (license_enabled, rules_count, first_rule_id, last_rule_id) = match license_engine {
62        Some(engine) => {
63            let rules = &engine.index().rules_by_rid;
64            (
65                true,
66                rules.len(),
67                rules
68                    .first()
69                    .map(|rule| rule.identifier.as_str())
70                    .unwrap_or(""),
71                rules
72                    .last()
73                    .map(|rule| rule.identifier.as_str())
74                    .unwrap_or(""),
75            )
76        }
77        None => (false, 0, "", ""),
78    };
79
80    format!(
81        "tool_version={};info={};packages={};app_packages={};system_packages={};compiled_packages={};copyrights={};generated={};emails={};urls={};max_emails={};max_urls={};timeout={:.6};license_enabled={};rules_count={};first_rule_id={};last_rule_id={};license_text={};license_text_diagnostics={};license_diagnostics={};unknown_licenses={};license_score={}",
82        env!("CARGO_PKG_VERSION"),
83        text_options.collect_info,
84        text_options.detect_packages,
85        text_options.detect_application_packages,
86        text_options.detect_system_packages,
87        text_options.detect_packages_in_compiled,
88        text_options.detect_copyrights,
89        text_options.detect_generated,
90        text_options.detect_emails,
91        text_options.detect_urls,
92        text_options.max_emails,
93        text_options.max_urls,
94        text_options.timeout_seconds,
95        license_enabled,
96        rules_count,
97        first_rule_id,
98        last_rule_id,
99        license_options.include_text,
100        license_options.include_text_diagnostics,
101        license_options.include_diagnostics,
102        license_options.unknown_licenses,
103        license_options.min_score,
104    )
105}
106
107pub use self::collect::{CollectedPaths, collect_paths};
108#[allow(unused_imports)]
109pub use self::process::{process_collected, process_collected_with_memory_limit};
110
111#[cfg(test)]
112mod tests {
113    use std::fs;
114    use std::sync::Arc;
115
116    use tempfile::TempDir;
117
118    use crate::license_detection::LicenseDetectionEngine;
119    use crate::models::FileType;
120    use crate::progress::{ProgressMode, ScanProgress};
121
122    use super::{
123        LicenseScanOptions, TextDetectionOptions, collect_paths, process_collected,
124        process_collected_with_memory_limit,
125    };
126
127    #[test]
128    fn default_options_keep_copyright_detection_enabled() {
129        let options = TextDetectionOptions::default();
130        assert!(!options.detect_packages);
131        assert!(options.detect_copyrights);
132    }
133
134    fn scan_single_file(
135        file_name: &str,
136        content: &str,
137        options: &TextDetectionOptions,
138    ) -> crate::models::FileInfo {
139        let temp_dir = TempDir::new().expect("create temp dir");
140        let file_path = temp_dir.path().join(file_name);
141        fs::write(&file_path, content).expect("write test file");
142
143        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
144        let collected = collect_paths(temp_dir.path(), 0, &[]);
145        let result = process_collected(
146            &collected,
147            progress,
148            None,
149            LicenseScanOptions::default(),
150            options,
151        );
152
153        result
154            .files
155            .into_iter()
156            .find(|entry| {
157                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
158            })
159            .expect("scanned file entry")
160    }
161
162    fn scan_file_at_relative_path(
163        relative_path: &str,
164        content: &[u8],
165        options: &TextDetectionOptions,
166    ) -> crate::models::FileInfo {
167        let temp_dir = TempDir::new().expect("create temp dir");
168        let file_path = temp_dir.path().join(relative_path);
169        if let Some(parent) = file_path.parent() {
170            fs::create_dir_all(parent).expect("create parent dirs");
171        }
172        fs::write(&file_path, content).expect("write test file");
173
174        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
175        let collected = collect_paths(temp_dir.path(), 0, &[]);
176        let result = process_collected(
177            &collected,
178            progress,
179            None,
180            LicenseScanOptions::default(),
181            options,
182        );
183
184        result
185            .files
186            .into_iter()
187            .find(|entry| {
188                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
189            })
190            .expect("scanned file entry")
191    }
192
193    #[test]
194    fn scanner_reports_repeated_email_occurrences() {
195        let options = TextDetectionOptions {
196            collect_info: false,
197            detect_packages: false,
198            detect_application_packages: false,
199            detect_system_packages: false,
200            detect_packages_in_compiled: false,
201            detect_copyrights: false,
202            detect_generated: false,
203            detect_emails: true,
204            detect_urls: false,
205            max_emails: 50,
206            max_urls: 50,
207            timeout_seconds: 120.0,
208        };
209        let scanned = scan_single_file(
210            "contacts.txt",
211            "linux@3ware.com\nlinux@3ware.com\nandre@suse.com\nlinux@3ware.com\n",
212            &options,
213        );
214
215        let emails: Vec<(&str, usize)> = scanned
216            .emails
217            .iter()
218            .map(|email| (email.email.as_str(), email.start_line))
219            .collect();
220
221        assert_eq!(emails.len(), 4, "emails: {emails:#?}");
222        assert_eq!(
223            emails,
224            vec![
225                ("linux@3ware.com", 1),
226                ("linux@3ware.com", 2),
227                ("andre@suse.com", 3),
228                ("linux@3ware.com", 4),
229            ]
230        );
231    }
232
233    #[test]
234    fn scanner_skips_pem_certificate_text_detection() {
235        let options = TextDetectionOptions {
236            collect_info: false,
237            detect_packages: false,
238            detect_application_packages: false,
239            detect_system_packages: false,
240            detect_packages_in_compiled: false,
241            detect_copyrights: true,
242            detect_generated: false,
243            detect_emails: true,
244            detect_urls: true,
245            max_emails: 50,
246            max_urls: 50,
247            timeout_seconds: 120.0,
248        };
249        let pem_fixture = concat!(
250            "-----BEGIN CERTIFICATE-----\n",
251            "MIID8TCCAtmgAwIBAgIQQT1yx/RrH4FDffHSKFTfmjANBgkqhkiG9w0BAQUFADCB\n",
252            "ijELMAkGA1UEBhMCQ0gxEDAOBgNVBAoTB1dJU2VLZXkxGzAZBgNVBAsTEkNvcHly\n",
253            "-----END CERTIFICATE-----\n",
254            "Certificate:\n",
255            "    Data:\n",
256            "        Signature Algorithm: sha1WithRSAEncryption\n",
257            "        Issuer: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
258            "        Subject: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
259            "        Contact: cert-owner@example.com\n",
260        );
261        let scanned = scan_single_file("cert.pem", pem_fixture, &options);
262
263        assert!(
264            scanned.copyrights.is_empty(),
265            "copyrights: {:#?}",
266            scanned.copyrights
267        );
268        assert!(
269            scanned.holders.is_empty(),
270            "holders: {:#?}",
271            scanned.holders
272        );
273        assert!(
274            scanned.authors.is_empty(),
275            "authors: {:#?}",
276            scanned.authors
277        );
278        assert!(scanned.emails.is_empty(), "emails: {:#?}", scanned.emails);
279        assert!(scanned.urls.is_empty(), "urls: {:#?}", scanned.urls);
280        assert!(
281            scanned.license_detections.is_empty(),
282            "licenses: {:#?}",
283            scanned.license_detections
284        );
285        assert!(
286            scanned.license_clues.is_empty(),
287            "license clues: {:#?}",
288            scanned.license_clues
289        );
290    }
291
292    #[test]
293    fn scanner_keeps_source_headers_when_pem_blocks_are_embedded() {
294        let options = TextDetectionOptions {
295            collect_info: false,
296            detect_packages: false,
297            detect_application_packages: false,
298            detect_system_packages: false,
299            detect_packages_in_compiled: false,
300            detect_copyrights: true,
301            detect_generated: false,
302            detect_emails: false,
303            detect_urls: true,
304            max_emails: 50,
305            max_urls: 50,
306            timeout_seconds: 120.0,
307        };
308        let fixture = concat!(
309            "/*\n",
310            "Copyright 2022 The Kubernetes Authors.\n\n",
311            "Licensed under the Apache License, Version 2.0 (the \"License\");\n",
312            "you may not use this file except in compliance with the License.\n",
313            "You may obtain a copy of the License at\n\n",
314            "    http://www.apache.org/licenses/LICENSE-2.0\n",
315            "*/\n\n",
316            "package storage\n\n",
317            "const validCert = `\n",
318            "-----BEGIN CERTIFICATE-----\n",
319            "MIIDmTCCAoGgAwIBAgIUWQ==\n",
320            "-----END CERTIFICATE-----\n",
321            "`\n",
322        );
323        let temp_dir = TempDir::new().expect("create temp dir");
324        let file_path = temp_dir.path().join("storage_test.go");
325        fs::write(&file_path, fixture).expect("write fixture");
326
327        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
328        let collected = collect_paths(temp_dir.path(), 0, &[]);
329        let engine =
330            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
331        let result = process_collected(
332            &collected,
333            progress,
334            Some(engine),
335            LicenseScanOptions::default(),
336            &options,
337        );
338        let scanned = result
339            .files
340            .into_iter()
341            .find(|entry| {
342                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
343            })
344            .expect("scanned file entry");
345
346        assert!(
347            scanned
348                .copyrights
349                .iter()
350                .any(|c| c.copyright == "Copyright 2022 The Kubernetes Authors"),
351            "copyrights: {:#?}",
352            scanned.copyrights
353        );
354        assert!(
355            scanned
356                .holders
357                .iter()
358                .any(|h| h.holder == "The Kubernetes Authors"),
359            "holders: {:#?}",
360            scanned.holders
361        );
362        assert!(
363            scanned
364                .urls
365                .iter()
366                .any(|u| u.url == "http://www.apache.org/licenses/LICENSE-2.0"),
367            "urls: {:#?}",
368            scanned.urls
369        );
370        assert_eq!(scanned.license_expression.as_deref(), Some("Apache-2.0"));
371    }
372
373    #[test]
374    fn scanner_detects_structured_credits_authors() {
375        let options = TextDetectionOptions {
376            collect_info: false,
377            detect_packages: false,
378            detect_application_packages: false,
379            detect_system_packages: false,
380            detect_packages_in_compiled: false,
381            detect_copyrights: true,
382            detect_generated: false,
383            detect_emails: false,
384            detect_urls: false,
385            max_emails: 50,
386            max_urls: 50,
387            timeout_seconds: 120.0,
388        };
389        let credits_fixture = concat!(
390            "N: Jack Lloyd\n",
391            "E: lloyd@randombit.net\n",
392            "W: http://www.randombit.net/\n",
393        );
394        let scanned = scan_single_file("CREDITS", credits_fixture, &options);
395
396        let authors: Vec<(&str, usize, usize)> = scanned
397            .authors
398            .iter()
399            .map(|author| (author.author.as_str(), author.start_line, author.end_line))
400            .collect();
401
402        assert_eq!(
403            authors,
404            vec![(
405                "Jack Lloyd lloyd@randombit.net http://www.randombit.net/",
406                1,
407                3,
408            )]
409        );
410        assert!(scanned.copyrights.is_empty());
411        assert!(scanned.holders.is_empty());
412    }
413
414    #[test]
415    fn scanner_uses_or_for_alternative_license_header() {
416        let fixture =
417            include_str!("../../testdata/license-golden/datadriven/external/boost-json-d2s.ipp");
418        let temp_dir = TempDir::new().expect("create temp dir");
419        let file_path = temp_dir.path().join("d2s.ipp");
420        fs::write(&file_path, fixture).expect("write fixture");
421
422        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
423        let collected = collect_paths(temp_dir.path(), 0, &[]);
424        let engine =
425            Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
426        let result = process_collected(
427            &collected,
428            progress,
429            Some(engine),
430            LicenseScanOptions::default(),
431            &TextDetectionOptions::default(),
432        );
433        let scanned = result
434            .files
435            .into_iter()
436            .find(|entry| {
437                entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
438            })
439            .expect("scanned file entry");
440
441        assert_eq!(
442            scanned.license_expression.as_deref(),
443            Some("Apache-2.0 OR BSL-1.0")
444        );
445        assert!(
446            scanned.license_clues.is_empty(),
447            "license clues: {:#?}",
448            scanned.license_clues
449        );
450        assert_eq!(
451            scanned.license_detections.len(),
452            1,
453            "detections: {:#?}",
454            scanned.license_detections
455        );
456
457        let detection = &scanned.license_detections[0];
458        assert_eq!(detection.license_expression_spdx, "Apache-2.0 OR BSL-1.0");
459
460        let match_expressions: Vec<_> = detection
461            .matches
462            .iter()
463            .map(|m| m.license_expression_spdx.as_str())
464            .collect();
465        assert_eq!(match_expressions, vec!["Apache-2.0", "BSL-1.0"]);
466    }
467
468    #[test]
469    fn scanner_sets_generated_flag_when_enabled() {
470        let options = TextDetectionOptions {
471            collect_info: false,
472            detect_packages: false,
473            detect_application_packages: false,
474            detect_system_packages: false,
475            detect_packages_in_compiled: false,
476            detect_copyrights: false,
477            detect_generated: true,
478            detect_emails: false,
479            detect_urls: false,
480            max_emails: 50,
481            max_urls: 50,
482            timeout_seconds: 120.0,
483        };
484        let scanned = scan_single_file(
485            "generated.c",
486            "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
487            &options,
488        );
489
490        assert_eq!(scanned.is_generated, Some(true));
491    }
492
493    #[test]
494    fn scanner_leaves_generated_flag_unset_when_disabled() {
495        let options = TextDetectionOptions {
496            collect_info: false,
497            detect_packages: false,
498            detect_application_packages: false,
499            detect_system_packages: false,
500            detect_packages_in_compiled: false,
501            detect_copyrights: false,
502            detect_generated: false,
503            detect_emails: false,
504            detect_urls: false,
505            max_emails: 50,
506            max_urls: 50,
507            timeout_seconds: 120.0,
508        };
509        let scanned = scan_single_file(
510            "generated.c",
511            "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
512            &options,
513        );
514
515        assert_eq!(scanned.is_generated, None);
516    }
517
518    #[test]
519    fn scanner_populates_info_surface_when_enabled() {
520        let options = TextDetectionOptions {
521            collect_info: true,
522            detect_packages: false,
523            detect_application_packages: false,
524            detect_system_packages: false,
525            detect_packages_in_compiled: false,
526            detect_copyrights: false,
527            detect_generated: false,
528            detect_emails: false,
529            detect_urls: false,
530            max_emails: 50,
531            max_urls: 50,
532            timeout_seconds: 120.0,
533        };
534        let scanned = scan_single_file(
535            "script.py",
536            "#!/usr/bin/env python3\nprint(\"hello\")\n",
537            &options,
538        );
539
540        assert!(scanned.sha1.is_some());
541        assert!(scanned.md5.is_some());
542        assert!(scanned.sha256.is_some());
543        assert!(scanned.sha1_git.is_some());
544        assert!(scanned.mime_type.is_some());
545        assert!(scanned.date.is_some());
546        assert_eq!(scanned.programming_language.as_deref(), Some("Python"));
547        assert_eq!(scanned.is_text, Some(true));
548        assert_eq!(scanned.is_script, Some(true));
549        assert_eq!(scanned.is_source, Some(true));
550    }
551
552    #[test]
553    fn scanner_treats_latin1_python_sources_as_textual_scripts() {
554        let options = TextDetectionOptions {
555            collect_info: true,
556            detect_packages: false,
557            detect_application_packages: false,
558            detect_system_packages: false,
559            detect_packages_in_compiled: false,
560            detect_copyrights: false,
561            detect_generated: false,
562            detect_emails: false,
563            detect_urls: false,
564            max_emails: 50,
565            max_urls: 50,
566            timeout_seconds: 120.0,
567        };
568        let latin1_python = b"# coding: latin-1\nprint(\"caf\xe9\")\n# comment padding\n";
569        let scanned = scan_file_at_relative_path("script.py", latin1_python, &options);
570
571        assert_eq!(scanned.programming_language.as_deref(), Some("Python"));
572        assert_eq!(
573            scanned.file_type_label.as_deref(),
574            Some("python script, text executable")
575        );
576        assert_eq!(scanned.is_binary, Some(false));
577        assert_eq!(scanned.is_text, Some(true));
578        assert_eq!(scanned.is_script, Some(true));
579        assert_eq!(scanned.is_source, Some(true));
580    }
581
582    #[test]
583    fn scanner_skips_findings_for_zip_like_archives() {
584        let options = TextDetectionOptions {
585            collect_info: true,
586            detect_packages: false,
587            detect_application_packages: false,
588            detect_system_packages: false,
589            detect_packages_in_compiled: false,
590            detect_copyrights: true,
591            detect_generated: false,
592            detect_emails: true,
593            detect_urls: true,
594            max_emails: 50,
595            max_urls: 50,
596            timeout_seconds: 120.0,
597        };
598        let archive_like = b"PK\x03\x04\x14\x00\x00\x00\x08\x00MIT License\ncontact@example.com\nhttps://example.com\n";
599        let scanned = scan_file_at_relative_path("demo.whl", archive_like, &options);
600
601        assert_eq!(scanned.mime_type.as_deref(), Some("application/zip"));
602        assert_eq!(scanned.is_archive, Some(true));
603        assert!(scanned.license_detections.is_empty());
604        assert!(scanned.copyrights.is_empty());
605        assert!(scanned.emails.is_empty());
606        assert!(scanned.urls.is_empty());
607    }
608
609    #[test]
610    fn scanner_treats_typescript_sources_as_text_not_video_media() {
611        let options = TextDetectionOptions {
612            collect_info: true,
613            detect_packages: false,
614            detect_application_packages: false,
615            detect_system_packages: false,
616            detect_packages_in_compiled: false,
617            detect_copyrights: false,
618            detect_generated: false,
619            detect_emails: false,
620            detect_urls: false,
621            max_emails: 50,
622            max_urls: 50,
623            timeout_seconds: 120.0,
624        };
625        let scanned = scan_single_file("main.ts", "export const answer: number = 42;\n", &options);
626
627        assert_eq!(scanned.programming_language.as_deref(), Some("TypeScript"));
628        assert_eq!(scanned.mime_type.as_deref(), Some("text/plain"));
629        assert_eq!(
630            scanned.file_type_label.as_deref(),
631            Some("UTF-8 Unicode text")
632        );
633        assert_eq!(scanned.is_text, Some(true));
634        assert_eq!(scanned.is_media, Some(false));
635        assert_eq!(scanned.is_script, Some(false));
636        assert_eq!(scanned.is_source, Some(true));
637    }
638
639    #[test]
640    fn scanner_normalizes_sparse_ts_files_away_from_video_mime() {
641        let options = TextDetectionOptions {
642            collect_info: true,
643            detect_packages: false,
644            detect_application_packages: false,
645            detect_system_packages: false,
646            detect_packages_in_compiled: false,
647            detect_copyrights: false,
648            detect_generated: false,
649            detect_emails: false,
650            detect_urls: false,
651            max_emails: 50,
652            max_urls: 50,
653            timeout_seconds: 120.0,
654        };
655        let scanned = scan_single_file("main.ts", "// comment-only TypeScript fixture\n", &options);
656
657        assert_eq!(scanned.mime_type.as_deref(), Some("text/plain"));
658        assert_eq!(
659            scanned.file_type_label.as_deref(),
660            Some("UTF-8 Unicode text")
661        );
662        assert_eq!(scanned.is_text, Some(true));
663        assert_eq!(scanned.is_media, Some(false));
664        assert_eq!(scanned.is_script, Some(false));
665        assert_eq!(scanned.is_source, Some(true));
666    }
667
668    #[test]
669    fn scanner_treats_empty_files_like_scancode_info_surface() {
670        let options = TextDetectionOptions {
671            collect_info: true,
672            detect_packages: false,
673            detect_application_packages: false,
674            detect_system_packages: false,
675            detect_packages_in_compiled: false,
676            detect_copyrights: false,
677            detect_generated: false,
678            detect_emails: false,
679            detect_urls: false,
680            max_emails: 50,
681            max_urls: 50,
682            timeout_seconds: 120.0,
683        };
684        let scanned = scan_single_file("test.txt", "", &options);
685
686        assert_eq!(scanned.mime_type.as_deref(), Some("inode/x-empty"));
687        assert_eq!(scanned.file_type_label.as_deref(), Some("empty"));
688        assert_eq!(scanned.programming_language, None);
689        assert_eq!(scanned.is_binary, Some(false));
690        assert_eq!(scanned.is_text, Some(true));
691        assert_eq!(scanned.is_archive, Some(false));
692        assert_eq!(scanned.is_media, Some(false));
693        assert_eq!(scanned.is_source, Some(false));
694        assert_eq!(scanned.is_script, Some(false));
695    }
696
697    #[test]
698    fn scanner_treats_package_json_as_text_not_source() {
699        let options = TextDetectionOptions {
700            collect_info: true,
701            detect_packages: false,
702            detect_application_packages: false,
703            detect_system_packages: false,
704            detect_packages_in_compiled: false,
705            detect_copyrights: false,
706            detect_generated: false,
707            detect_emails: false,
708            detect_urls: false,
709            max_emails: 50,
710            max_urls: 50,
711            timeout_seconds: 120.0,
712        };
713        let scanned = scan_single_file("package.json", r#"{"name":"demo"}"#, &options);
714
715        assert_eq!(scanned.mime_type.as_deref(), Some("application/json"));
716        assert_eq!(scanned.file_type_label.as_deref(), Some("JSON text data"));
717        assert_eq!(scanned.programming_language, None);
718        assert_eq!(scanned.is_text, Some(true));
719        assert_eq!(scanned.is_source, Some(false));
720        assert_eq!(scanned.is_script, Some(false));
721    }
722
723    #[test]
724    fn scanner_classifies_gradle_and_nix_manifests_as_source() {
725        let options = TextDetectionOptions {
726            collect_info: true,
727            detect_packages: false,
728            detect_application_packages: false,
729            detect_system_packages: false,
730            detect_packages_in_compiled: false,
731            detect_copyrights: false,
732            detect_generated: false,
733            detect_emails: false,
734            detect_urls: false,
735            max_emails: 50,
736            max_urls: 50,
737            timeout_seconds: 120.0,
738        };
739
740        let gradle = scan_single_file("build.gradle", "plugins { id 'java' }\n", &options);
741        let nix = scan_single_file("flake.nix", "{ inputs, ... }: {}\n", &options);
742
743        assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
744        assert_eq!(gradle.mime_type.as_deref(), Some("text/plain"));
745        assert_eq!(gradle.is_source, Some(true));
746        assert_eq!(gradle.is_script, Some(false));
747
748        assert_eq!(nix.programming_language.as_deref(), Some("Nix"));
749        assert_eq!(nix.mime_type.as_deref(), Some("text/plain"));
750        assert_eq!(nix.is_source, Some(true));
751        assert_eq!(nix.is_script, Some(false));
752    }
753
754    #[test]
755    fn scanner_treats_gitmodules_as_text_not_source() {
756        let options = TextDetectionOptions {
757            collect_info: true,
758            detect_packages: false,
759            detect_application_packages: false,
760            detect_system_packages: false,
761            detect_packages_in_compiled: false,
762            detect_copyrights: false,
763            detect_generated: false,
764            detect_emails: false,
765            detect_urls: false,
766            max_emails: 50,
767            max_urls: 50,
768            timeout_seconds: 120.0,
769        };
770        let scanned = scan_file_at_relative_path(
771            ".gitmodules",
772            b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
773            &options,
774        );
775
776        assert_eq!(scanned.programming_language, None);
777        assert_eq!(
778            scanned.file_type_label.as_deref(),
779            Some("Git configuration text")
780        );
781        assert_eq!(scanned.is_text, Some(true));
782        assert_eq!(scanned.is_source, Some(false));
783        assert_eq!(scanned.is_script, Some(false));
784    }
785
786    #[test]
787    fn scanner_treats_javascript_shebang_files_as_scripts() {
788        let options = TextDetectionOptions {
789            collect_info: true,
790            detect_packages: false,
791            detect_application_packages: false,
792            detect_system_packages: false,
793            detect_packages_in_compiled: false,
794            detect_copyrights: false,
795            detect_generated: false,
796            detect_emails: false,
797            detect_urls: false,
798            max_emails: 50,
799            max_urls: 50,
800            timeout_seconds: 120.0,
801        };
802        let scanned = scan_file_at_relative_path(
803            "bin/run",
804            b"#!/usr/bin/env node\nconsole.log('hello');\n",
805            &options,
806        );
807
808        assert_eq!(scanned.programming_language.as_deref(), Some("JavaScript"));
809        assert_eq!(
810            scanned.file_type_label.as_deref(),
811            Some("javascript script, UTF-8 Unicode text executable")
812        );
813        assert_eq!(scanned.is_script, Some(true));
814        assert_eq!(scanned.is_source, Some(true));
815    }
816
817    #[test]
818    fn scanner_treats_dockerfile_as_source() {
819        let options = TextDetectionOptions {
820            collect_info: true,
821            detect_packages: false,
822            detect_application_packages: false,
823            detect_system_packages: false,
824            detect_packages_in_compiled: false,
825            detect_copyrights: false,
826            detect_generated: false,
827            detect_emails: false,
828            detect_urls: false,
829            max_emails: 50,
830            max_urls: 50,
831            timeout_seconds: 120.0,
832        };
833        let scanned = scan_single_file("Dockerfile", "FROM scratch\n", &options);
834
835        assert_eq!(scanned.programming_language.as_deref(), Some("Dockerfile"));
836        assert_eq!(
837            scanned.file_type_label.as_deref(),
838            Some("UTF-8 Unicode text")
839        );
840        assert_eq!(scanned.is_source, Some(true));
841        assert_eq!(scanned.is_script, Some(false));
842    }
843
844    #[test]
845    fn scanner_treats_makefile_as_text_not_source() {
846        let options = TextDetectionOptions {
847            collect_info: true,
848            detect_packages: false,
849            detect_application_packages: false,
850            detect_system_packages: false,
851            detect_packages_in_compiled: false,
852            detect_copyrights: false,
853            detect_generated: false,
854            detect_emails: false,
855            detect_urls: false,
856            max_emails: 50,
857            max_urls: 50,
858            timeout_seconds: 120.0,
859        };
860        let scanned = scan_single_file("Makefile", "all:\n\techo hi\n", &options);
861
862        assert_eq!(scanned.programming_language, None);
863        assert_eq!(
864            scanned.file_type_label.as_deref(),
865            Some("UTF-8 Unicode text")
866        );
867        assert_eq!(scanned.is_text, Some(true));
868        assert_eq!(scanned.is_source, Some(false));
869        assert_eq!(scanned.is_script, Some(false));
870    }
871
872    #[test]
873    fn scanner_omits_info_surface_when_disabled() {
874        let options = TextDetectionOptions {
875            collect_info: false,
876            detect_packages: false,
877            detect_application_packages: false,
878            detect_system_packages: false,
879            detect_packages_in_compiled: false,
880            detect_copyrights: false,
881            detect_generated: false,
882            detect_emails: false,
883            detect_urls: false,
884            max_emails: 50,
885            max_urls: 50,
886            timeout_seconds: 120.0,
887        };
888        let scanned = scan_single_file(
889            "script.py",
890            "#!/usr/bin/env python3\nprint(\"hello\")\n",
891            &options,
892        );
893
894        assert!(scanned.sha1.is_none());
895        assert!(scanned.md5.is_none());
896        assert!(scanned.sha256.is_none());
897        assert!(scanned.sha1_git.is_none());
898        assert!(scanned.mime_type.is_none());
899        assert!(scanned.date.is_none());
900        assert!(scanned.programming_language.is_none());
901        assert!(scanned.is_binary.is_none());
902        assert!(scanned.is_text.is_none());
903        assert!(scanned.is_archive.is_none());
904        assert!(scanned.is_media.is_none());
905        assert!(scanned.is_script.is_none());
906        assert!(scanned.is_source.is_none());
907    }
908
909    #[test]
910    fn scanner_skips_package_parsing_when_disabled() {
911        let options = TextDetectionOptions {
912            collect_info: false,
913            detect_packages: false,
914            detect_application_packages: false,
915            detect_system_packages: false,
916            detect_packages_in_compiled: false,
917            detect_copyrights: false,
918            detect_generated: false,
919            detect_emails: false,
920            detect_urls: false,
921            max_emails: 50,
922            max_urls: 50,
923            timeout_seconds: 120.0,
924        };
925        let scanned = scan_single_file(
926            "package.json",
927            r#"{"name":"demo","version":"1.0.0"}"#,
928            &options,
929        );
930
931        assert!(
932            scanned.package_data.is_empty(),
933            "package_data: {:#?}",
934            scanned.package_data
935        );
936    }
937
938    #[test]
939    fn scanner_parses_package_manifests_when_enabled() {
940        let options = TextDetectionOptions {
941            collect_info: false,
942            detect_packages: true,
943            detect_application_packages: true,
944            detect_system_packages: false,
945            detect_packages_in_compiled: false,
946            detect_copyrights: false,
947            detect_generated: false,
948            detect_emails: false,
949            detect_urls: false,
950            max_emails: 50,
951            max_urls: 50,
952            timeout_seconds: 120.0,
953        };
954        let scanned = scan_single_file(
955            "package.json",
956            r#"{"name":"demo","version":"1.0.0"}"#,
957            &options,
958        );
959
960        assert_eq!(
961            scanned.package_data.len(),
962            1,
963            "package_data: {:#?}",
964            scanned.package_data
965        );
966    }
967
968    #[test]
969    fn scanner_skips_application_packages_when_only_system_packages_enabled() {
970        let options = TextDetectionOptions {
971            collect_info: false,
972            detect_packages: true,
973            detect_application_packages: false,
974            detect_system_packages: true,
975            detect_packages_in_compiled: false,
976            detect_copyrights: false,
977            detect_generated: false,
978            detect_emails: false,
979            detect_urls: false,
980            max_emails: 50,
981            max_urls: 50,
982            timeout_seconds: 120.0,
983        };
984        let scanned = scan_single_file(
985            "package.json",
986            r#"{"name":"demo","version":"1.0.0"}"#,
987            &options,
988        );
989
990        assert!(
991            scanned.package_data.is_empty(),
992            "package_data: {:#?}",
993            scanned.package_data
994        );
995    }
996
997    #[test]
998    fn scanner_parses_system_package_files_when_enabled() {
999        let options = TextDetectionOptions {
1000            collect_info: false,
1001            detect_packages: true,
1002            detect_application_packages: false,
1003            detect_system_packages: true,
1004            detect_packages_in_compiled: false,
1005            detect_copyrights: false,
1006            detect_generated: false,
1007            detect_emails: false,
1008            detect_urls: false,
1009            max_emails: 50,
1010            max_urls: 50,
1011            timeout_seconds: 120.0,
1012        };
1013        let scanned = scan_file_at_relative_path(
1014            "var/lib/dpkg/status",
1015            b"Package: demo\nVersion: 1.0\nArchitecture: all\nDescription: demo package\n\n",
1016            &options,
1017        );
1018
1019        assert!(
1020            !scanned.package_data.is_empty(),
1021            "package_data: {:#?}",
1022            scanned.package_data
1023        );
1024    }
1025
1026    #[test]
1027    fn scanner_only_parses_compiled_packages_when_package_in_compiled_is_enabled() {
1028        let temp_dir = TempDir::new().expect("create temp dir");
1029        fs::write(
1030            temp_dir.path().join("go.mod"),
1031            "module example.com/demo\n\ngo 1.23.0\n",
1032        )
1033        .expect("write go.mod");
1034        fs::write(
1035            temp_dir.path().join("main.go"),
1036            "package main\nfunc main() {}\n",
1037        )
1038        .expect("write main.go");
1039        let file_path = temp_dir.path().join("demo");
1040        let status = std::process::Command::new("go")
1041            .current_dir(temp_dir.path())
1042            .args(["build", "-o"])
1043            .arg(&file_path)
1044            .status()
1045            .expect("run go build");
1046        assert!(status.success());
1047
1048        let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1049        let collected = collect_paths(temp_dir.path(), 0, &[]);
1050
1051        let without_compiled = process_collected(
1052            &collected,
1053            Arc::clone(&progress),
1054            None,
1055            LicenseScanOptions::default(),
1056            &TextDetectionOptions {
1057                collect_info: false,
1058                detect_packages: true,
1059                detect_application_packages: true,
1060                detect_system_packages: false,
1061                detect_packages_in_compiled: false,
1062                detect_copyrights: false,
1063                detect_generated: false,
1064                detect_emails: false,
1065                detect_urls: false,
1066                max_emails: 50,
1067                max_urls: 50,
1068                timeout_seconds: 120.0,
1069            },
1070        );
1071        let with_compiled = process_collected(
1072            &collected,
1073            progress,
1074            None,
1075            LicenseScanOptions::default(),
1076            &TextDetectionOptions {
1077                collect_info: false,
1078                detect_packages: true,
1079                detect_application_packages: true,
1080                detect_system_packages: false,
1081                detect_packages_in_compiled: true,
1082                detect_copyrights: false,
1083                detect_generated: false,
1084                detect_emails: false,
1085                detect_urls: false,
1086                max_emails: 50,
1087                max_urls: 50,
1088                timeout_seconds: 120.0,
1089            },
1090        );
1091
1092        let without_compiled = without_compiled
1093            .files
1094            .into_iter()
1095            .find(|entry| entry.file_type == FileType::File && entry.path.ends_with("/demo"))
1096            .expect("compiled artifact present");
1097        let with_compiled = with_compiled
1098            .files
1099            .into_iter()
1100            .find(|entry| entry.file_type == FileType::File && entry.path.ends_with("/demo"))
1101            .expect("compiled artifact present");
1102
1103        assert!(
1104            without_compiled.package_data.is_empty(),
1105            "package_data: {:#?}",
1106            without_compiled.package_data
1107        );
1108        assert!(!with_compiled.package_data.is_empty());
1109    }
1110
1111    #[test]
1112    fn scanner_sets_is_source_only_when_info_enabled() {
1113        let without_info = TextDetectionOptions {
1114            collect_info: false,
1115            detect_packages: false,
1116            detect_application_packages: false,
1117            detect_system_packages: false,
1118            detect_packages_in_compiled: false,
1119            detect_copyrights: false,
1120            detect_generated: false,
1121            detect_emails: false,
1122            detect_urls: false,
1123            max_emails: 50,
1124            max_urls: 50,
1125            timeout_seconds: 120.0,
1126        };
1127        let with_info = TextDetectionOptions {
1128            collect_info: true,
1129            ..without_info.clone()
1130        };
1131
1132        let scanned_without_info = scan_single_file("main.rs", "fn main() {}\n", &without_info);
1133        let scanned_with_info = scan_single_file("main.rs", "fn main() {}\n", &with_info);
1134
1135        assert_eq!(scanned_without_info.is_source, None);
1136        assert_eq!(scanned_with_info.is_source, Some(true));
1137    }
1138
1139    #[test]
1140    fn directory_omits_info_fields_when_info_disabled() {
1141        let temp_dir = TempDir::new().expect("create temp dir");
1142        fs::create_dir_all(temp_dir.path().join("nested")).expect("create nested dir");
1143
1144        let collected = collect_paths(temp_dir.path(), 0, &[]);
1145        let result = process_collected(
1146            &collected,
1147            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1148            None,
1149            LicenseScanOptions::default(),
1150            &TextDetectionOptions {
1151                collect_info: false,
1152                detect_packages: false,
1153                detect_application_packages: false,
1154                detect_system_packages: false,
1155                detect_packages_in_compiled: false,
1156                detect_copyrights: false,
1157                detect_generated: false,
1158                detect_emails: false,
1159                detect_urls: false,
1160                max_emails: 50,
1161                max_urls: 50,
1162                timeout_seconds: 120.0,
1163            },
1164        );
1165
1166        let directory = result
1167            .files
1168            .into_iter()
1169            .find(|entry| entry.file_type == FileType::Directory && entry.path.ends_with("nested"))
1170            .expect("directory entry");
1171
1172        assert!(directory.date.is_none());
1173        assert!(directory.file_type_label.is_none());
1174        assert!(directory.is_binary.is_none());
1175        assert!(directory.is_text.is_none());
1176        assert!(directory.is_archive.is_none());
1177        assert!(directory.is_media.is_none());
1178        assert!(directory.is_source.is_none());
1179        assert!(directory.is_script.is_none());
1180    }
1181
1182    #[test]
1183    fn directory_includes_info_fields_when_info_enabled() {
1184        let temp_dir = TempDir::new().expect("create temp dir");
1185        fs::create_dir_all(temp_dir.path().join("nested")).expect("create nested dir");
1186
1187        let collected = collect_paths(temp_dir.path(), 0, &[]);
1188        let result = process_collected(
1189            &collected,
1190            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1191            None,
1192            LicenseScanOptions::default(),
1193            &TextDetectionOptions {
1194                collect_info: true,
1195                detect_packages: false,
1196                detect_application_packages: false,
1197                detect_system_packages: false,
1198                detect_packages_in_compiled: false,
1199                detect_copyrights: false,
1200                detect_generated: false,
1201                detect_emails: false,
1202                detect_urls: false,
1203                max_emails: 50,
1204                max_urls: 50,
1205                timeout_seconds: 120.0,
1206            },
1207        );
1208
1209        let directory = result
1210            .files
1211            .into_iter()
1212            .find(|entry| entry.file_type == FileType::Directory && entry.path.ends_with("nested"))
1213            .expect("directory entry");
1214
1215        assert!(directory.date.is_none());
1216        assert!(directory.file_type_label.is_none());
1217        assert_eq!(directory.is_binary, Some(false));
1218        assert_eq!(directory.is_text, Some(false));
1219        assert_eq!(directory.is_archive, Some(false));
1220        assert_eq!(directory.is_media, Some(false));
1221        assert_eq!(directory.is_source, Some(false));
1222        assert_eq!(directory.is_script, Some(false));
1223        assert_eq!(directory.files_count, Some(0));
1224        assert_eq!(directory.dirs_count, Some(0));
1225        assert_eq!(directory.size_count, Some(0));
1226    }
1227
1228    #[test]
1229    fn collect_paths_includes_root_directory_entry() {
1230        let temp_dir = TempDir::new().expect("create temp dir");
1231        fs::create_dir_all(temp_dir.path().join("src")).expect("create nested dir");
1232        fs::write(temp_dir.path().join("src").join("main.rs"), "fn main() {}")
1233            .expect("write nested file");
1234
1235        let collected = collect_paths(temp_dir.path(), 0, &[]);
1236
1237        assert!(
1238            collected
1239                .directories
1240                .iter()
1241                .any(|(path, _)| path == temp_dir.path())
1242        );
1243    }
1244
1245    #[test]
1246    fn collect_paths_supports_single_file_input() {
1247        let temp_dir = TempDir::new().expect("create temp dir");
1248        let file_path = temp_dir.path().join("main.rs");
1249        fs::write(&file_path, "fn main() {}\n").expect("write file");
1250
1251        let collected = collect_paths(&file_path, 0, &[]);
1252
1253        assert_eq!(collected.files.len(), 1);
1254        assert!(collected.directories.is_empty());
1255        assert_eq!(collected.files[0].0, file_path);
1256    }
1257
1258    #[test]
1259    fn process_collected_with_memory_limit_preserves_results_when_spilling() {
1260        let temp_dir = TempDir::new().expect("create temp dir");
1261        fs::write(temp_dir.path().join("a.txt"), "hello").expect("write first file");
1262        fs::write(temp_dir.path().join("b.txt"), "world").expect("write second file");
1263
1264        let collected = collect_paths(temp_dir.path(), 0, &[]);
1265        let result = process_collected_with_memory_limit(
1266            &collected,
1267            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1268            None,
1269            LicenseScanOptions::default(),
1270            &TextDetectionOptions {
1271                collect_info: false,
1272                detect_packages: false,
1273                detect_application_packages: false,
1274                detect_system_packages: false,
1275                detect_packages_in_compiled: false,
1276                detect_copyrights: false,
1277                detect_generated: false,
1278                detect_emails: false,
1279                detect_urls: false,
1280                max_emails: 50,
1281                max_urls: 50,
1282                timeout_seconds: 120.0,
1283            },
1284            1,
1285        );
1286
1287        assert_eq!(result.files.len(), 3);
1288    }
1289
1290    #[test]
1291    fn process_collected_with_negative_one_uses_disk_only_mode() {
1292        let temp_dir = TempDir::new().expect("create temp dir");
1293        fs::write(temp_dir.path().join("a.txt"), "hello").expect("write first file");
1294
1295        let collected = collect_paths(temp_dir.path(), 0, &[]);
1296        let result = process_collected_with_memory_limit(
1297            &collected,
1298            Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1299            None,
1300            LicenseScanOptions::default(),
1301            &TextDetectionOptions {
1302                collect_info: false,
1303                detect_packages: false,
1304                detect_application_packages: false,
1305                detect_system_packages: false,
1306                detect_packages_in_compiled: false,
1307                detect_copyrights: false,
1308                detect_generated: false,
1309                detect_emails: false,
1310                detect_urls: false,
1311                max_emails: 50,
1312                max_urls: 50,
1313                timeout_seconds: 120.0,
1314            },
1315            -1,
1316        );
1317
1318        assert_eq!(result.files.len(), 2);
1319    }
1320}