1mod collect;
5mod process;
6
7use crate::license_detection::LicenseDetectionEngine;
8use crate::models::FileInfo;
9
/// Bundle of scanned file entries plus an exclusion counter.
///
/// NOTE(review): presumably the value returned by the `process_collected*`
/// functions re-exported from `process` — confirm there.
pub struct ProcessResult {
    /// File entries produced by processing.
    pub files: Vec<FileInfo>,
    /// NOTE(review): presumably the number of paths dropped by exclusion
    /// rules — confirm against the code that fills it in.
    pub excluded_count: usize,
}
14
/// Switches and threshold passed along with a license scan.
///
/// The derived `Default` yields `false` for every flag and `0` for
/// `min_score`. Every field is written into the string produced by
/// `scan_options_fingerprint`.
#[derive(Debug, Clone, Copy, Default)]
pub struct LicenseScanOptions {
    /// NOTE(review): presumably attaches matched text to license results — confirm.
    pub include_text: bool,
    /// NOTE(review): presumably adds diagnostics to the matched text — confirm.
    pub include_text_diagnostics: bool,
    /// NOTE(review): presumably adds detection diagnostics to the output — confirm.
    pub include_diagnostics: bool,
    /// NOTE(review): presumably enables reporting of unknown licenses — confirm.
    pub unknown_licenses: bool,
    /// NOTE(review): presumably the minimum match score to keep (0 keeps
    /// everything) — confirm the scale against the license engine.
    pub min_score: u8,
}
23
/// Per-scan switches for the non-license detectors.
///
/// Every field is written into the string produced by
/// `scan_options_fingerprint`.
#[derive(Debug, Clone)]
pub struct TextDetectionOptions {
    /// Populate file-info fields such as checksums, MIME type and the
    /// `is_*` classification flags.
    pub collect_info: bool,
    /// Enable package detection.
    pub detect_packages: bool,
    /// NOTE(review): presumably limits/enables application package manifests — confirm.
    pub detect_application_packages: bool,
    /// NOTE(review): presumably enables system package databases — confirm.
    pub detect_system_packages: bool,
    /// NOTE(review): presumably enables package detection inside compiled binaries — confirm.
    pub detect_packages_in_compiled: bool,
    /// Enable copyright, holder and author detection.
    pub detect_copyrights: bool,
    /// Enable the "generated file" check.
    pub detect_generated: bool,
    /// Enable email detection.
    pub detect_emails: bool,
    /// Enable URL detection.
    pub detect_urls: bool,
    /// NOTE(review): presumably the cap on reported emails per file — confirm.
    pub max_emails: usize,
    /// NOTE(review): presumably the cap on reported URLs per file — confirm.
    pub max_urls: usize,
    /// NOTE(review): presumably the per-file time budget in seconds — confirm.
    pub timeout_seconds: f64,
}

impl Default for TextDetectionOptions {
    /// Copyright detection is the only detector switched on by default.
    fn default() -> Self {
        Self {
            // The single detector that is enabled out of the box.
            detect_copyrights: true,

            // Everything else is opt-in.
            collect_info: false,
            detect_packages: false,
            detect_application_packages: false,
            detect_system_packages: false,
            detect_packages_in_compiled: false,
            detect_generated: false,
            detect_emails: false,
            detect_urls: false,

            // Limits.
            max_emails: 50,
            max_urls: 50,
            timeout_seconds: 120.0,
        }
    }
}
58
59pub fn scan_options_fingerprint(
60 text_options: &TextDetectionOptions,
61 license_options: LicenseScanOptions,
62 license_engine: Option<&LicenseDetectionEngine>,
63) -> String {
64 let (license_enabled, rules_count, first_rule_id, last_rule_id) = match license_engine {
65 Some(engine) => {
66 let rules = &engine.index().rules_by_rid;
67 (
68 true,
69 rules.len(),
70 rules
71 .first()
72 .map(|rule| rule.identifier.as_str())
73 .unwrap_or(""),
74 rules
75 .last()
76 .map(|rule| rule.identifier.as_str())
77 .unwrap_or(""),
78 )
79 }
80 None => (false, 0, "", ""),
81 };
82
83 format!(
84 "tool_version={};info={};packages={};app_packages={};system_packages={};compiled_packages={};copyrights={};generated={};emails={};urls={};max_emails={};max_urls={};timeout={:.6};license_enabled={};rules_count={};first_rule_id={};last_rule_id={};license_text={};license_text_diagnostics={};license_diagnostics={};unknown_licenses={};license_score={}",
85 crate::version::BUILD_VERSION,
86 text_options.collect_info,
87 text_options.detect_packages,
88 text_options.detect_application_packages,
89 text_options.detect_system_packages,
90 text_options.detect_packages_in_compiled,
91 text_options.detect_copyrights,
92 text_options.detect_generated,
93 text_options.detect_emails,
94 text_options.detect_urls,
95 text_options.max_emails,
96 text_options.max_urls,
97 text_options.timeout_seconds,
98 license_enabled,
99 rules_count,
100 first_rule_id,
101 last_rule_id,
102 license_options.include_text,
103 license_options.include_text_diagnostics,
104 license_options.include_diagnostics,
105 license_options.unknown_licenses,
106 license_options.min_score,
107 )
108}
109
110pub use self::collect::{
111 CollectedPaths, CollectionFrontier, collect_paths, collect_selected_paths,
112};
113#[allow(unused_imports)]
114pub use self::process::{
115 MemoryMode, process_collected, process_collected_sequential,
116 process_collected_with_memory_limit, process_collected_with_memory_limit_sequential,
117};
118
119#[cfg(test)]
120mod tests {
121 use std::fs;
122 use std::path::PathBuf;
123 use std::sync::Arc;
124
125 use tempfile::TempDir;
126
127 use crate::cache::build_collection_exclude_patterns;
128 use crate::license_detection::LicenseDetectionEngine;
129 use crate::models::{DatasourceId, FileType, PackageType as FilePackageType};
130 use crate::progress::{ProgressMode, ScanProgress};
131
132 use super::{
133 CollectionFrontier, LicenseScanOptions, MemoryMode, TextDetectionOptions, collect_paths,
134 collect_selected_paths, process_collected, process_collected_with_memory_limit,
135 scan_options_fingerprint,
136 };
137
138 fn build_sparse_oversized_rpm_with_filename(
139 temp_dir: &TempDir,
140 package_name: &str,
141 filename: &str,
142 ) -> PathBuf {
143 let file_path = temp_dir.path().join(filename);
144 rpm::PackageBuilder::new(package_name, "1.0", "MIT", "x86_64", "Demo RPM package")
145 .release("1")
146 .build()
147 .expect("build rpm fixture")
148 .write_file(&file_path)
149 .expect("write rpm fixture");
150 fs::OpenOptions::new()
151 .write(true)
152 .open(&file_path)
153 .expect("open rpm fixture for sparse extension")
154 .set_len(100 * 1024 * 1024 + 1_048_576)
155 .expect("extend rpm fixture");
156 file_path
157 }
158
159 fn build_sparse_oversized_rpm(temp_dir: &TempDir, name: &str) -> PathBuf {
160 build_sparse_oversized_rpm_with_filename(
161 temp_dir,
162 name,
163 &format!("{name}-1.0-1.x86_64.rpm"),
164 )
165 }
166
167 fn build_sparse_oversized_pack_rpm(temp_dir: &TempDir, name: &str) -> PathBuf {
168 build_sparse_oversized_rpm_with_filename(
169 temp_dir,
170 name,
171 &format!("{name}-1.0-1.x86_64.pack"),
172 )
173 }
174
175 #[test]
176 fn default_options_keep_copyright_detection_enabled() {
177 let options = TextDetectionOptions::default();
178 assert!(!options.detect_packages);
179 assert!(options.detect_copyrights);
180 }
181
182 #[test]
183 fn test_scan_options_fingerprint_changes_with_license_score() {
184 let text_options = TextDetectionOptions::default();
185 let default_fingerprint = scan_options_fingerprint(
186 &text_options,
187 LicenseScanOptions {
188 min_score: 0,
189 ..LicenseScanOptions::default()
190 },
191 None,
192 );
193 let filtered_fingerprint = scan_options_fingerprint(
194 &text_options,
195 LicenseScanOptions {
196 min_score: 70,
197 ..LicenseScanOptions::default()
198 },
199 None,
200 );
201
202 assert_ne!(default_fingerprint, filtered_fingerprint);
203 }
204
205 fn scan_single_file(
206 file_name: &str,
207 content: &str,
208 options: &TextDetectionOptions,
209 ) -> crate::models::FileInfo {
210 let temp_dir = TempDir::new().expect("create temp dir");
211 let file_path = temp_dir.path().join(file_name);
212 fs::write(&file_path, content).expect("write test file");
213
214 let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
215 let collected = collect_paths(temp_dir.path(), 0, &[]);
216 let result = process_collected(
217 &collected,
218 progress,
219 None,
220 LicenseScanOptions::default(),
221 options,
222 );
223
224 result
225 .files
226 .into_iter()
227 .find(|entry| {
228 entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
229 })
230 .expect("scanned file entry")
231 }
232
233 fn scan_file_at_relative_path(
234 relative_path: &str,
235 content: &[u8],
236 options: &TextDetectionOptions,
237 ) -> crate::models::FileInfo {
238 let temp_dir = TempDir::new().expect("create temp dir");
239 let file_path = temp_dir.path().join(relative_path);
240 if let Some(parent) = file_path.parent() {
241 fs::create_dir_all(parent).expect("create parent dirs");
242 }
243 fs::write(&file_path, content).expect("write test file");
244
245 let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
246 let collected = collect_paths(temp_dir.path(), 0, &[]);
247 let result = process_collected(
248 &collected,
249 progress,
250 None,
251 LicenseScanOptions::default(),
252 options,
253 );
254
255 result
256 .files
257 .into_iter()
258 .find(|entry| {
259 entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
260 })
261 .expect("scanned file entry")
262 }
263
264 fn scan_single_file_with_license_engine(
265 file_name: &str,
266 content: &str,
267 options: &TextDetectionOptions,
268 ) -> crate::models::FileInfo {
269 let temp_dir = TempDir::new().expect("create temp dir");
270 let file_path = temp_dir.path().join(file_name);
271 fs::write(&file_path, content).expect("write test file");
272
273 let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
274 let collected = collect_paths(temp_dir.path(), 0, &[]);
275 let engine =
276 Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
277 let result = process_collected(
278 &collected,
279 progress,
280 Some(engine),
281 LicenseScanOptions::default(),
282 options,
283 );
284
285 result
286 .files
287 .into_iter()
288 .find(|entry| {
289 entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
290 })
291 .expect("scanned file entry")
292 }
293
294 #[test]
295 fn scanner_reports_repeated_email_occurrences() {
296 let options = TextDetectionOptions {
297 collect_info: false,
298 detect_packages: false,
299 detect_application_packages: false,
300 detect_system_packages: false,
301 detect_packages_in_compiled: false,
302 detect_copyrights: false,
303 detect_generated: false,
304 detect_emails: true,
305 detect_urls: false,
306 max_emails: 50,
307 max_urls: 50,
308 timeout_seconds: 120.0,
309 };
310 let scanned = scan_single_file(
311 "contacts.txt",
312 "linux@3ware.com\nlinux@3ware.com\nandre@suse.com\nlinux@3ware.com\n",
313 &options,
314 );
315
316 let emails: Vec<(&str, usize)> = scanned
317 .emails
318 .iter()
319 .map(|email| (email.email.as_str(), email.start_line.get()))
320 .collect();
321
322 assert_eq!(emails.len(), 4, "emails: {emails:#?}");
323 assert_eq!(
324 emails,
325 vec![
326 ("linux@3ware.com", 1),
327 ("linux@3ware.com", 2),
328 ("andre@suse.com", 3),
329 ("linux@3ware.com", 4),
330 ]
331 );
332 }
333
334 #[test]
335 fn scanner_skips_pem_certificate_text_detection() {
336 let options = TextDetectionOptions {
337 collect_info: false,
338 detect_packages: false,
339 detect_application_packages: false,
340 detect_system_packages: false,
341 detect_packages_in_compiled: false,
342 detect_copyrights: true,
343 detect_generated: false,
344 detect_emails: true,
345 detect_urls: true,
346 max_emails: 50,
347 max_urls: 50,
348 timeout_seconds: 120.0,
349 };
350 let pem_fixture = concat!(
351 "-----BEGIN CERTIFICATE-----\n",
352 "MIID8TCCAtmgAwIBAgIQQT1yx/RrH4FDffHSKFTfmjANBgkqhkiG9w0BAQUFADCB\n",
353 "ijELMAkGA1UEBhMCQ0gxEDAOBgNVBAoTB1dJU2VLZXkxGzAZBgNVBAsTEkNvcHly\n",
354 "-----END CERTIFICATE-----\n",
355 "Certificate:\n",
356 " Data:\n",
357 " Signature Algorithm: sha1WithRSAEncryption\n",
358 " Issuer: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
359 " Subject: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
360 " Contact: cert-owner@example.com\n",
361 );
362 let scanned = scan_single_file("cert.pem", pem_fixture, &options);
363
364 assert!(
365 scanned.copyrights.is_empty(),
366 "copyrights: {:#?}",
367 scanned.copyrights
368 );
369 assert!(
370 scanned.holders.is_empty(),
371 "holders: {:#?}",
372 scanned.holders
373 );
374 assert!(
375 scanned.authors.is_empty(),
376 "authors: {:#?}",
377 scanned.authors
378 );
379 assert!(scanned.emails.is_empty(), "emails: {:#?}", scanned.emails);
380 assert!(scanned.urls.is_empty(), "urls: {:#?}", scanned.urls);
381 assert!(
382 scanned.license_detections.is_empty(),
383 "licenses: {:#?}",
384 scanned.license_detections
385 );
386 assert!(
387 scanned.license_clues.is_empty(),
388 "license clues: {:#?}",
389 scanned.license_clues
390 );
391 }
392
393 #[test]
394 fn scanner_keeps_source_headers_when_pem_blocks_are_embedded() {
395 let options = TextDetectionOptions {
396 collect_info: false,
397 detect_packages: false,
398 detect_application_packages: false,
399 detect_system_packages: false,
400 detect_packages_in_compiled: false,
401 detect_copyrights: true,
402 detect_generated: false,
403 detect_emails: false,
404 detect_urls: true,
405 max_emails: 50,
406 max_urls: 50,
407 timeout_seconds: 120.0,
408 };
409 let fixture = concat!(
410 "/*\n",
411 "Copyright 2022 The Kubernetes Authors.\n\n",
412 "Licensed under the Apache License, Version 2.0 (the \"License\");\n",
413 "you may not use this file except in compliance with the License.\n",
414 "You may obtain a copy of the License at\n\n",
415 " http://www.apache.org/licenses/LICENSE-2.0\n",
416 "*/\n\n",
417 "package storage\n\n",
418 "const validCert = `\n",
419 "-----BEGIN CERTIFICATE-----\n",
420 "MIIDmTCCAoGgAwIBAgIUWQ==\n",
421 "-----END CERTIFICATE-----\n",
422 "`\n",
423 );
424 let temp_dir = TempDir::new().expect("create temp dir");
425 let file_path = temp_dir.path().join("storage_test.go");
426 fs::write(&file_path, fixture).expect("write fixture");
427
428 let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
429 let collected = collect_paths(temp_dir.path(), 0, &[]);
430 let engine =
431 Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
432 let result = process_collected(
433 &collected,
434 progress,
435 Some(engine),
436 LicenseScanOptions::default(),
437 &options,
438 );
439 let scanned = result
440 .files
441 .into_iter()
442 .find(|entry| {
443 entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
444 })
445 .expect("scanned file entry");
446
447 assert!(
448 scanned
449 .copyrights
450 .iter()
451 .any(|c| c.copyright == "Copyright 2022 The Kubernetes Authors"),
452 "copyrights: {:#?}",
453 scanned.copyrights
454 );
455 assert!(
456 scanned
457 .holders
458 .iter()
459 .any(|h| h.holder == "The Kubernetes Authors"),
460 "holders: {:#?}",
461 scanned.holders
462 );
463 assert!(
464 scanned
465 .urls
466 .iter()
467 .any(|u| u.url == "http://www.apache.org/licenses/LICENSE-2.0"),
468 "urls: {:#?}",
469 scanned.urls
470 );
471 assert_eq!(scanned.license_expression.as_deref(), Some("Apache-2.0"));
472 }
473
474 #[test]
475 fn scanner_detects_structured_credits_authors() {
476 let options = TextDetectionOptions {
477 collect_info: false,
478 detect_packages: false,
479 detect_application_packages: false,
480 detect_system_packages: false,
481 detect_packages_in_compiled: false,
482 detect_copyrights: true,
483 detect_generated: false,
484 detect_emails: false,
485 detect_urls: false,
486 max_emails: 50,
487 max_urls: 50,
488 timeout_seconds: 120.0,
489 };
490 let credits_fixture = concat!(
491 "N: Jack Lloyd\n",
492 "E: lloyd@randombit.net\n",
493 "W: http://www.randombit.net/\n",
494 );
495 let scanned = scan_single_file("CREDITS", credits_fixture, &options);
496
497 let authors: Vec<(&str, usize, usize)> = scanned
498 .authors
499 .iter()
500 .map(|author| {
501 (
502 author.author.as_str(),
503 author.start_line.get(),
504 author.end_line.get(),
505 )
506 })
507 .collect();
508
509 assert_eq!(
510 authors,
511 vec![(
512 "Jack Lloyd lloyd@randombit.net http://www.randombit.net/",
513 1,
514 3,
515 )]
516 );
517 assert!(scanned.copyrights.is_empty());
518 assert!(scanned.holders.is_empty());
519 }
520
521 #[test]
522 fn scanner_uses_or_for_alternative_license_header() {
523 let fixture =
524 include_str!("../../testdata/license-golden/datadriven/external/boost-json-d2s.ipp");
525 let temp_dir = TempDir::new().expect("create temp dir");
526 let file_path = temp_dir.path().join("d2s.ipp");
527 fs::write(&file_path, fixture).expect("write fixture");
528
529 let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
530 let collected = collect_paths(temp_dir.path(), 0, &[]);
531 let engine =
532 Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
533 let result = process_collected(
534 &collected,
535 progress,
536 Some(engine),
537 LicenseScanOptions::default(),
538 &TextDetectionOptions::default(),
539 );
540 let scanned = result
541 .files
542 .into_iter()
543 .find(|entry| {
544 entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
545 })
546 .expect("scanned file entry");
547
548 assert_eq!(
549 scanned.license_expression.as_deref(),
550 Some("Apache-2.0 OR BSL-1.0")
551 );
552 assert!(
553 scanned.license_clues.is_empty(),
554 "license clues: {:#?}",
555 scanned.license_clues
556 );
557 assert_eq!(
558 scanned.license_detections.len(),
559 1,
560 "detections: {:#?}",
561 scanned.license_detections
562 );
563
564 let detection = &scanned.license_detections[0];
565 assert_eq!(detection.license_expression_spdx, "Apache-2.0 OR BSL-1.0");
566
567 let match_expressions: Vec<_> = detection
568 .matches
569 .iter()
570 .map(|m| m.license_expression_spdx.as_str())
571 .collect();
572 assert_eq!(match_expressions, vec!["Apache-2.0", "BSL-1.0"]);
573 }
574
575 #[test]
576 fn scanner_sets_generated_flag_when_enabled() {
577 let options = TextDetectionOptions {
578 collect_info: false,
579 detect_packages: false,
580 detect_application_packages: false,
581 detect_system_packages: false,
582 detect_packages_in_compiled: false,
583 detect_copyrights: false,
584 detect_generated: true,
585 detect_emails: false,
586 detect_urls: false,
587 max_emails: 50,
588 max_urls: 50,
589 timeout_seconds: 120.0,
590 };
591 let scanned = scan_single_file(
592 "generated.c",
593 "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
594 &options,
595 );
596
597 assert_eq!(scanned.is_generated, Some(true));
598 }
599
600 #[test]
601 fn scanner_leaves_generated_flag_unset_when_disabled() {
602 let options = TextDetectionOptions {
603 collect_info: false,
604 detect_packages: false,
605 detect_application_packages: false,
606 detect_system_packages: false,
607 detect_packages_in_compiled: false,
608 detect_copyrights: false,
609 detect_generated: false,
610 detect_emails: false,
611 detect_urls: false,
612 max_emails: 50,
613 max_urls: 50,
614 timeout_seconds: 120.0,
615 };
616 let scanned = scan_single_file(
617 "generated.c",
618 "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
619 &options,
620 );
621
622 assert_eq!(scanned.is_generated, None);
623 }
624
625 #[test]
626 fn scanner_populates_info_surface_when_enabled() {
627 let options = TextDetectionOptions {
628 collect_info: true,
629 detect_packages: false,
630 detect_application_packages: false,
631 detect_system_packages: false,
632 detect_packages_in_compiled: false,
633 detect_copyrights: false,
634 detect_generated: false,
635 detect_emails: false,
636 detect_urls: false,
637 max_emails: 50,
638 max_urls: 50,
639 timeout_seconds: 120.0,
640 };
641 let scanned = scan_single_file(
642 "script.py",
643 "#!/usr/bin/env python3\nprint(\"hello\")\n",
644 &options,
645 );
646
647 assert!(scanned.sha1.is_some());
648 assert!(scanned.md5.is_some());
649 assert!(scanned.sha256.is_some());
650 assert!(scanned.sha1_git.is_some());
651 assert!(scanned.mime_type.is_some());
652 assert!(scanned.date.is_some());
653 assert_eq!(scanned.programming_language.as_deref(), Some("Python"));
654 assert_eq!(scanned.is_text, Some(true));
655 assert_eq!(scanned.is_script, Some(true));
656 assert_eq!(scanned.is_source, Some(true));
657 }
658
659 #[test]
660 fn scanner_treats_latin1_python_sources_as_textual_scripts() {
661 let options = TextDetectionOptions {
662 collect_info: true,
663 detect_packages: false,
664 detect_application_packages: false,
665 detect_system_packages: false,
666 detect_packages_in_compiled: false,
667 detect_copyrights: false,
668 detect_generated: false,
669 detect_emails: false,
670 detect_urls: false,
671 max_emails: 50,
672 max_urls: 50,
673 timeout_seconds: 120.0,
674 };
675 let latin1_python = b"# coding: latin-1\nprint(\"caf\xe9\")\n# comment padding\n";
676 let scanned = scan_file_at_relative_path("script.py", latin1_python, &options);
677
678 assert_eq!(scanned.programming_language.as_deref(), Some("Python"));
679 assert_eq!(
680 scanned.file_type_label.as_deref(),
681 Some("python script, text executable")
682 );
683 assert_eq!(scanned.is_binary, Some(false));
684 assert_eq!(scanned.is_text, Some(true));
685 assert_eq!(scanned.is_script, Some(true));
686 assert_eq!(scanned.is_source, Some(true));
687 }
688
689 #[test]
690 fn scanner_skips_findings_for_zip_like_archives() {
691 let options = TextDetectionOptions {
692 collect_info: true,
693 detect_packages: false,
694 detect_application_packages: false,
695 detect_system_packages: false,
696 detect_packages_in_compiled: false,
697 detect_copyrights: true,
698 detect_generated: false,
699 detect_emails: true,
700 detect_urls: true,
701 max_emails: 50,
702 max_urls: 50,
703 timeout_seconds: 120.0,
704 };
705 let archive_like = b"PK\x03\x04\x14\x00\x00\x00\x08\x00MIT License\ncontact@example.com\nhttps://example.com\n";
706 let scanned = scan_file_at_relative_path("demo.whl", archive_like, &options);
707
708 assert_eq!(scanned.mime_type.as_deref(), Some("application/zip"));
709 assert_eq!(scanned.is_archive, Some(true));
710 assert!(scanned.license_detections.is_empty());
711 assert!(scanned.copyrights.is_empty());
712 assert!(scanned.emails.is_empty());
713 assert!(scanned.urls.is_empty());
714 }
715
716 #[test]
717 fn scanner_treats_typescript_sources_as_text_not_video_media() {
718 let options = TextDetectionOptions {
719 collect_info: true,
720 detect_packages: false,
721 detect_application_packages: false,
722 detect_system_packages: false,
723 detect_packages_in_compiled: false,
724 detect_copyrights: false,
725 detect_generated: false,
726 detect_emails: false,
727 detect_urls: false,
728 max_emails: 50,
729 max_urls: 50,
730 timeout_seconds: 120.0,
731 };
732 let scanned = scan_single_file("main.ts", "export const answer: number = 42;\n", &options);
733
734 assert_eq!(scanned.programming_language.as_deref(), Some("TypeScript"));
735 assert_eq!(scanned.mime_type.as_deref(), Some("text/plain"));
736 assert_eq!(
737 scanned.file_type_label.as_deref(),
738 Some("UTF-8 Unicode text")
739 );
740 assert_eq!(scanned.is_text, Some(true));
741 assert_eq!(scanned.is_media, Some(false));
742 assert_eq!(scanned.is_script, Some(false));
743 assert_eq!(scanned.is_source, Some(true));
744 }
745
746 #[test]
747 fn scanner_normalizes_sparse_ts_files_away_from_video_mime() {
748 let options = TextDetectionOptions {
749 collect_info: true,
750 detect_packages: false,
751 detect_application_packages: false,
752 detect_system_packages: false,
753 detect_packages_in_compiled: false,
754 detect_copyrights: false,
755 detect_generated: false,
756 detect_emails: false,
757 detect_urls: false,
758 max_emails: 50,
759 max_urls: 50,
760 timeout_seconds: 120.0,
761 };
762 let scanned = scan_single_file("main.ts", "// comment-only TypeScript fixture\n", &options);
763
764 assert_eq!(scanned.mime_type.as_deref(), Some("text/plain"));
765 assert_eq!(
766 scanned.file_type_label.as_deref(),
767 Some("UTF-8 Unicode text")
768 );
769 assert_eq!(scanned.is_text, Some(true));
770 assert_eq!(scanned.is_media, Some(false));
771 assert_eq!(scanned.is_script, Some(false));
772 assert_eq!(scanned.is_source, Some(true));
773 }
774
775 #[test]
776 fn scanner_treats_empty_files_like_scancode_info_surface() {
777 let options = TextDetectionOptions {
778 collect_info: true,
779 detect_packages: false,
780 detect_application_packages: false,
781 detect_system_packages: false,
782 detect_packages_in_compiled: false,
783 detect_copyrights: false,
784 detect_generated: false,
785 detect_emails: false,
786 detect_urls: false,
787 max_emails: 50,
788 max_urls: 50,
789 timeout_seconds: 120.0,
790 };
791 let scanned = scan_single_file("test.txt", "", &options);
792
793 assert_eq!(scanned.mime_type.as_deref(), Some("inode/x-empty"));
794 assert_eq!(scanned.file_type_label.as_deref(), Some("empty"));
795 assert_eq!(scanned.programming_language, None);
796 assert_eq!(scanned.is_binary, Some(false));
797 assert_eq!(scanned.is_text, Some(true));
798 assert_eq!(scanned.is_archive, Some(false));
799 assert_eq!(scanned.is_media, Some(false));
800 assert_eq!(scanned.is_source, Some(false));
801 assert_eq!(scanned.is_script, Some(false));
802 }
803
804 #[test]
805 fn scanner_treats_package_json_as_text_not_source() {
806 let options = TextDetectionOptions {
807 collect_info: true,
808 detect_packages: false,
809 detect_application_packages: false,
810 detect_system_packages: false,
811 detect_packages_in_compiled: false,
812 detect_copyrights: false,
813 detect_generated: false,
814 detect_emails: false,
815 detect_urls: false,
816 max_emails: 50,
817 max_urls: 50,
818 timeout_seconds: 120.0,
819 };
820 let scanned = scan_single_file("package.json", r#"{"name":"demo"}"#, &options);
821
822 assert_eq!(scanned.mime_type.as_deref(), Some("application/json"));
823 assert_eq!(scanned.file_type_label.as_deref(), Some("JSON text data"));
824 assert_eq!(scanned.programming_language, None);
825 assert_eq!(scanned.is_text, Some(true));
826 assert_eq!(scanned.is_source, Some(false));
827 assert_eq!(scanned.is_script, Some(false));
828 }
829
830 #[test]
831 fn scanner_classifies_gradle_and_nix_manifests_as_source() {
832 let options = TextDetectionOptions {
833 collect_info: true,
834 detect_packages: false,
835 detect_application_packages: false,
836 detect_system_packages: false,
837 detect_packages_in_compiled: false,
838 detect_copyrights: false,
839 detect_generated: false,
840 detect_emails: false,
841 detect_urls: false,
842 max_emails: 50,
843 max_urls: 50,
844 timeout_seconds: 120.0,
845 };
846
847 let gradle = scan_single_file("build.gradle", "plugins { id 'java' }\n", &options);
848 let nix = scan_single_file("flake.nix", "{ inputs, ... }: {}\n", &options);
849
850 assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
851 assert_eq!(gradle.mime_type.as_deref(), Some("text/plain"));
852 assert_eq!(gradle.is_source, Some(true));
853 assert_eq!(gradle.is_script, Some(false));
854
855 assert_eq!(nix.programming_language.as_deref(), Some("Nix"));
856 assert_eq!(nix.mime_type.as_deref(), Some("text/plain"));
857 assert_eq!(nix.is_source, Some(true));
858 assert_eq!(nix.is_script, Some(false));
859 }
860
861 #[test]
862 fn scanner_treats_gitmodules_as_text_not_source() {
863 let options = TextDetectionOptions {
864 collect_info: true,
865 detect_packages: false,
866 detect_application_packages: false,
867 detect_system_packages: false,
868 detect_packages_in_compiled: false,
869 detect_copyrights: false,
870 detect_generated: false,
871 detect_emails: false,
872 detect_urls: false,
873 max_emails: 50,
874 max_urls: 50,
875 timeout_seconds: 120.0,
876 };
877 let scanned = scan_file_at_relative_path(
878 ".gitmodules",
879 b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
880 &options,
881 );
882
883 assert_eq!(scanned.programming_language, None);
884 assert_eq!(
885 scanned.file_type_label.as_deref(),
886 Some("Git configuration text")
887 );
888 assert_eq!(scanned.is_text, Some(true));
889 assert_eq!(scanned.is_source, Some(false));
890 assert_eq!(scanned.is_script, Some(false));
891 }
892
893 #[test]
894 fn scanner_treats_javascript_shebang_files_as_scripts() {
895 let options = TextDetectionOptions {
896 collect_info: true,
897 detect_packages: false,
898 detect_application_packages: false,
899 detect_system_packages: false,
900 detect_packages_in_compiled: false,
901 detect_copyrights: false,
902 detect_generated: false,
903 detect_emails: false,
904 detect_urls: false,
905 max_emails: 50,
906 max_urls: 50,
907 timeout_seconds: 120.0,
908 };
909 let scanned = scan_file_at_relative_path(
910 "bin/run",
911 b"#!/usr/bin/env node\nconsole.log('hello');\n",
912 &options,
913 );
914
915 assert_eq!(scanned.programming_language.as_deref(), Some("JavaScript"));
916 assert_eq!(
917 scanned.file_type_label.as_deref(),
918 Some("javascript script, UTF-8 Unicode text executable")
919 );
920 assert_eq!(scanned.is_script, Some(true));
921 assert_eq!(scanned.is_source, Some(true));
922 }
923
924 #[test]
925 fn scanner_treats_dockerfile_as_source() {
926 let options = TextDetectionOptions {
927 collect_info: true,
928 detect_packages: false,
929 detect_application_packages: false,
930 detect_system_packages: false,
931 detect_packages_in_compiled: false,
932 detect_copyrights: false,
933 detect_generated: false,
934 detect_emails: false,
935 detect_urls: false,
936 max_emails: 50,
937 max_urls: 50,
938 timeout_seconds: 120.0,
939 };
940 let scanned = scan_single_file("Dockerfile", "FROM scratch\n", &options);
941
942 assert_eq!(scanned.programming_language.as_deref(), Some("Dockerfile"));
943 assert_eq!(
944 scanned.file_type_label.as_deref(),
945 Some("UTF-8 Unicode text")
946 );
947 assert_eq!(scanned.is_source, Some(true));
948 assert_eq!(scanned.is_script, Some(false));
949 }
950
951 #[test]
952 fn scanner_treats_makefile_as_text_not_source() {
953 let options = TextDetectionOptions {
954 collect_info: true,
955 detect_packages: false,
956 detect_application_packages: false,
957 detect_system_packages: false,
958 detect_packages_in_compiled: false,
959 detect_copyrights: false,
960 detect_generated: false,
961 detect_emails: false,
962 detect_urls: false,
963 max_emails: 50,
964 max_urls: 50,
965 timeout_seconds: 120.0,
966 };
967 let scanned = scan_single_file("Makefile", "all:\n\techo hi\n", &options);
968
969 assert_eq!(scanned.programming_language, None);
970 assert_eq!(
971 scanned.file_type_label.as_deref(),
972 Some("UTF-8 Unicode text")
973 );
974 assert_eq!(scanned.is_text, Some(true));
975 assert_eq!(scanned.is_source, Some(false));
976 assert_eq!(scanned.is_script, Some(false));
977 }
978
979 #[test]
980 fn scanner_omits_info_surface_when_disabled() {
981 let options = TextDetectionOptions {
982 collect_info: false,
983 detect_packages: false,
984 detect_application_packages: false,
985 detect_system_packages: false,
986 detect_packages_in_compiled: false,
987 detect_copyrights: false,
988 detect_generated: false,
989 detect_emails: false,
990 detect_urls: false,
991 max_emails: 50,
992 max_urls: 50,
993 timeout_seconds: 120.0,
994 };
995 let scanned = scan_single_file(
996 "script.py",
997 "#!/usr/bin/env python3\nprint(\"hello\")\n",
998 &options,
999 );
1000
1001 assert!(scanned.sha1.is_none());
1002 assert!(scanned.md5.is_none());
1003 assert!(scanned.sha256.is_none());
1004 assert!(scanned.sha1_git.is_none());
1005 assert!(scanned.mime_type.is_none());
1006 assert!(scanned.date.is_none());
1007 assert!(scanned.programming_language.is_none());
1008 assert!(scanned.is_binary.is_none());
1009 assert!(scanned.is_text.is_none());
1010 assert!(scanned.is_archive.is_none());
1011 assert!(scanned.is_media.is_none());
1012 assert!(scanned.is_script.is_none());
1013 assert!(scanned.is_source.is_none());
1014 }
1015
1016 #[test]
1017 fn scanner_skips_package_parsing_when_disabled() {
1018 let options = TextDetectionOptions {
1019 collect_info: false,
1020 detect_packages: false,
1021 detect_application_packages: false,
1022 detect_system_packages: false,
1023 detect_packages_in_compiled: false,
1024 detect_copyrights: false,
1025 detect_generated: false,
1026 detect_emails: false,
1027 detect_urls: false,
1028 max_emails: 50,
1029 max_urls: 50,
1030 timeout_seconds: 120.0,
1031 };
1032 let scanned = scan_single_file(
1033 "package.json",
1034 r#"{"name":"demo","version":"1.0.0"}"#,
1035 &options,
1036 );
1037
1038 assert!(
1039 scanned.package_data.is_empty(),
1040 "package_data: {:#?}",
1041 scanned.package_data
1042 );
1043 }
1044
1045 #[test]
1046 fn scanner_parses_package_manifests_when_enabled() {
1047 let options = TextDetectionOptions {
1048 collect_info: false,
1049 detect_packages: true,
1050 detect_application_packages: true,
1051 detect_system_packages: false,
1052 detect_packages_in_compiled: false,
1053 detect_copyrights: false,
1054 detect_generated: false,
1055 detect_emails: false,
1056 detect_urls: false,
1057 max_emails: 50,
1058 max_urls: 50,
1059 timeout_seconds: 120.0,
1060 };
1061 let scanned = scan_single_file(
1062 "package.json",
1063 r#"{"name":"demo","version":"1.0.0"}"#,
1064 &options,
1065 );
1066
1067 assert_eq!(
1068 scanned.package_data.len(),
1069 1,
1070 "package_data: {:#?}",
1071 scanned.package_data
1072 );
1073 }
1074
1075 #[test]
1076 fn scanner_parses_oversized_rpm_in_package_only_mode_without_size_warning() {
1077 let temp_dir = TempDir::new().expect("create temp dir");
1078 let file_path = build_sparse_oversized_rpm(&temp_dir, "oversized-demo");
1079
1080 let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1081 let collected = collect_paths(temp_dir.path(), 0, &[]);
1082 let result = process_collected(
1083 &collected,
1084 progress,
1085 None,
1086 LicenseScanOptions::default(),
1087 &TextDetectionOptions {
1088 collect_info: false,
1089 detect_packages: true,
1090 detect_application_packages: true,
1091 detect_system_packages: false,
1092 detect_packages_in_compiled: false,
1093 detect_copyrights: false,
1094 detect_generated: false,
1095 detect_emails: false,
1096 detect_urls: false,
1097 max_emails: 50,
1098 max_urls: 50,
1099 timeout_seconds: 120.0,
1100 },
1101 );
1102
1103 let scanned = result
1104 .files
1105 .into_iter()
1106 .find(|entry| {
1107 entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1108 })
1109 .expect("scanned file entry");
1110
1111 assert!(
1112 scanned.scan_errors.is_empty(),
1113 "scan_errors: {:#?}",
1114 scanned.scan_errors
1115 );
1116 assert_eq!(
1117 scanned.package_data.len(),
1118 1,
1119 "package_data: {:#?}",
1120 scanned.package_data
1121 );
1122 assert_eq!(
1123 scanned.package_data[0].datasource_id,
1124 Some(DatasourceId::RpmArchive)
1125 );
1126 assert_eq!(
1127 scanned.package_data[0].name.as_deref(),
1128 Some("oversized-demo")
1129 );
1130 assert_eq!(scanned.package_data[0].version.as_deref(), Some("1.0-1"));
1131 }
1132
1133 #[test]
1134 fn scanner_parses_oversized_rpm_with_info_without_timeout_or_size_warning() {
1135 let temp_dir = TempDir::new().expect("create temp dir");
1136 let file_path = build_sparse_oversized_rpm(&temp_dir, "oversized-info-demo");
1137
1138 let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1139 let collected = collect_paths(temp_dir.path(), 0, &[]);
1140 let result = process_collected(
1141 &collected,
1142 progress,
1143 None,
1144 LicenseScanOptions::default(),
1145 &TextDetectionOptions {
1146 collect_info: true,
1147 detect_packages: true,
1148 detect_application_packages: true,
1149 detect_system_packages: false,
1150 detect_packages_in_compiled: false,
1151 detect_copyrights: false,
1152 detect_generated: false,
1153 detect_emails: false,
1154 detect_urls: false,
1155 max_emails: 50,
1156 max_urls: 50,
1157 timeout_seconds: 120.0,
1158 },
1159 );
1160
1161 let scanned = result
1162 .files
1163 .into_iter()
1164 .find(|entry| {
1165 entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1166 })
1167 .expect("scanned file entry");
1168
1169 assert!(
1170 scanned.scan_errors.is_empty(),
1171 "scan_errors: {:#?}",
1172 scanned.scan_errors
1173 );
1174 assert_eq!(
1175 scanned.package_data.len(),
1176 1,
1177 "package_data: {:#?}",
1178 scanned.package_data
1179 );
1180 assert_eq!(
1181 scanned.package_data[0].datasource_id,
1182 Some(DatasourceId::RpmArchive)
1183 );
1184 assert_eq!(
1185 scanned.package_data[0].name.as_deref(),
1186 Some("oversized-info-demo")
1187 );
1188 assert!(scanned.sha1.is_some());
1189 assert!(scanned.md5.is_some());
1190 assert!(scanned.sha256.is_some());
1191 assert!(scanned.sha1_git.is_some());
1192 assert_eq!(scanned.mime_type.as_deref(), Some("application/x-rpm"));
1193 assert_eq!(scanned.file_type_label.as_deref(), Some("RPM package"));
1194 assert_eq!(scanned.is_binary, Some(true));
1195 assert_eq!(scanned.is_text, Some(false));
1196 assert_eq!(scanned.is_archive, Some(true));
1197 }
1198
1199 #[test]
1200 fn scanner_parses_oversized_pack_rpm_in_package_only_mode_without_size_warning() {
1201 let temp_dir = TempDir::new().expect("create temp dir");
1202 let file_path = build_sparse_oversized_pack_rpm(&temp_dir, "oversized-pack-demo");
1203
1204 let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1205 let collected = collect_paths(temp_dir.path(), 0, &[]);
1206 let result = process_collected(
1207 &collected,
1208 progress,
1209 None,
1210 LicenseScanOptions::default(),
1211 &TextDetectionOptions {
1212 collect_info: false,
1213 detect_packages: true,
1214 detect_application_packages: true,
1215 detect_system_packages: false,
1216 detect_packages_in_compiled: false,
1217 detect_copyrights: false,
1218 detect_generated: false,
1219 detect_emails: false,
1220 detect_urls: false,
1221 max_emails: 50,
1222 max_urls: 50,
1223 timeout_seconds: 120.0,
1224 },
1225 );
1226
1227 let scanned = result
1228 .files
1229 .into_iter()
1230 .find(|entry| {
1231 entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1232 })
1233 .expect("scanned file entry");
1234
1235 assert!(
1236 scanned.scan_errors.is_empty(),
1237 "scan_errors: {:#?}",
1238 scanned.scan_errors
1239 );
1240 assert_eq!(
1241 scanned.package_data.len(),
1242 1,
1243 "package_data: {:#?}",
1244 scanned.package_data
1245 );
1246 assert_eq!(
1247 scanned.package_data[0].datasource_id,
1248 Some(DatasourceId::RpmArchive)
1249 );
1250 assert_eq!(
1251 scanned.package_data[0].name.as_deref(),
1252 Some("oversized-pack-demo")
1253 );
1254 }
1255
1256 #[test]
1257 fn scanner_parses_oversized_pack_rpm_with_info_without_timeout_or_size_warning() {
1258 let temp_dir = TempDir::new().expect("create temp dir");
1259 let file_path = build_sparse_oversized_pack_rpm(&temp_dir, "oversized-pack-info-demo");
1260
1261 let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1262 let collected = collect_paths(temp_dir.path(), 0, &[]);
1263 let result = process_collected(
1264 &collected,
1265 progress,
1266 None,
1267 LicenseScanOptions::default(),
1268 &TextDetectionOptions {
1269 collect_info: true,
1270 detect_packages: true,
1271 detect_application_packages: true,
1272 detect_system_packages: false,
1273 detect_packages_in_compiled: false,
1274 detect_copyrights: false,
1275 detect_generated: false,
1276 detect_emails: false,
1277 detect_urls: false,
1278 max_emails: 50,
1279 max_urls: 50,
1280 timeout_seconds: 120.0,
1281 },
1282 );
1283
1284 let scanned = result
1285 .files
1286 .into_iter()
1287 .find(|entry| {
1288 entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1289 })
1290 .expect("scanned file entry");
1291
1292 assert!(
1293 scanned.scan_errors.is_empty(),
1294 "scan_errors: {:#?}",
1295 scanned.scan_errors
1296 );
1297 assert_eq!(
1298 scanned.package_data.len(),
1299 1,
1300 "package_data: {:#?}",
1301 scanned.package_data
1302 );
1303 assert_eq!(
1304 scanned.package_data[0].datasource_id,
1305 Some(DatasourceId::RpmArchive)
1306 );
1307 assert_eq!(
1308 scanned.package_data[0].name.as_deref(),
1309 Some("oversized-pack-info-demo")
1310 );
1311 assert!(scanned.sha1.is_some());
1312 assert!(scanned.md5.is_some());
1313 assert!(scanned.sha256.is_some());
1314 assert!(scanned.sha1_git.is_some());
1315 assert_eq!(scanned.mime_type.as_deref(), Some("application/x-rpm"));
1316 assert_eq!(scanned.file_type_label.as_deref(), Some("RPM package"));
1317 assert_eq!(scanned.is_binary, Some(true));
1318 assert_eq!(scanned.is_text, Some(false));
1319 assert_eq!(scanned.is_archive, Some(true));
1320 }
1321
1322 #[test]
1323 fn scanner_skips_application_packages_when_only_system_packages_enabled() {
1324 let options = TextDetectionOptions {
1325 collect_info: false,
1326 detect_packages: true,
1327 detect_application_packages: false,
1328 detect_system_packages: true,
1329 detect_packages_in_compiled: false,
1330 detect_copyrights: false,
1331 detect_generated: false,
1332 detect_emails: false,
1333 detect_urls: false,
1334 max_emails: 50,
1335 max_urls: 50,
1336 timeout_seconds: 120.0,
1337 };
1338 let scanned = scan_single_file(
1339 "package.json",
1340 r#"{"name":"demo","version":"1.0.0"}"#,
1341 &options,
1342 );
1343
1344 assert!(
1345 scanned.package_data.is_empty(),
1346 "package_data: {:#?}",
1347 scanned.package_data
1348 );
1349 }
1350
1351 #[test]
1352 fn scanner_parses_system_package_files_when_enabled() {
1353 let options = TextDetectionOptions {
1354 collect_info: false,
1355 detect_packages: true,
1356 detect_application_packages: false,
1357 detect_system_packages: true,
1358 detect_packages_in_compiled: false,
1359 detect_copyrights: false,
1360 detect_generated: false,
1361 detect_emails: false,
1362 detect_urls: false,
1363 max_emails: 50,
1364 max_urls: 50,
1365 timeout_seconds: 120.0,
1366 };
1367 let scanned = scan_file_at_relative_path(
1368 "var/lib/dpkg/status",
1369 b"Package: demo\nVersion: 1.0\nArchitecture: all\nDescription: demo package\n\n",
1370 &options,
1371 );
1372
1373 assert!(
1374 !scanned.package_data.is_empty(),
1375 "package_data: {:#?}",
1376 scanned.package_data
1377 );
1378 }
1379
1380 #[test]
1381 fn scanner_only_parses_compiled_packages_when_package_in_compiled_is_enabled() {
1382 if std::process::Command::new("go")
1383 .arg("version")
1384 .status()
1385 .is_err()
1386 {
1387 return;
1388 }
1389
1390 let temp_dir = TempDir::new().expect("create temp dir");
1391 fs::write(
1392 temp_dir.path().join("go.mod"),
1393 "module example.com/demo\n\ngo 1.23.0\n",
1394 )
1395 .expect("write go.mod");
1396 fs::write(
1397 temp_dir.path().join("main.go"),
1398 "package main\nfunc main() {}\n",
1399 )
1400 .expect("write main.go");
1401 let file_path = temp_dir.path().join("demo");
1402 let status = std::process::Command::new("go")
1403 .current_dir(temp_dir.path())
1404 .args(["build", "-o"])
1405 .arg(&file_path)
1406 .status()
1407 .expect("run go build");
1408 assert!(status.success());
1409
1410 let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1411 let collected = collect_paths(temp_dir.path(), 0, &[]);
1412
1413 let without_compiled = process_collected(
1414 &collected,
1415 Arc::clone(&progress),
1416 None,
1417 LicenseScanOptions::default(),
1418 &TextDetectionOptions {
1419 collect_info: false,
1420 detect_packages: true,
1421 detect_application_packages: true,
1422 detect_system_packages: false,
1423 detect_packages_in_compiled: false,
1424 detect_copyrights: false,
1425 detect_generated: false,
1426 detect_emails: false,
1427 detect_urls: false,
1428 max_emails: 50,
1429 max_urls: 50,
1430 timeout_seconds: 120.0,
1431 },
1432 );
1433 let with_compiled = process_collected(
1434 &collected,
1435 progress,
1436 None,
1437 LicenseScanOptions::default(),
1438 &TextDetectionOptions {
1439 collect_info: false,
1440 detect_packages: true,
1441 detect_application_packages: true,
1442 detect_system_packages: false,
1443 detect_packages_in_compiled: true,
1444 detect_copyrights: false,
1445 detect_generated: false,
1446 detect_emails: false,
1447 detect_urls: false,
1448 max_emails: 50,
1449 max_urls: 50,
1450 timeout_seconds: 120.0,
1451 },
1452 );
1453
1454 let without_compiled = without_compiled
1455 .files
1456 .into_iter()
1457 .find(|entry| entry.file_type == FileType::File && entry.path.ends_with("/demo"))
1458 .expect("compiled artifact present");
1459 let with_compiled = with_compiled
1460 .files
1461 .into_iter()
1462 .find(|entry| entry.file_type == FileType::File && entry.path.ends_with("/demo"))
1463 .expect("compiled artifact present");
1464
1465 assert!(
1466 without_compiled.package_data.is_empty(),
1467 "package_data: {:#?}",
1468 without_compiled.package_data
1469 );
1470 assert!(!with_compiled.package_data.is_empty());
1471 }
1472
1473 #[test]
1474 fn scanner_parses_windows_executable_packages_under_normal_package_scan() {
1475 let temp_dir = TempDir::new().expect("create temp dir");
1476 let file_path = temp_dir.path().join("libiconv2.dll");
1477 let fixture = fs::read("testdata/compiled-binary-golden/win_pe/libiconv2.dll")
1478 .expect("read PE fixture");
1479 fs::write(&file_path, fixture).expect("write PE fixture");
1480
1481 let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1482 let collected = collect_paths(temp_dir.path(), 0, &[]);
1483
1484 let without_package = process_collected(
1485 &collected,
1486 Arc::clone(&progress),
1487 None,
1488 LicenseScanOptions::default(),
1489 &TextDetectionOptions {
1490 collect_info: false,
1491 detect_packages: false,
1492 detect_application_packages: false,
1493 detect_system_packages: false,
1494 detect_packages_in_compiled: false,
1495 detect_copyrights: false,
1496 detect_generated: false,
1497 detect_emails: false,
1498 detect_urls: false,
1499 max_emails: 50,
1500 max_urls: 50,
1501 timeout_seconds: 120.0,
1502 },
1503 );
1504 let with_package = process_collected(
1505 &collected,
1506 progress,
1507 None,
1508 LicenseScanOptions::default(),
1509 &TextDetectionOptions {
1510 collect_info: false,
1511 detect_packages: true,
1512 detect_application_packages: true,
1513 detect_system_packages: false,
1514 detect_packages_in_compiled: false,
1515 detect_copyrights: false,
1516 detect_generated: false,
1517 detect_emails: false,
1518 detect_urls: false,
1519 max_emails: 50,
1520 max_urls: 50,
1521 timeout_seconds: 120.0,
1522 },
1523 );
1524
1525 let without_package = without_package
1526 .files
1527 .into_iter()
1528 .find(|entry| {
1529 entry.file_type == FileType::File && entry.path.ends_with("/libiconv2.dll")
1530 })
1531 .expect("compiled artifact present");
1532 let with_package = with_package
1533 .files
1534 .into_iter()
1535 .find(|entry| {
1536 entry.file_type == FileType::File && entry.path.ends_with("/libiconv2.dll")
1537 })
1538 .expect("compiled artifact present");
1539
1540 assert!(without_package.package_data.is_empty());
1541 assert_eq!(with_package.package_data.len(), 1);
1542 assert_eq!(
1543 with_package.package_data[0].package_type,
1544 Some(FilePackageType::Winexe)
1545 );
1546 assert_eq!(
1547 with_package.package_data[0].datasource_id,
1548 Some(DatasourceId::WindowsExecutable)
1549 );
1550 }
1551
1552 #[test]
1553 fn scanner_keeps_nsis_and_windows_executable_package_data_together() {
1554 let temp_dir = TempDir::new().expect("create temp dir");
1555 let file_path = temp_dir.path().join("nsis-with-version.exe");
1556 let mut fixture = fs::read("testdata/compiled-binary-golden/win_pe/libiconv2.dll")
1557 .expect("read PE fixture");
1558 if fixture.len() < 70_000 {
1559 fixture.resize(70_000, 0);
1560 }
1561 fixture.extend_from_slice(b"Nullsoft.NSIS.exehead");
1562 fs::write(&file_path, fixture).expect("write synthetic NSIS PE fixture");
1563
1564 let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1565 let collected = collect_paths(temp_dir.path(), 0, &[]);
1566 let result = process_collected(
1567 &collected,
1568 progress,
1569 None,
1570 LicenseScanOptions::default(),
1571 &TextDetectionOptions {
1572 collect_info: false,
1573 detect_packages: true,
1574 detect_application_packages: true,
1575 detect_system_packages: false,
1576 detect_packages_in_compiled: false,
1577 detect_copyrights: false,
1578 detect_generated: false,
1579 detect_emails: false,
1580 detect_urls: false,
1581 max_emails: 50,
1582 max_urls: 50,
1583 timeout_seconds: 120.0,
1584 },
1585 );
1586
1587 let scanned = result
1588 .files
1589 .into_iter()
1590 .find(|entry| {
1591 entry.file_type == FileType::File && entry.path.ends_with("/nsis-with-version.exe")
1592 })
1593 .expect("compiled artifact present");
1594
1595 assert_eq!(
1596 scanned.package_data.len(),
1597 2,
1598 "package_data: {:#?}",
1599 scanned.package_data
1600 );
1601 assert!(
1602 scanned
1603 .package_data
1604 .iter()
1605 .any(|pkg| pkg.datasource_id == Some(DatasourceId::NsisInstaller))
1606 );
1607 assert!(
1608 scanned
1609 .package_data
1610 .iter()
1611 .any(|pkg| pkg.datasource_id == Some(DatasourceId::WindowsExecutable))
1612 );
1613 }
1614
1615 #[test]
1616 fn scanner_detects_license_from_font_metadata() {
1617 let temp_dir = TempDir::new().expect("create temp dir");
1618 let file_path = temp_dir.path().join("Lato-Bold.ttf");
1619 let fixture = fs::read("testdata/font-fixtures/Lato-Bold.ttf").expect("read font fixture");
1620 fs::write(&file_path, fixture).expect("write font fixture");
1621
1622 let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1623 let collected = collect_paths(temp_dir.path(), 0, &[]);
1624 let engine =
1625 Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
1626 let result = process_collected(
1627 &collected,
1628 progress,
1629 Some(engine),
1630 LicenseScanOptions::default(),
1631 &TextDetectionOptions::default(),
1632 );
1633 let scanned = result
1634 .files
1635 .into_iter()
1636 .find(|entry| {
1637 entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1638 })
1639 .expect("scanned file entry");
1640
1641 assert!(
1642 scanned.license_expression.is_some(),
1643 "license detections: {:#?}",
1644 scanned.license_detections
1645 );
1646 assert!(
1647 scanned
1648 .license_expression
1649 .as_deref()
1650 .is_some_and(
1651 |expression| expression.contains("OFL-1.1") || expression.contains("ofl-1.1")
1652 ),
1653 "license expression: {:?}",
1654 scanned.license_expression
1655 );
1656 }
1657
1658 #[test]
1659 fn scanner_detects_license_from_windows_executable_metadata() {
1660 let temp_dir = TempDir::new().expect("create temp dir");
1661 let file_path = temp_dir.path().join("libiconv2.dll");
1662 let fixture = fs::read("testdata/compiled-binary-golden/win_pe/libiconv2.dll")
1663 .expect("read PE fixture");
1664 fs::write(&file_path, fixture).expect("write PE fixture");
1665
1666 let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1667 let collected = collect_paths(temp_dir.path(), 0, &[]);
1668 let engine =
1669 Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
1670 let result = process_collected(
1671 &collected,
1672 progress,
1673 Some(engine),
1674 LicenseScanOptions::default(),
1675 &TextDetectionOptions::default(),
1676 );
1677 let scanned = result
1678 .files
1679 .into_iter()
1680 .find(|entry| {
1681 entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
1682 })
1683 .expect("scanned file entry");
1684
1685 assert!(
1686 scanned.license_expression.is_some(),
1687 "license detections: {:#?}",
1688 scanned.license_detections
1689 );
1690 assert!(
1691 scanned
1692 .license_expression
1693 .as_deref()
1694 .is_some_and(|expression| {
1695 expression.contains("lgpl") || expression.contains("LGPL")
1696 }),
1697 "license expression: {:?}",
1698 scanned.license_expression
1699 );
1700 }
1701
1702 #[test]
1703 fn scanner_detects_cc_by_license_from_markdown_comment_banner() {
1704 let scanned = scan_single_file_with_license_engine(
1705 "navbar.md",
1706 "<!-- Documentation licensed under CC BY 4.0 -->\n<!-- License available at https://creativecommons.org/licenses/by/4.0/ -->\n",
1707 &TextDetectionOptions::default(),
1708 );
1709
1710 assert!(
1711 scanned
1712 .license_expression
1713 .as_deref()
1714 .is_some_and(|expression| {
1715 expression.contains("cc-by-4.0") || expression.contains("CC-BY-4.0")
1716 }),
1717 "license expression: {:?}",
1718 scanned.license_expression
1719 );
1720 }
1721
1722 #[test]
1723 fn scanner_detects_mit_license_from_shields_badge_markdown() {
1724 let scanned = scan_single_file_with_license_engine(
1725 "README.md",
1726 "[](https://opensource.org/licenses/MIT)\n",
1727 &TextDetectionOptions::default(),
1728 );
1729
1730 assert!(
1731 scanned
1732 .license_expression
1733 .as_deref()
1734 .is_some_and(|expression| {
1735 expression.contains("mit") || expression.contains("MIT")
1736 }),
1737 "license expression: {:?}",
1738 scanned.license_expression
1739 );
1740 }
1741
1742 #[test]
1743 fn scanner_detects_apache_license_from_markdown_readme_phrase() {
1744 let scanned = scan_single_file_with_license_engine(
1745 "README.md",
1746 "This crate is distributed under the terms of the Apache License (Version 2.0).\n",
1747 &TextDetectionOptions::default(),
1748 );
1749
1750 assert!(
1751 scanned
1752 .license_expression
1753 .as_deref()
1754 .is_some_and(|expression| {
1755 expression.contains("apache-2.0") || expression.contains("Apache-2.0")
1756 }),
1757 "license expression: {:?}",
1758 scanned.license_expression
1759 );
1760 }
1761
1762 #[test]
1763 fn scanner_sets_is_source_only_when_info_enabled() {
1764 let without_info = TextDetectionOptions {
1765 collect_info: false,
1766 detect_packages: false,
1767 detect_application_packages: false,
1768 detect_system_packages: false,
1769 detect_packages_in_compiled: false,
1770 detect_copyrights: false,
1771 detect_generated: false,
1772 detect_emails: false,
1773 detect_urls: false,
1774 max_emails: 50,
1775 max_urls: 50,
1776 timeout_seconds: 120.0,
1777 };
1778 let with_info = TextDetectionOptions {
1779 collect_info: true,
1780 ..without_info.clone()
1781 };
1782
1783 let scanned_without_info = scan_single_file("main.rs", "fn main() {}\n", &without_info);
1784 let scanned_with_info = scan_single_file("main.rs", "fn main() {}\n", &with_info);
1785
1786 assert_eq!(scanned_without_info.is_source, None);
1787 assert_eq!(scanned_with_info.is_source, Some(true));
1788 }
1789
1790 #[test]
1791 fn directory_omits_info_fields_when_info_disabled() {
1792 let temp_dir = TempDir::new().expect("create temp dir");
1793 fs::create_dir_all(temp_dir.path().join("nested")).expect("create nested dir");
1794
1795 let collected = collect_paths(temp_dir.path(), 0, &[]);
1796 let result = process_collected(
1797 &collected,
1798 Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1799 None,
1800 LicenseScanOptions::default(),
1801 &TextDetectionOptions {
1802 collect_info: false,
1803 detect_packages: false,
1804 detect_application_packages: false,
1805 detect_system_packages: false,
1806 detect_packages_in_compiled: false,
1807 detect_copyrights: false,
1808 detect_generated: false,
1809 detect_emails: false,
1810 detect_urls: false,
1811 max_emails: 50,
1812 max_urls: 50,
1813 timeout_seconds: 120.0,
1814 },
1815 );
1816
1817 let directory = result
1818 .files
1819 .into_iter()
1820 .find(|entry| entry.file_type == FileType::Directory && entry.path.ends_with("nested"))
1821 .expect("directory entry");
1822
1823 assert!(directory.date.is_none());
1824 assert!(directory.file_type_label.is_none());
1825 assert!(directory.is_binary.is_none());
1826 assert!(directory.is_text.is_none());
1827 assert!(directory.is_archive.is_none());
1828 assert!(directory.is_media.is_none());
1829 assert!(directory.is_source.is_none());
1830 assert!(directory.is_script.is_none());
1831 }
1832
1833 #[test]
1834 fn directory_includes_info_fields_when_info_enabled() {
1835 let temp_dir = TempDir::new().expect("create temp dir");
1836 fs::create_dir_all(temp_dir.path().join("nested")).expect("create nested dir");
1837
1838 let collected = collect_paths(temp_dir.path(), 0, &[]);
1839 let result = process_collected(
1840 &collected,
1841 Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1842 None,
1843 LicenseScanOptions::default(),
1844 &TextDetectionOptions {
1845 collect_info: true,
1846 detect_packages: false,
1847 detect_application_packages: false,
1848 detect_system_packages: false,
1849 detect_packages_in_compiled: false,
1850 detect_copyrights: false,
1851 detect_generated: false,
1852 detect_emails: false,
1853 detect_urls: false,
1854 max_emails: 50,
1855 max_urls: 50,
1856 timeout_seconds: 120.0,
1857 },
1858 );
1859
1860 let directory = result
1861 .files
1862 .into_iter()
1863 .find(|entry| entry.file_type == FileType::Directory && entry.path.ends_with("nested"))
1864 .expect("directory entry");
1865
1866 assert!(directory.date.is_none());
1867 assert!(directory.file_type_label.is_none());
1868 assert_eq!(directory.is_binary, Some(false));
1869 assert_eq!(directory.is_text, Some(false));
1870 assert_eq!(directory.is_archive, Some(false));
1871 assert_eq!(directory.is_media, Some(false));
1872 assert_eq!(directory.is_source, Some(false));
1873 assert_eq!(directory.is_script, Some(false));
1874 assert_eq!(directory.files_count, Some(0));
1875 assert_eq!(directory.dirs_count, Some(0));
1876 assert_eq!(directory.size_count, Some(0));
1877 }
1878
1879 #[test]
1880 fn collect_paths_includes_root_directory_entry() {
1881 let temp_dir = TempDir::new().expect("create temp dir");
1882 fs::create_dir_all(temp_dir.path().join("src")).expect("create nested dir");
1883 fs::write(temp_dir.path().join("src").join("main.rs"), "fn main() {}")
1884 .expect("write nested file");
1885
1886 let collected = collect_paths(temp_dir.path(), 0, &[]);
1887
1888 assert!(
1889 collected
1890 .directories
1891 .iter()
1892 .any(|(path, _)| path == temp_dir.path())
1893 );
1894 }
1895
1896 #[test]
1897 fn collect_paths_supports_single_file_input() {
1898 let temp_dir = TempDir::new().expect("create temp dir");
1899 let file_path = temp_dir.path().join("main.rs");
1900 fs::write(&file_path, "fn main() {}\n").expect("write file");
1901
1902 let collected = collect_paths(&file_path, 0, &[]);
1903
1904 assert_eq!(collected.files.len(), 1);
1905 assert!(collected.directories.is_empty());
1906 assert_eq!(collected.files[0].0, file_path);
1907 }
1908
1909 #[cfg(unix)]
1910 #[test]
1911 fn collect_selected_paths_does_not_walk_unselected_siblings() {
1912 use std::os::unix::fs::PermissionsExt;
1913
1914 let temp_dir = TempDir::new().expect("create temp dir");
1915 let root = temp_dir.path();
1916 fs::create_dir_all(root.join("selected/docs")).expect("create selected dir");
1917 fs::create_dir_all(root.join("blocked/secret")).expect("create blocked dir");
1918 fs::write(root.join("selected/docs/guide.md"), "# guide\n").expect("write guide");
1919
1920 let blocked = root.join("blocked");
1921 let mut perms = fs::metadata(&blocked)
1922 .expect("blocked metadata")
1923 .permissions();
1924 perms.set_mode(0o000);
1925 fs::set_permissions(&blocked, perms).expect("remove blocked permissions");
1926
1927 let collected = collect_selected_paths(
1928 root,
1929 &[CollectionFrontier {
1930 path: PathBuf::from("selected"),
1931 recurse: true,
1932 }],
1933 0,
1934 &[],
1935 );
1936
1937 let mut restore = fs::metadata(&blocked)
1938 .expect("blocked metadata")
1939 .permissions();
1940 restore.set_mode(0o755);
1941 fs::set_permissions(&blocked, restore).expect("restore blocked permissions");
1942
1943 assert!(
1944 collected.collection_errors.is_empty(),
1945 "{:#?}",
1946 collected.collection_errors
1947 );
1948 assert!(
1949 collected
1950 .files
1951 .iter()
1952 .any(|(path, _)| path == &root.join("selected/docs/guide.md"))
1953 );
1954 assert!(
1955 collected
1956 .files
1957 .iter()
1958 .all(|(path, _): &(PathBuf, fs::Metadata)| !path.starts_with(&blocked))
1959 );
1960 }
1961
1962 #[test]
1963 fn collect_selected_paths_respects_excluded_ancestor_directories() {
1964 let temp_dir = TempDir::new().expect("create temp dir");
1965 let root = temp_dir.path();
1966 fs::create_dir_all(root.join(".git")).expect("create git dir");
1967 fs::write(
1968 root.join(".git/config"),
1969 "[core]\nrepositoryformatversion = 0\n",
1970 )
1971 .expect("write git config");
1972
1973 let exclude_patterns =
1974 build_collection_exclude_patterns(root, &root.join(".provenant-cache"));
1975 let collected = collect_selected_paths(
1976 root,
1977 &[CollectionFrontier {
1978 path: PathBuf::from(".git/config"),
1979 recurse: false,
1980 }],
1981 0,
1982 &exclude_patterns,
1983 );
1984
1985 assert!(collected.files.is_empty());
1986 assert!(collected.directories.iter().all(|(path, _)| path == root));
1987 assert_eq!(collected.excluded_count, 1);
1988 }
1989
1990 #[test]
1991 fn process_collected_with_memory_limit_preserves_results_when_spilling() {
1992 let temp_dir = TempDir::new().expect("create temp dir");
1993 fs::write(temp_dir.path().join("a.txt"), "hello").expect("write first file");
1994 fs::write(temp_dir.path().join("b.txt"), "world").expect("write second file");
1995
1996 let collected = collect_paths(temp_dir.path(), 0, &[]);
1997 let result = process_collected_with_memory_limit(
1998 &collected,
1999 Arc::new(ScanProgress::new(ProgressMode::Quiet)),
2000 None,
2001 LicenseScanOptions::default(),
2002 &TextDetectionOptions {
2003 collect_info: false,
2004 detect_packages: false,
2005 detect_application_packages: false,
2006 detect_system_packages: false,
2007 detect_packages_in_compiled: false,
2008 detect_copyrights: false,
2009 detect_generated: false,
2010 detect_emails: false,
2011 detect_urls: false,
2012 max_emails: 50,
2013 max_urls: 50,
2014 timeout_seconds: 120.0,
2015 },
2016 MemoryMode::Limit(1),
2017 );
2018
2019 assert_eq!(result.files.len(), 3);
2020 }
2021
2022 #[test]
2023 fn process_collected_with_negative_one_uses_disk_only_mode() {
2024 let temp_dir = TempDir::new().expect("create temp dir");
2025 fs::write(temp_dir.path().join("a.txt"), "hello").expect("write first file");
2026
2027 let collected = collect_paths(temp_dir.path(), 0, &[]);
2028 let result = process_collected_with_memory_limit(
2029 &collected,
2030 Arc::new(ScanProgress::new(ProgressMode::Quiet)),
2031 None,
2032 LicenseScanOptions::default(),
2033 &TextDetectionOptions {
2034 collect_info: false,
2035 detect_packages: false,
2036 detect_application_packages: false,
2037 detect_system_packages: false,
2038 detect_packages_in_compiled: false,
2039 detect_copyrights: false,
2040 detect_generated: false,
2041 detect_emails: false,
2042 detect_urls: false,
2043 max_emails: 50,
2044 max_urls: 50,
2045 timeout_seconds: 120.0,
2046 },
2047 MemoryMode::StreamUnlimited,
2048 );
2049
2050 assert_eq!(result.files.len(), 2);
2051 }
2052}