1mod collect;
2mod process;
3
4use crate::license_detection::LicenseDetectionEngine;
5use crate::models::FileInfo;
6
/// Result bundle holding scanned file entries together with an exclusion counter.
///
/// NOTE(review): presumably the return value of the `process_collected*` functions
/// re-exported from this module; their signatures are not visible here — confirm.
pub struct ProcessResult {
    /// Scan entries, one [`FileInfo`] per reported path.
    pub files: Vec<FileInfo>,
    /// Count of excluded items; presumably paths skipped by exclusion rules — TODO confirm.
    pub excluded_count: usize,
}
11
/// Options passed alongside the license engine when scanning.
///
/// The derived `Default` yields `false` for every flag and `0` for `min_score`.
/// Every field is serialized into the string built by [`scan_options_fingerprint`].
#[derive(Debug, Clone, Copy, Default)]
pub struct LicenseScanOptions {
    /// Emitted as `license_text` in the fingerprint; presumably includes matched text in
    /// the output — TODO confirm.
    pub include_text: bool,
    /// Emitted as `license_text_diagnostics` in the fingerprint; exact effect not visible here.
    pub include_text_diagnostics: bool,
    /// Emitted as `license_diagnostics` in the fingerprint; exact effect not visible here.
    pub include_diagnostics: bool,
    /// Emitted as `unknown_licenses` in the fingerprint; presumably enables unknown-license
    /// detection — TODO confirm.
    pub unknown_licenses: bool,
    /// Emitted as `license_score` in the fingerprint; presumably a minimum match score
    /// threshold — TODO confirm the scale.
    pub min_score: u8,
}
20
/// Switches and limits controlling which non-license detections the scanner performs.
///
/// The tests in this module exercise each switch: for example `collect_info` controls
/// whether hashes, MIME type, date, language, and classification flags are populated,
/// and `detect_generated` controls whether `is_generated` is set.
#[derive(Debug, Clone)]
pub struct TextDetectionOptions {
    /// Populate the file "info" surface (hashes, MIME type, language, `is_*` flags).
    pub collect_info: bool,
    /// Master switch for package detection.
    pub detect_packages: bool,
    /// Parse application package manifests (e.g. `package.json`) when packages are enabled.
    pub detect_application_packages: bool,
    /// Parse system package files (e.g. a dpkg `status` file) when packages are enabled.
    pub detect_system_packages: bool,
    /// Also look for package data inside compiled artifacts.
    pub detect_packages_in_compiled: bool,
    /// Detect copyrights, holders, and authors.
    pub detect_copyrights: bool,
    /// Set the `is_generated` flag on scanned files.
    pub detect_generated: bool,
    /// Detect email addresses.
    pub detect_emails: bool,
    /// Detect URLs.
    pub detect_urls: bool,
    /// Presumably the maximum number of emails reported per file — TODO confirm.
    pub max_emails: usize,
    /// Presumably the maximum number of URLs reported per file — TODO confirm.
    pub max_urls: usize,
    /// Presumably the per-file scan timeout in seconds — TODO confirm.
    pub timeout_seconds: f64,
}

impl Default for TextDetectionOptions {
    /// Returns the baseline configuration: only copyright detection is switched on,
    /// email and URL limits are 50 each, and the timeout is 120 seconds.
    fn default() -> Self {
        const FINDING_LIMIT: usize = 50;
        const TIMEOUT_SECONDS: f64 = 120.0;

        Self {
            // The single detection that is enabled by default.
            detect_copyrights: true,

            // Everything else is opt-in.
            collect_info: false,
            detect_packages: false,
            detect_application_packages: false,
            detect_system_packages: false,
            detect_packages_in_compiled: false,
            detect_generated: false,
            detect_emails: false,
            detect_urls: false,

            max_emails: FINDING_LIMIT,
            max_urls: FINDING_LIMIT,
            timeout_seconds: TIMEOUT_SECONDS,
        }
    }
}
55
56pub fn scan_options_fingerprint(
57 text_options: &TextDetectionOptions,
58 license_options: LicenseScanOptions,
59 license_engine: Option<&LicenseDetectionEngine>,
60) -> String {
61 let (license_enabled, rules_count, first_rule_id, last_rule_id) = match license_engine {
62 Some(engine) => {
63 let rules = &engine.index().rules_by_rid;
64 (
65 true,
66 rules.len(),
67 rules
68 .first()
69 .map(|rule| rule.identifier.as_str())
70 .unwrap_or(""),
71 rules
72 .last()
73 .map(|rule| rule.identifier.as_str())
74 .unwrap_or(""),
75 )
76 }
77 None => (false, 0, "", ""),
78 };
79
80 format!(
81 "tool_version={};info={};packages={};app_packages={};system_packages={};compiled_packages={};copyrights={};generated={};emails={};urls={};max_emails={};max_urls={};timeout={:.6};license_enabled={};rules_count={};first_rule_id={};last_rule_id={};license_text={};license_text_diagnostics={};license_diagnostics={};unknown_licenses={};license_score={}",
82 env!("CARGO_PKG_VERSION"),
83 text_options.collect_info,
84 text_options.detect_packages,
85 text_options.detect_application_packages,
86 text_options.detect_system_packages,
87 text_options.detect_packages_in_compiled,
88 text_options.detect_copyrights,
89 text_options.detect_generated,
90 text_options.detect_emails,
91 text_options.detect_urls,
92 text_options.max_emails,
93 text_options.max_urls,
94 text_options.timeout_seconds,
95 license_enabled,
96 rules_count,
97 first_rule_id,
98 last_rule_id,
99 license_options.include_text,
100 license_options.include_text_diagnostics,
101 license_options.include_diagnostics,
102 license_options.unknown_licenses,
103 license_options.min_score,
104 )
105}
106
107pub use self::collect::{CollectedPaths, collect_paths};
108#[allow(unused_imports)]
109pub use self::process::{process_collected, process_collected_with_memory_limit};
110
111#[cfg(test)]
112mod tests {
113 use std::fs;
114 use std::sync::Arc;
115
116 use tempfile::TempDir;
117
118 use crate::license_detection::LicenseDetectionEngine;
119 use crate::models::FileType;
120 use crate::progress::{ProgressMode, ScanProgress};
121
122 use super::{
123 LicenseScanOptions, TextDetectionOptions, collect_paths, process_collected,
124 process_collected_with_memory_limit,
125 };
126
127 #[test]
128 fn default_options_keep_copyright_detection_enabled() {
129 let options = TextDetectionOptions::default();
130 assert!(!options.detect_packages);
131 assert!(options.detect_copyrights);
132 }
133
134 fn scan_single_file(
135 file_name: &str,
136 content: &str,
137 options: &TextDetectionOptions,
138 ) -> crate::models::FileInfo {
139 let temp_dir = TempDir::new().expect("create temp dir");
140 let file_path = temp_dir.path().join(file_name);
141 fs::write(&file_path, content).expect("write test file");
142
143 let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
144 let collected = collect_paths(temp_dir.path(), 0, &[]);
145 let result = process_collected(
146 &collected,
147 progress,
148 None,
149 LicenseScanOptions::default(),
150 options,
151 );
152
153 result
154 .files
155 .into_iter()
156 .find(|entry| {
157 entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
158 })
159 .expect("scanned file entry")
160 }
161
162 fn scan_file_at_relative_path(
163 relative_path: &str,
164 content: &[u8],
165 options: &TextDetectionOptions,
166 ) -> crate::models::FileInfo {
167 let temp_dir = TempDir::new().expect("create temp dir");
168 let file_path = temp_dir.path().join(relative_path);
169 if let Some(parent) = file_path.parent() {
170 fs::create_dir_all(parent).expect("create parent dirs");
171 }
172 fs::write(&file_path, content).expect("write test file");
173
174 let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
175 let collected = collect_paths(temp_dir.path(), 0, &[]);
176 let result = process_collected(
177 &collected,
178 progress,
179 None,
180 LicenseScanOptions::default(),
181 options,
182 );
183
184 result
185 .files
186 .into_iter()
187 .find(|entry| {
188 entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
189 })
190 .expect("scanned file entry")
191 }
192
193 #[test]
194 fn scanner_reports_repeated_email_occurrences() {
195 let options = TextDetectionOptions {
196 collect_info: false,
197 detect_packages: false,
198 detect_application_packages: false,
199 detect_system_packages: false,
200 detect_packages_in_compiled: false,
201 detect_copyrights: false,
202 detect_generated: false,
203 detect_emails: true,
204 detect_urls: false,
205 max_emails: 50,
206 max_urls: 50,
207 timeout_seconds: 120.0,
208 };
209 let scanned = scan_single_file(
210 "contacts.txt",
211 "linux@3ware.com\nlinux@3ware.com\nandre@suse.com\nlinux@3ware.com\n",
212 &options,
213 );
214
215 let emails: Vec<(&str, usize)> = scanned
216 .emails
217 .iter()
218 .map(|email| (email.email.as_str(), email.start_line))
219 .collect();
220
221 assert_eq!(emails.len(), 4, "emails: {emails:#?}");
222 assert_eq!(
223 emails,
224 vec![
225 ("linux@3ware.com", 1),
226 ("linux@3ware.com", 2),
227 ("andre@suse.com", 3),
228 ("linux@3ware.com", 4),
229 ]
230 );
231 }
232
233 #[test]
234 fn scanner_skips_pem_certificate_text_detection() {
235 let options = TextDetectionOptions {
236 collect_info: false,
237 detect_packages: false,
238 detect_application_packages: false,
239 detect_system_packages: false,
240 detect_packages_in_compiled: false,
241 detect_copyrights: true,
242 detect_generated: false,
243 detect_emails: true,
244 detect_urls: true,
245 max_emails: 50,
246 max_urls: 50,
247 timeout_seconds: 120.0,
248 };
249 let pem_fixture = concat!(
250 "-----BEGIN CERTIFICATE-----\n",
251 "MIID8TCCAtmgAwIBAgIQQT1yx/RrH4FDffHSKFTfmjANBgkqhkiG9w0BAQUFADCB\n",
252 "ijELMAkGA1UEBhMCQ0gxEDAOBgNVBAoTB1dJU2VLZXkxGzAZBgNVBAsTEkNvcHly\n",
253 "-----END CERTIFICATE-----\n",
254 "Certificate:\n",
255 " Data:\n",
256 " Signature Algorithm: sha1WithRSAEncryption\n",
257 " Issuer: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
258 " Subject: C=CH, O=WISeKey, OU=Copyright (c) 2005, OU=OISTE Foundation Endorsed\n",
259 " Contact: cert-owner@example.com\n",
260 );
261 let scanned = scan_single_file("cert.pem", pem_fixture, &options);
262
263 assert!(
264 scanned.copyrights.is_empty(),
265 "copyrights: {:#?}",
266 scanned.copyrights
267 );
268 assert!(
269 scanned.holders.is_empty(),
270 "holders: {:#?}",
271 scanned.holders
272 );
273 assert!(
274 scanned.authors.is_empty(),
275 "authors: {:#?}",
276 scanned.authors
277 );
278 assert!(scanned.emails.is_empty(), "emails: {:#?}", scanned.emails);
279 assert!(scanned.urls.is_empty(), "urls: {:#?}", scanned.urls);
280 assert!(
281 scanned.license_detections.is_empty(),
282 "licenses: {:#?}",
283 scanned.license_detections
284 );
285 assert!(
286 scanned.license_clues.is_empty(),
287 "license clues: {:#?}",
288 scanned.license_clues
289 );
290 }
291
292 #[test]
293 fn scanner_keeps_source_headers_when_pem_blocks_are_embedded() {
294 let options = TextDetectionOptions {
295 collect_info: false,
296 detect_packages: false,
297 detect_application_packages: false,
298 detect_system_packages: false,
299 detect_packages_in_compiled: false,
300 detect_copyrights: true,
301 detect_generated: false,
302 detect_emails: false,
303 detect_urls: true,
304 max_emails: 50,
305 max_urls: 50,
306 timeout_seconds: 120.0,
307 };
308 let fixture = concat!(
309 "/*\n",
310 "Copyright 2022 The Kubernetes Authors.\n\n",
311 "Licensed under the Apache License, Version 2.0 (the \"License\");\n",
312 "you may not use this file except in compliance with the License.\n",
313 "You may obtain a copy of the License at\n\n",
314 " http://www.apache.org/licenses/LICENSE-2.0\n",
315 "*/\n\n",
316 "package storage\n\n",
317 "const validCert = `\n",
318 "-----BEGIN CERTIFICATE-----\n",
319 "MIIDmTCCAoGgAwIBAgIUWQ==\n",
320 "-----END CERTIFICATE-----\n",
321 "`\n",
322 );
323 let temp_dir = TempDir::new().expect("create temp dir");
324 let file_path = temp_dir.path().join("storage_test.go");
325 fs::write(&file_path, fixture).expect("write fixture");
326
327 let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
328 let collected = collect_paths(temp_dir.path(), 0, &[]);
329 let engine =
330 Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
331 let result = process_collected(
332 &collected,
333 progress,
334 Some(engine),
335 LicenseScanOptions::default(),
336 &options,
337 );
338 let scanned = result
339 .files
340 .into_iter()
341 .find(|entry| {
342 entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
343 })
344 .expect("scanned file entry");
345
346 assert!(
347 scanned
348 .copyrights
349 .iter()
350 .any(|c| c.copyright == "Copyright 2022 The Kubernetes Authors"),
351 "copyrights: {:#?}",
352 scanned.copyrights
353 );
354 assert!(
355 scanned
356 .holders
357 .iter()
358 .any(|h| h.holder == "The Kubernetes Authors"),
359 "holders: {:#?}",
360 scanned.holders
361 );
362 assert!(
363 scanned
364 .urls
365 .iter()
366 .any(|u| u.url == "http://www.apache.org/licenses/LICENSE-2.0"),
367 "urls: {:#?}",
368 scanned.urls
369 );
370 assert_eq!(scanned.license_expression.as_deref(), Some("Apache-2.0"));
371 }
372
373 #[test]
374 fn scanner_detects_structured_credits_authors() {
375 let options = TextDetectionOptions {
376 collect_info: false,
377 detect_packages: false,
378 detect_application_packages: false,
379 detect_system_packages: false,
380 detect_packages_in_compiled: false,
381 detect_copyrights: true,
382 detect_generated: false,
383 detect_emails: false,
384 detect_urls: false,
385 max_emails: 50,
386 max_urls: 50,
387 timeout_seconds: 120.0,
388 };
389 let credits_fixture = concat!(
390 "N: Jack Lloyd\n",
391 "E: lloyd@randombit.net\n",
392 "W: http://www.randombit.net/\n",
393 );
394 let scanned = scan_single_file("CREDITS", credits_fixture, &options);
395
396 let authors: Vec<(&str, usize, usize)> = scanned
397 .authors
398 .iter()
399 .map(|author| (author.author.as_str(), author.start_line, author.end_line))
400 .collect();
401
402 assert_eq!(
403 authors,
404 vec![(
405 "Jack Lloyd lloyd@randombit.net http://www.randombit.net/",
406 1,
407 3,
408 )]
409 );
410 assert!(scanned.copyrights.is_empty());
411 assert!(scanned.holders.is_empty());
412 }
413
414 #[test]
415 fn scanner_uses_or_for_alternative_license_header() {
416 let fixture =
417 include_str!("../../testdata/license-golden/datadriven/external/boost-json-d2s.ipp");
418 let temp_dir = TempDir::new().expect("create temp dir");
419 let file_path = temp_dir.path().join("d2s.ipp");
420 fs::write(&file_path, fixture).expect("write fixture");
421
422 let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
423 let collected = collect_paths(temp_dir.path(), 0, &[]);
424 let engine =
425 Arc::new(LicenseDetectionEngine::from_embedded().expect("initialize license engine"));
426 let result = process_collected(
427 &collected,
428 progress,
429 Some(engine),
430 LicenseScanOptions::default(),
431 &TextDetectionOptions::default(),
432 );
433 let scanned = result
434 .files
435 .into_iter()
436 .find(|entry| {
437 entry.file_type == FileType::File && entry.path == file_path.to_string_lossy()
438 })
439 .expect("scanned file entry");
440
441 assert_eq!(
442 scanned.license_expression.as_deref(),
443 Some("Apache-2.0 OR BSL-1.0")
444 );
445 assert!(
446 scanned.license_clues.is_empty(),
447 "license clues: {:#?}",
448 scanned.license_clues
449 );
450 assert_eq!(
451 scanned.license_detections.len(),
452 1,
453 "detections: {:#?}",
454 scanned.license_detections
455 );
456
457 let detection = &scanned.license_detections[0];
458 assert_eq!(detection.license_expression_spdx, "Apache-2.0 OR BSL-1.0");
459
460 let match_expressions: Vec<_> = detection
461 .matches
462 .iter()
463 .map(|m| m.license_expression_spdx.as_str())
464 .collect();
465 assert_eq!(match_expressions, vec!["Apache-2.0", "BSL-1.0"]);
466 }
467
468 #[test]
469 fn scanner_sets_generated_flag_when_enabled() {
470 let options = TextDetectionOptions {
471 collect_info: false,
472 detect_packages: false,
473 detect_application_packages: false,
474 detect_system_packages: false,
475 detect_packages_in_compiled: false,
476 detect_copyrights: false,
477 detect_generated: true,
478 detect_emails: false,
479 detect_urls: false,
480 max_emails: 50,
481 max_urls: 50,
482 timeout_seconds: 120.0,
483 };
484 let scanned = scan_single_file(
485 "generated.c",
486 "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
487 &options,
488 );
489
490 assert_eq!(scanned.is_generated, Some(true));
491 }
492
493 #[test]
494 fn scanner_leaves_generated_flag_unset_when_disabled() {
495 let options = TextDetectionOptions {
496 collect_info: false,
497 detect_packages: false,
498 detect_application_packages: false,
499 detect_system_packages: false,
500 detect_packages_in_compiled: false,
501 detect_copyrights: false,
502 detect_generated: false,
503 detect_emails: false,
504 detect_urls: false,
505 max_emails: 50,
506 max_urls: 50,
507 timeout_seconds: 120.0,
508 };
509 let scanned = scan_single_file(
510 "generated.c",
511 "/* DO NOT EDIT THIS FILE - it is machine generated */\n",
512 &options,
513 );
514
515 assert_eq!(scanned.is_generated, None);
516 }
517
518 #[test]
519 fn scanner_populates_info_surface_when_enabled() {
520 let options = TextDetectionOptions {
521 collect_info: true,
522 detect_packages: false,
523 detect_application_packages: false,
524 detect_system_packages: false,
525 detect_packages_in_compiled: false,
526 detect_copyrights: false,
527 detect_generated: false,
528 detect_emails: false,
529 detect_urls: false,
530 max_emails: 50,
531 max_urls: 50,
532 timeout_seconds: 120.0,
533 };
534 let scanned = scan_single_file(
535 "script.py",
536 "#!/usr/bin/env python3\nprint(\"hello\")\n",
537 &options,
538 );
539
540 assert!(scanned.sha1.is_some());
541 assert!(scanned.md5.is_some());
542 assert!(scanned.sha256.is_some());
543 assert!(scanned.sha1_git.is_some());
544 assert!(scanned.mime_type.is_some());
545 assert!(scanned.date.is_some());
546 assert_eq!(scanned.programming_language.as_deref(), Some("Python"));
547 assert_eq!(scanned.is_text, Some(true));
548 assert_eq!(scanned.is_script, Some(true));
549 assert_eq!(scanned.is_source, Some(true));
550 }
551
552 #[test]
553 fn scanner_treats_latin1_python_sources_as_textual_scripts() {
554 let options = TextDetectionOptions {
555 collect_info: true,
556 detect_packages: false,
557 detect_application_packages: false,
558 detect_system_packages: false,
559 detect_packages_in_compiled: false,
560 detect_copyrights: false,
561 detect_generated: false,
562 detect_emails: false,
563 detect_urls: false,
564 max_emails: 50,
565 max_urls: 50,
566 timeout_seconds: 120.0,
567 };
568 let latin1_python = b"# coding: latin-1\nprint(\"caf\xe9\")\n# comment padding\n";
569 let scanned = scan_file_at_relative_path("script.py", latin1_python, &options);
570
571 assert_eq!(scanned.programming_language.as_deref(), Some("Python"));
572 assert_eq!(
573 scanned.file_type_label.as_deref(),
574 Some("python script, text executable")
575 );
576 assert_eq!(scanned.is_binary, Some(false));
577 assert_eq!(scanned.is_text, Some(true));
578 assert_eq!(scanned.is_script, Some(true));
579 assert_eq!(scanned.is_source, Some(true));
580 }
581
582 #[test]
583 fn scanner_skips_findings_for_zip_like_archives() {
584 let options = TextDetectionOptions {
585 collect_info: true,
586 detect_packages: false,
587 detect_application_packages: false,
588 detect_system_packages: false,
589 detect_packages_in_compiled: false,
590 detect_copyrights: true,
591 detect_generated: false,
592 detect_emails: true,
593 detect_urls: true,
594 max_emails: 50,
595 max_urls: 50,
596 timeout_seconds: 120.0,
597 };
598 let archive_like = b"PK\x03\x04\x14\x00\x00\x00\x08\x00MIT License\ncontact@example.com\nhttps://example.com\n";
599 let scanned = scan_file_at_relative_path("demo.whl", archive_like, &options);
600
601 assert_eq!(scanned.mime_type.as_deref(), Some("application/zip"));
602 assert_eq!(scanned.is_archive, Some(true));
603 assert!(scanned.license_detections.is_empty());
604 assert!(scanned.copyrights.is_empty());
605 assert!(scanned.emails.is_empty());
606 assert!(scanned.urls.is_empty());
607 }
608
609 #[test]
610 fn scanner_treats_typescript_sources_as_text_not_video_media() {
611 let options = TextDetectionOptions {
612 collect_info: true,
613 detect_packages: false,
614 detect_application_packages: false,
615 detect_system_packages: false,
616 detect_packages_in_compiled: false,
617 detect_copyrights: false,
618 detect_generated: false,
619 detect_emails: false,
620 detect_urls: false,
621 max_emails: 50,
622 max_urls: 50,
623 timeout_seconds: 120.0,
624 };
625 let scanned = scan_single_file("main.ts", "export const answer: number = 42;\n", &options);
626
627 assert_eq!(scanned.programming_language.as_deref(), Some("TypeScript"));
628 assert_eq!(scanned.mime_type.as_deref(), Some("text/plain"));
629 assert_eq!(
630 scanned.file_type_label.as_deref(),
631 Some("UTF-8 Unicode text")
632 );
633 assert_eq!(scanned.is_text, Some(true));
634 assert_eq!(scanned.is_media, Some(false));
635 assert_eq!(scanned.is_script, Some(false));
636 assert_eq!(scanned.is_source, Some(true));
637 }
638
639 #[test]
640 fn scanner_normalizes_sparse_ts_files_away_from_video_mime() {
641 let options = TextDetectionOptions {
642 collect_info: true,
643 detect_packages: false,
644 detect_application_packages: false,
645 detect_system_packages: false,
646 detect_packages_in_compiled: false,
647 detect_copyrights: false,
648 detect_generated: false,
649 detect_emails: false,
650 detect_urls: false,
651 max_emails: 50,
652 max_urls: 50,
653 timeout_seconds: 120.0,
654 };
655 let scanned = scan_single_file("main.ts", "// comment-only TypeScript fixture\n", &options);
656
657 assert_eq!(scanned.mime_type.as_deref(), Some("text/plain"));
658 assert_eq!(
659 scanned.file_type_label.as_deref(),
660 Some("UTF-8 Unicode text")
661 );
662 assert_eq!(scanned.is_text, Some(true));
663 assert_eq!(scanned.is_media, Some(false));
664 assert_eq!(scanned.is_script, Some(false));
665 assert_eq!(scanned.is_source, Some(true));
666 }
667
668 #[test]
669 fn scanner_treats_empty_files_like_scancode_info_surface() {
670 let options = TextDetectionOptions {
671 collect_info: true,
672 detect_packages: false,
673 detect_application_packages: false,
674 detect_system_packages: false,
675 detect_packages_in_compiled: false,
676 detect_copyrights: false,
677 detect_generated: false,
678 detect_emails: false,
679 detect_urls: false,
680 max_emails: 50,
681 max_urls: 50,
682 timeout_seconds: 120.0,
683 };
684 let scanned = scan_single_file("test.txt", "", &options);
685
686 assert_eq!(scanned.mime_type.as_deref(), Some("inode/x-empty"));
687 assert_eq!(scanned.file_type_label.as_deref(), Some("empty"));
688 assert_eq!(scanned.programming_language, None);
689 assert_eq!(scanned.is_binary, Some(false));
690 assert_eq!(scanned.is_text, Some(true));
691 assert_eq!(scanned.is_archive, Some(false));
692 assert_eq!(scanned.is_media, Some(false));
693 assert_eq!(scanned.is_source, Some(false));
694 assert_eq!(scanned.is_script, Some(false));
695 }
696
697 #[test]
698 fn scanner_treats_package_json_as_text_not_source() {
699 let options = TextDetectionOptions {
700 collect_info: true,
701 detect_packages: false,
702 detect_application_packages: false,
703 detect_system_packages: false,
704 detect_packages_in_compiled: false,
705 detect_copyrights: false,
706 detect_generated: false,
707 detect_emails: false,
708 detect_urls: false,
709 max_emails: 50,
710 max_urls: 50,
711 timeout_seconds: 120.0,
712 };
713 let scanned = scan_single_file("package.json", r#"{"name":"demo"}"#, &options);
714
715 assert_eq!(scanned.mime_type.as_deref(), Some("application/json"));
716 assert_eq!(scanned.file_type_label.as_deref(), Some("JSON text data"));
717 assert_eq!(scanned.programming_language, None);
718 assert_eq!(scanned.is_text, Some(true));
719 assert_eq!(scanned.is_source, Some(false));
720 assert_eq!(scanned.is_script, Some(false));
721 }
722
723 #[test]
724 fn scanner_classifies_gradle_and_nix_manifests_as_source() {
725 let options = TextDetectionOptions {
726 collect_info: true,
727 detect_packages: false,
728 detect_application_packages: false,
729 detect_system_packages: false,
730 detect_packages_in_compiled: false,
731 detect_copyrights: false,
732 detect_generated: false,
733 detect_emails: false,
734 detect_urls: false,
735 max_emails: 50,
736 max_urls: 50,
737 timeout_seconds: 120.0,
738 };
739
740 let gradle = scan_single_file("build.gradle", "plugins { id 'java' }\n", &options);
741 let nix = scan_single_file("flake.nix", "{ inputs, ... }: {}\n", &options);
742
743 assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
744 assert_eq!(gradle.mime_type.as_deref(), Some("text/plain"));
745 assert_eq!(gradle.is_source, Some(true));
746 assert_eq!(gradle.is_script, Some(false));
747
748 assert_eq!(nix.programming_language.as_deref(), Some("Nix"));
749 assert_eq!(nix.mime_type.as_deref(), Some("text/plain"));
750 assert_eq!(nix.is_source, Some(true));
751 assert_eq!(nix.is_script, Some(false));
752 }
753
754 #[test]
755 fn scanner_treats_gitmodules_as_text_not_source() {
756 let options = TextDetectionOptions {
757 collect_info: true,
758 detect_packages: false,
759 detect_application_packages: false,
760 detect_system_packages: false,
761 detect_packages_in_compiled: false,
762 detect_copyrights: false,
763 detect_generated: false,
764 detect_emails: false,
765 detect_urls: false,
766 max_emails: 50,
767 max_urls: 50,
768 timeout_seconds: 120.0,
769 };
770 let scanned = scan_file_at_relative_path(
771 ".gitmodules",
772 b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
773 &options,
774 );
775
776 assert_eq!(scanned.programming_language, None);
777 assert_eq!(
778 scanned.file_type_label.as_deref(),
779 Some("Git configuration text")
780 );
781 assert_eq!(scanned.is_text, Some(true));
782 assert_eq!(scanned.is_source, Some(false));
783 assert_eq!(scanned.is_script, Some(false));
784 }
785
786 #[test]
787 fn scanner_treats_javascript_shebang_files_as_scripts() {
788 let options = TextDetectionOptions {
789 collect_info: true,
790 detect_packages: false,
791 detect_application_packages: false,
792 detect_system_packages: false,
793 detect_packages_in_compiled: false,
794 detect_copyrights: false,
795 detect_generated: false,
796 detect_emails: false,
797 detect_urls: false,
798 max_emails: 50,
799 max_urls: 50,
800 timeout_seconds: 120.0,
801 };
802 let scanned = scan_file_at_relative_path(
803 "bin/run",
804 b"#!/usr/bin/env node\nconsole.log('hello');\n",
805 &options,
806 );
807
808 assert_eq!(scanned.programming_language.as_deref(), Some("JavaScript"));
809 assert_eq!(
810 scanned.file_type_label.as_deref(),
811 Some("javascript script, UTF-8 Unicode text executable")
812 );
813 assert_eq!(scanned.is_script, Some(true));
814 assert_eq!(scanned.is_source, Some(true));
815 }
816
817 #[test]
818 fn scanner_treats_dockerfile_as_source() {
819 let options = TextDetectionOptions {
820 collect_info: true,
821 detect_packages: false,
822 detect_application_packages: false,
823 detect_system_packages: false,
824 detect_packages_in_compiled: false,
825 detect_copyrights: false,
826 detect_generated: false,
827 detect_emails: false,
828 detect_urls: false,
829 max_emails: 50,
830 max_urls: 50,
831 timeout_seconds: 120.0,
832 };
833 let scanned = scan_single_file("Dockerfile", "FROM scratch\n", &options);
834
835 assert_eq!(scanned.programming_language.as_deref(), Some("Dockerfile"));
836 assert_eq!(
837 scanned.file_type_label.as_deref(),
838 Some("UTF-8 Unicode text")
839 );
840 assert_eq!(scanned.is_source, Some(true));
841 assert_eq!(scanned.is_script, Some(false));
842 }
843
844 #[test]
845 fn scanner_treats_makefile_as_text_not_source() {
846 let options = TextDetectionOptions {
847 collect_info: true,
848 detect_packages: false,
849 detect_application_packages: false,
850 detect_system_packages: false,
851 detect_packages_in_compiled: false,
852 detect_copyrights: false,
853 detect_generated: false,
854 detect_emails: false,
855 detect_urls: false,
856 max_emails: 50,
857 max_urls: 50,
858 timeout_seconds: 120.0,
859 };
860 let scanned = scan_single_file("Makefile", "all:\n\techo hi\n", &options);
861
862 assert_eq!(scanned.programming_language, None);
863 assert_eq!(
864 scanned.file_type_label.as_deref(),
865 Some("UTF-8 Unicode text")
866 );
867 assert_eq!(scanned.is_text, Some(true));
868 assert_eq!(scanned.is_source, Some(false));
869 assert_eq!(scanned.is_script, Some(false));
870 }
871
872 #[test]
873 fn scanner_omits_info_surface_when_disabled() {
874 let options = TextDetectionOptions {
875 collect_info: false,
876 detect_packages: false,
877 detect_application_packages: false,
878 detect_system_packages: false,
879 detect_packages_in_compiled: false,
880 detect_copyrights: false,
881 detect_generated: false,
882 detect_emails: false,
883 detect_urls: false,
884 max_emails: 50,
885 max_urls: 50,
886 timeout_seconds: 120.0,
887 };
888 let scanned = scan_single_file(
889 "script.py",
890 "#!/usr/bin/env python3\nprint(\"hello\")\n",
891 &options,
892 );
893
894 assert!(scanned.sha1.is_none());
895 assert!(scanned.md5.is_none());
896 assert!(scanned.sha256.is_none());
897 assert!(scanned.sha1_git.is_none());
898 assert!(scanned.mime_type.is_none());
899 assert!(scanned.date.is_none());
900 assert!(scanned.programming_language.is_none());
901 assert!(scanned.is_binary.is_none());
902 assert!(scanned.is_text.is_none());
903 assert!(scanned.is_archive.is_none());
904 assert!(scanned.is_media.is_none());
905 assert!(scanned.is_script.is_none());
906 assert!(scanned.is_source.is_none());
907 }
908
909 #[test]
910 fn scanner_skips_package_parsing_when_disabled() {
911 let options = TextDetectionOptions {
912 collect_info: false,
913 detect_packages: false,
914 detect_application_packages: false,
915 detect_system_packages: false,
916 detect_packages_in_compiled: false,
917 detect_copyrights: false,
918 detect_generated: false,
919 detect_emails: false,
920 detect_urls: false,
921 max_emails: 50,
922 max_urls: 50,
923 timeout_seconds: 120.0,
924 };
925 let scanned = scan_single_file(
926 "package.json",
927 r#"{"name":"demo","version":"1.0.0"}"#,
928 &options,
929 );
930
931 assert!(
932 scanned.package_data.is_empty(),
933 "package_data: {:#?}",
934 scanned.package_data
935 );
936 }
937
938 #[test]
939 fn scanner_parses_package_manifests_when_enabled() {
940 let options = TextDetectionOptions {
941 collect_info: false,
942 detect_packages: true,
943 detect_application_packages: true,
944 detect_system_packages: false,
945 detect_packages_in_compiled: false,
946 detect_copyrights: false,
947 detect_generated: false,
948 detect_emails: false,
949 detect_urls: false,
950 max_emails: 50,
951 max_urls: 50,
952 timeout_seconds: 120.0,
953 };
954 let scanned = scan_single_file(
955 "package.json",
956 r#"{"name":"demo","version":"1.0.0"}"#,
957 &options,
958 );
959
960 assert_eq!(
961 scanned.package_data.len(),
962 1,
963 "package_data: {:#?}",
964 scanned.package_data
965 );
966 }
967
968 #[test]
969 fn scanner_skips_application_packages_when_only_system_packages_enabled() {
970 let options = TextDetectionOptions {
971 collect_info: false,
972 detect_packages: true,
973 detect_application_packages: false,
974 detect_system_packages: true,
975 detect_packages_in_compiled: false,
976 detect_copyrights: false,
977 detect_generated: false,
978 detect_emails: false,
979 detect_urls: false,
980 max_emails: 50,
981 max_urls: 50,
982 timeout_seconds: 120.0,
983 };
984 let scanned = scan_single_file(
985 "package.json",
986 r#"{"name":"demo","version":"1.0.0"}"#,
987 &options,
988 );
989
990 assert!(
991 scanned.package_data.is_empty(),
992 "package_data: {:#?}",
993 scanned.package_data
994 );
995 }
996
997 #[test]
998 fn scanner_parses_system_package_files_when_enabled() {
999 let options = TextDetectionOptions {
1000 collect_info: false,
1001 detect_packages: true,
1002 detect_application_packages: false,
1003 detect_system_packages: true,
1004 detect_packages_in_compiled: false,
1005 detect_copyrights: false,
1006 detect_generated: false,
1007 detect_emails: false,
1008 detect_urls: false,
1009 max_emails: 50,
1010 max_urls: 50,
1011 timeout_seconds: 120.0,
1012 };
1013 let scanned = scan_file_at_relative_path(
1014 "var/lib/dpkg/status",
1015 b"Package: demo\nVersion: 1.0\nArchitecture: all\nDescription: demo package\n\n",
1016 &options,
1017 );
1018
1019 assert!(
1020 !scanned.package_data.is_empty(),
1021 "package_data: {:#?}",
1022 scanned.package_data
1023 );
1024 }
1025
1026 #[test]
1027 fn scanner_only_parses_compiled_packages_when_package_in_compiled_is_enabled() {
1028 let temp_dir = TempDir::new().expect("create temp dir");
1029 fs::write(
1030 temp_dir.path().join("go.mod"),
1031 "module example.com/demo\n\ngo 1.23.0\n",
1032 )
1033 .expect("write go.mod");
1034 fs::write(
1035 temp_dir.path().join("main.go"),
1036 "package main\nfunc main() {}\n",
1037 )
1038 .expect("write main.go");
1039 let file_path = temp_dir.path().join("demo");
1040 let status = std::process::Command::new("go")
1041 .current_dir(temp_dir.path())
1042 .args(["build", "-o"])
1043 .arg(&file_path)
1044 .status()
1045 .expect("run go build");
1046 assert!(status.success());
1047
1048 let progress = Arc::new(ScanProgress::new(ProgressMode::Quiet));
1049 let collected = collect_paths(temp_dir.path(), 0, &[]);
1050
1051 let without_compiled = process_collected(
1052 &collected,
1053 Arc::clone(&progress),
1054 None,
1055 LicenseScanOptions::default(),
1056 &TextDetectionOptions {
1057 collect_info: false,
1058 detect_packages: true,
1059 detect_application_packages: true,
1060 detect_system_packages: false,
1061 detect_packages_in_compiled: false,
1062 detect_copyrights: false,
1063 detect_generated: false,
1064 detect_emails: false,
1065 detect_urls: false,
1066 max_emails: 50,
1067 max_urls: 50,
1068 timeout_seconds: 120.0,
1069 },
1070 );
1071 let with_compiled = process_collected(
1072 &collected,
1073 progress,
1074 None,
1075 LicenseScanOptions::default(),
1076 &TextDetectionOptions {
1077 collect_info: false,
1078 detect_packages: true,
1079 detect_application_packages: true,
1080 detect_system_packages: false,
1081 detect_packages_in_compiled: true,
1082 detect_copyrights: false,
1083 detect_generated: false,
1084 detect_emails: false,
1085 detect_urls: false,
1086 max_emails: 50,
1087 max_urls: 50,
1088 timeout_seconds: 120.0,
1089 },
1090 );
1091
1092 let without_compiled = without_compiled
1093 .files
1094 .into_iter()
1095 .find(|entry| entry.file_type == FileType::File && entry.path.ends_with("/demo"))
1096 .expect("compiled artifact present");
1097 let with_compiled = with_compiled
1098 .files
1099 .into_iter()
1100 .find(|entry| entry.file_type == FileType::File && entry.path.ends_with("/demo"))
1101 .expect("compiled artifact present");
1102
1103 assert!(
1104 without_compiled.package_data.is_empty(),
1105 "package_data: {:#?}",
1106 without_compiled.package_data
1107 );
1108 assert!(!with_compiled.package_data.is_empty());
1109 }
1110
1111 #[test]
1112 fn scanner_sets_is_source_only_when_info_enabled() {
1113 let without_info = TextDetectionOptions {
1114 collect_info: false,
1115 detect_packages: false,
1116 detect_application_packages: false,
1117 detect_system_packages: false,
1118 detect_packages_in_compiled: false,
1119 detect_copyrights: false,
1120 detect_generated: false,
1121 detect_emails: false,
1122 detect_urls: false,
1123 max_emails: 50,
1124 max_urls: 50,
1125 timeout_seconds: 120.0,
1126 };
1127 let with_info = TextDetectionOptions {
1128 collect_info: true,
1129 ..without_info.clone()
1130 };
1131
1132 let scanned_without_info = scan_single_file("main.rs", "fn main() {}\n", &without_info);
1133 let scanned_with_info = scan_single_file("main.rs", "fn main() {}\n", &with_info);
1134
1135 assert_eq!(scanned_without_info.is_source, None);
1136 assert_eq!(scanned_with_info.is_source, Some(true));
1137 }
1138
1139 #[test]
1140 fn directory_omits_info_fields_when_info_disabled() {
1141 let temp_dir = TempDir::new().expect("create temp dir");
1142 fs::create_dir_all(temp_dir.path().join("nested")).expect("create nested dir");
1143
1144 let collected = collect_paths(temp_dir.path(), 0, &[]);
1145 let result = process_collected(
1146 &collected,
1147 Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1148 None,
1149 LicenseScanOptions::default(),
1150 &TextDetectionOptions {
1151 collect_info: false,
1152 detect_packages: false,
1153 detect_application_packages: false,
1154 detect_system_packages: false,
1155 detect_packages_in_compiled: false,
1156 detect_copyrights: false,
1157 detect_generated: false,
1158 detect_emails: false,
1159 detect_urls: false,
1160 max_emails: 50,
1161 max_urls: 50,
1162 timeout_seconds: 120.0,
1163 },
1164 );
1165
1166 let directory = result
1167 .files
1168 .into_iter()
1169 .find(|entry| entry.file_type == FileType::Directory && entry.path.ends_with("nested"))
1170 .expect("directory entry");
1171
1172 assert!(directory.date.is_none());
1173 assert!(directory.file_type_label.is_none());
1174 assert!(directory.is_binary.is_none());
1175 assert!(directory.is_text.is_none());
1176 assert!(directory.is_archive.is_none());
1177 assert!(directory.is_media.is_none());
1178 assert!(directory.is_source.is_none());
1179 assert!(directory.is_script.is_none());
1180 }
1181
1182 #[test]
1183 fn directory_includes_info_fields_when_info_enabled() {
1184 let temp_dir = TempDir::new().expect("create temp dir");
1185 fs::create_dir_all(temp_dir.path().join("nested")).expect("create nested dir");
1186
1187 let collected = collect_paths(temp_dir.path(), 0, &[]);
1188 let result = process_collected(
1189 &collected,
1190 Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1191 None,
1192 LicenseScanOptions::default(),
1193 &TextDetectionOptions {
1194 collect_info: true,
1195 detect_packages: false,
1196 detect_application_packages: false,
1197 detect_system_packages: false,
1198 detect_packages_in_compiled: false,
1199 detect_copyrights: false,
1200 detect_generated: false,
1201 detect_emails: false,
1202 detect_urls: false,
1203 max_emails: 50,
1204 max_urls: 50,
1205 timeout_seconds: 120.0,
1206 },
1207 );
1208
1209 let directory = result
1210 .files
1211 .into_iter()
1212 .find(|entry| entry.file_type == FileType::Directory && entry.path.ends_with("nested"))
1213 .expect("directory entry");
1214
1215 assert!(directory.date.is_none());
1216 assert!(directory.file_type_label.is_none());
1217 assert_eq!(directory.is_binary, Some(false));
1218 assert_eq!(directory.is_text, Some(false));
1219 assert_eq!(directory.is_archive, Some(false));
1220 assert_eq!(directory.is_media, Some(false));
1221 assert_eq!(directory.is_source, Some(false));
1222 assert_eq!(directory.is_script, Some(false));
1223 assert_eq!(directory.files_count, Some(0));
1224 assert_eq!(directory.dirs_count, Some(0));
1225 assert_eq!(directory.size_count, Some(0));
1226 }
1227
1228 #[test]
1229 fn collect_paths_includes_root_directory_entry() {
1230 let temp_dir = TempDir::new().expect("create temp dir");
1231 fs::create_dir_all(temp_dir.path().join("src")).expect("create nested dir");
1232 fs::write(temp_dir.path().join("src").join("main.rs"), "fn main() {}")
1233 .expect("write nested file");
1234
1235 let collected = collect_paths(temp_dir.path(), 0, &[]);
1236
1237 assert!(
1238 collected
1239 .directories
1240 .iter()
1241 .any(|(path, _)| path == temp_dir.path())
1242 );
1243 }
1244
1245 #[test]
1246 fn collect_paths_supports_single_file_input() {
1247 let temp_dir = TempDir::new().expect("create temp dir");
1248 let file_path = temp_dir.path().join("main.rs");
1249 fs::write(&file_path, "fn main() {}\n").expect("write file");
1250
1251 let collected = collect_paths(&file_path, 0, &[]);
1252
1253 assert_eq!(collected.files.len(), 1);
1254 assert!(collected.directories.is_empty());
1255 assert_eq!(collected.files[0].0, file_path);
1256 }
1257
1258 #[test]
1259 fn process_collected_with_memory_limit_preserves_results_when_spilling() {
1260 let temp_dir = TempDir::new().expect("create temp dir");
1261 fs::write(temp_dir.path().join("a.txt"), "hello").expect("write first file");
1262 fs::write(temp_dir.path().join("b.txt"), "world").expect("write second file");
1263
1264 let collected = collect_paths(temp_dir.path(), 0, &[]);
1265 let result = process_collected_with_memory_limit(
1266 &collected,
1267 Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1268 None,
1269 LicenseScanOptions::default(),
1270 &TextDetectionOptions {
1271 collect_info: false,
1272 detect_packages: false,
1273 detect_application_packages: false,
1274 detect_system_packages: false,
1275 detect_packages_in_compiled: false,
1276 detect_copyrights: false,
1277 detect_generated: false,
1278 detect_emails: false,
1279 detect_urls: false,
1280 max_emails: 50,
1281 max_urls: 50,
1282 timeout_seconds: 120.0,
1283 },
1284 1,
1285 );
1286
1287 assert_eq!(result.files.len(), 3);
1288 }
1289
1290 #[test]
1291 fn process_collected_with_negative_one_uses_disk_only_mode() {
1292 let temp_dir = TempDir::new().expect("create temp dir");
1293 fs::write(temp_dir.path().join("a.txt"), "hello").expect("write first file");
1294
1295 let collected = collect_paths(temp_dir.path(), 0, &[]);
1296 let result = process_collected_with_memory_limit(
1297 &collected,
1298 Arc::new(ScanProgress::new(ProgressMode::Quiet)),
1299 None,
1300 LicenseScanOptions::default(),
1301 &TextDetectionOptions {
1302 collect_info: false,
1303 detect_packages: false,
1304 detect_application_packages: false,
1305 detect_system_packages: false,
1306 detect_packages_in_compiled: false,
1307 detect_copyrights: false,
1308 detect_generated: false,
1309 detect_emails: false,
1310 detect_urls: false,
1311 max_emails: 50,
1312 max_urls: 50,
1313 timeout_seconds: 120.0,
1314 },
1315 -1,
1316 );
1317
1318 assert_eq!(result.files.len(), 2);
1319 }
1320}