1use crate::license_detection::LicenseDetectionEngine;
2use crate::parsers::compiled_binary::{
3 is_supported_compiled_binary_format, try_parse_compiled_bytes,
4};
5use crate::parsers::try_parse_file;
6use crate::parsers::windows_executable::try_parse_windows_executable_bytes;
7
8use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha1_git, calculate_sha256};
9use crate::utils::text::{
10 remove_verbatim_escape_sequences, should_remove_verbatim_escape_sequences,
11};
12use anyhow::Error;
13use rayon::prelude::*;
14use std::collections::HashSet;
15use std::fs::{self, File};
16use std::io::{Read, Write};
17use std::path::Path;
18use std::sync::Arc;
19use std::time::{Duration, Instant};
20
21use crate::copyright::{
22 self, AuthorDetection, CopyrightDetection, CopyrightDetectionOptions, HolderDetection,
23};
24use crate::finder::{self, DetectionConfig};
25use crate::license_detection::PositionSet;
26use crate::license_detection::models::LicenseMatch as InternalLicenseMatch;
27use crate::license_detection::query::Query;
28use crate::models::{
29 Author, Copyright, DatasourceId, FileInfo, FileInfoBuilder, FileType, Holder, LicenseDetection,
30 LineNumber, Match, OutputEmail, OutputURL, Sha256Digest,
31};
32use crate::parsers::utils::split_name_email;
33use crate::progress::ScanProgress;
34use crate::scanner::collect::CollectedPaths;
35use crate::scanner::{LicenseScanOptions, ProcessResult, TextDetectionOptions};
36use crate::utils::file::{
37 ExtractedTextKind, augment_license_detection_text, classify_file_info,
38 extract_text_for_detection_with_diagnostics, get_creation_date,
39};
40use crate::utils::generated::generated_code_hints_from_bytes;
41use tempfile::TempDir;
42
/// Selects how processed file entries are held while a scan is running.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MemoryMode {
    /// Process everything and keep all results in memory (rendered as `0`).
    CollectFirst,
    /// Process in chunks and spill every chunk to disk (rendered as `-1`).
    StreamUnlimited,
    /// Keep at most this many entries in memory and spill the rest to disk.
    Limit(usize),
}

impl std::fmt::Display for MemoryMode {
    /// Renders the mode as its numeric form: `0`, `-1`, or the limit itself.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match *self {
            Self::Limit(count) => write!(f, "{count}"),
            Self::StreamUnlimited => f.write_str("-1"),
            Self::CollectFirst => f.write_str("0"),
        }
    }
}
59
/// `(begin, end)` marker pairs that delimit PEM-encoded certificate blocks.
const PEM_CERTIFICATE_HEADERS: &[(&str, &str)] = &[
    // Plain X.509 certificate.
    ("-----BEGIN CERTIFICATE-----", "-----END CERTIFICATE-----"),
    // "Trusted" certificate variant.
    (
        "-----BEGIN TRUSTED CERTIFICATE-----",
        "-----END TRUSTED CERTIFICATE-----",
    ),
];

/// Size threshold (128 KiB) above which license-detection text taken from a
/// non-source JSON file is truncated.
const LARGE_NON_SOURCE_JSON_LICENSE_TEXT_BYTES: usize = 128 * 1024;
69
70pub fn process_collected(
71 collected: &CollectedPaths,
72 progress: Arc<ScanProgress>,
73 license_engine: Option<Arc<LicenseDetectionEngine>>,
74 license_options: LicenseScanOptions,
75 text_options: &TextDetectionOptions,
76) -> ProcessResult {
77 let mut all_files: Vec<FileInfo> = collected
78 .files
79 .par_iter()
80 .map(|(path, metadata)| {
81 let file_entry = process_file(
82 path,
83 metadata,
84 progress.as_ref(),
85 license_engine.clone(),
86 license_options,
87 text_options,
88 );
89 progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
90 file_entry
91 })
92 .collect();
93
94 for (path, metadata) in &collected.directories {
95 all_files.push(process_directory(
96 path,
97 metadata,
98 text_options.collect_info,
99 license_engine.is_some(),
100 ));
101 }
102
103 ProcessResult {
104 files: all_files,
105 excluded_count: collected.excluded_count,
106 }
107}
108
109pub fn process_collected_sequential(
110 collected: &CollectedPaths,
111 progress: Arc<ScanProgress>,
112 license_engine: Option<Arc<LicenseDetectionEngine>>,
113 license_options: LicenseScanOptions,
114 text_options: &TextDetectionOptions,
115) -> ProcessResult {
116 let mut all_files: Vec<FileInfo> =
117 Vec::with_capacity(collected.files.len() + collected.directories.len());
118
119 for (path, metadata) in &collected.files {
120 let file_entry = process_file(
121 path,
122 metadata,
123 progress.as_ref(),
124 license_engine.clone(),
125 license_options,
126 text_options,
127 );
128 progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
129 all_files.push(file_entry);
130 }
131
132 for (path, metadata) in &collected.directories {
133 all_files.push(process_directory(
134 path,
135 metadata,
136 text_options.collect_info,
137 license_engine.is_some(),
138 ));
139 }
140
141 ProcessResult {
142 files: all_files,
143 excluded_count: collected.excluded_count,
144 }
145}
146
147pub fn process_collected_with_memory_limit(
148 collected: &CollectedPaths,
149 progress: Arc<ScanProgress>,
150 license_engine: Option<Arc<LicenseDetectionEngine>>,
151 license_options: LicenseScanOptions,
152 text_options: &TextDetectionOptions,
153 max_in_memory: MemoryMode,
154) -> ProcessResult {
155 match max_in_memory {
156 MemoryMode::CollectFirst => {
157 return process_collected(
158 collected,
159 progress,
160 license_engine,
161 license_options,
162 text_options,
163 );
164 }
165 MemoryMode::StreamUnlimited => {}
166 MemoryMode::Limit(_) => {}
167 }
168
169 let (memory_limit, chunk_size) = match max_in_memory {
170 MemoryMode::CollectFirst => unreachable!(),
171 MemoryMode::StreamUnlimited => (0, 256),
172 MemoryMode::Limit(n) => (n, n.max(1)),
173 };
174
175 let mut retained_files = Vec::new();
176 let mut spill_store = None;
177
178 for chunk in collected.files.chunks(chunk_size) {
179 let processed_chunk: Vec<FileInfo> = chunk
180 .par_iter()
181 .map(|(path, metadata)| {
182 let file_entry = process_file(
183 path,
184 metadata,
185 progress.as_ref(),
186 license_engine.clone(),
187 license_options,
188 text_options,
189 );
190 progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
191 file_entry
192 })
193 .collect();
194
195 retain_or_spill_chunk(
196 processed_chunk,
197 &mut retained_files,
198 &mut spill_store,
199 memory_limit,
200 );
201 }
202
203 for (path, metadata) in &collected.directories {
204 let entry = process_directory(
205 path,
206 metadata,
207 text_options.collect_info,
208 license_engine.is_some(),
209 );
210 retain_or_spill_chunk(
211 vec![entry],
212 &mut retained_files,
213 &mut spill_store,
214 memory_limit,
215 );
216 }
217
218 if let Some(spill_store) = spill_store {
219 retained_files.extend(spill_store.load_all());
220 }
221
222 ProcessResult {
223 files: retained_files,
224 excluded_count: collected.excluded_count,
225 }
226}
227
228pub fn process_collected_with_memory_limit_sequential(
229 collected: &CollectedPaths,
230 progress: Arc<ScanProgress>,
231 license_engine: Option<Arc<LicenseDetectionEngine>>,
232 license_options: LicenseScanOptions,
233 text_options: &TextDetectionOptions,
234 max_in_memory: MemoryMode,
235) -> ProcessResult {
236 match max_in_memory {
237 MemoryMode::CollectFirst => {
238 return process_collected_sequential(
239 collected,
240 progress,
241 license_engine,
242 license_options,
243 text_options,
244 );
245 }
246 MemoryMode::StreamUnlimited => {}
247 MemoryMode::Limit(_) => {}
248 }
249
250 let (memory_limit, chunk_size) = match max_in_memory {
251 MemoryMode::CollectFirst => unreachable!(),
252 MemoryMode::StreamUnlimited => (0, 256),
253 MemoryMode::Limit(n) => (n, n.max(1)),
254 };
255
256 let mut retained_files = Vec::new();
257 let mut spill_store = None;
258
259 for chunk in collected.files.chunks(chunk_size) {
260 let mut processed_chunk: Vec<FileInfo> = Vec::with_capacity(chunk.len());
261 for (path, metadata) in chunk {
262 let file_entry = process_file(
263 path,
264 metadata,
265 progress.as_ref(),
266 license_engine.clone(),
267 license_options,
268 text_options,
269 );
270 progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
271 processed_chunk.push(file_entry);
272 }
273
274 retain_or_spill_chunk(
275 processed_chunk,
276 &mut retained_files,
277 &mut spill_store,
278 memory_limit,
279 );
280 }
281
282 for (path, metadata) in &collected.directories {
283 let entry = process_directory(
284 path,
285 metadata,
286 text_options.collect_info,
287 license_engine.is_some(),
288 );
289 retain_or_spill_chunk(
290 vec![entry],
291 &mut retained_files,
292 &mut spill_store,
293 memory_limit,
294 );
295 }
296
297 if let Some(spill_store) = spill_store {
298 retained_files.extend(spill_store.load_all());
299 }
300
301 ProcessResult {
302 files: retained_files,
303 excluded_count: collected.excluded_count,
304 }
305}
306
307fn retain_or_spill_chunk(
308 chunk: Vec<FileInfo>,
309 retained_files: &mut Vec<FileInfo>,
310 spill_store: &mut Option<FileInfoSpillStore>,
311 memory_limit: usize,
312) {
313 if memory_limit == 0 {
314 spill_store
315 .get_or_insert_with(FileInfoSpillStore::new)
316 .spill(chunk);
317 return;
318 }
319
320 let remaining_capacity = memory_limit.saturating_sub(retained_files.len());
321 if remaining_capacity >= chunk.len() && spill_store.is_none() {
322 retained_files.extend(chunk);
323 return;
324 }
325
326 let mut chunk_iter = chunk.into_iter();
327 retained_files.extend(chunk_iter.by_ref().take(remaining_capacity));
328 let overflow: Vec<FileInfo> = chunk_iter.collect();
329 if !overflow.is_empty() {
330 spill_store
331 .get_or_insert_with(FileInfoSpillStore::new)
332 .spill(overflow);
333 }
334}
335
336struct FileInfoSpillStore {
337 temp_dir: TempDir,
338 batch_index: usize,
339}
340
341impl FileInfoSpillStore {
342 fn new() -> Self {
343 Self {
344 temp_dir: TempDir::new().expect("create spill dir"),
345 batch_index: 0,
346 }
347 }
348
349 fn spill(&mut self, files: Vec<FileInfo>) {
350 let path = self
351 .temp_dir
352 .path()
353 .join(format!("batch-{:06}.json.zst", self.batch_index));
354 self.batch_index += 1;
355
356 let payload = serde_json::to_vec(&files).expect("encode spilled file batch");
357 let file = File::create(path).expect("create spill batch file");
358 let mut encoder = zstd::Encoder::new(file, 3).expect("create spill encoder");
359 encoder
360 .write_all(&payload)
361 .expect("write spilled file batch");
362 encoder.finish().expect("finish spill encoder");
363 }
364
365 fn load_all(self) -> Vec<FileInfo> {
366 let mut paths: Vec<_> = fs::read_dir(self.temp_dir.path())
367 .expect("read spill dir")
368 .filter_map(Result::ok)
369 .map(|entry| entry.path())
370 .collect();
371 paths.sort();
372
373 let mut files = Vec::new();
374 for path in paths {
375 let file = File::open(path).expect("open spill batch");
376 let mut decoder = zstd::Decoder::new(file).expect("create spill decoder");
377 let mut payload = Vec::new();
378 decoder.read_to_end(&mut payload).expect("read spill batch");
379 let mut batch: Vec<FileInfo> =
380 serde_json::from_slice(&payload).expect("decode spilled file batch");
381 files.append(&mut batch);
382 }
383 files
384 }
385}
386
387fn process_file(
388 path: &Path,
389 metadata: &fs::Metadata,
390 progress: &ScanProgress,
391 license_engine: Option<Arc<LicenseDetectionEngine>>,
392 license_options: LicenseScanOptions,
393 text_options: &TextDetectionOptions,
394) -> FileInfo {
395 let mut scan_errors: Vec<String> = vec![];
396 let mut file_info_builder = FileInfoBuilder::default();
397 let license_enabled = license_engine.is_some();
398
399 let started = Instant::now();
400
401 let mut generated_flag = None;
402 let mut is_source_file = false;
403 match extract_information_from_content(
404 &mut file_info_builder,
405 &mut scan_errors,
406 path,
407 progress,
408 license_engine,
409 license_options,
410 text_options,
411 ) {
412 Ok((is_generated, sha256, is_source)) => {
413 generated_flag = is_generated;
414 is_source_file = is_source;
415 let _ = sha256;
416 }
417 Err(e) => scan_errors.push(e.to_string()),
418 };
419
420 maybe_record_processing_timeout(&mut scan_errors, started, text_options.timeout_seconds);
421
422 let mut file_info = file_info_builder
423 .name(path.file_name().unwrap().to_string_lossy().to_string())
424 .base_name(
425 path.file_stem()
426 .unwrap_or_default()
427 .to_string_lossy()
428 .to_string(),
429 )
430 .extension(
431 path.extension()
432 .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
433 )
434 .path(path.to_string_lossy().to_string())
435 .file_type(FileType::File)
436 .size(metadata.len())
437 .date(
438 text_options
439 .collect_info
440 .then(|| get_creation_date(metadata))
441 .flatten(),
442 )
443 .scan_errors(scan_errors)
444 .build()
445 .expect("FileInformationBuild not completely initialized");
446
447 if text_options.collect_info {
448 file_info.is_source = Some(is_source_file);
449 }
450
451 if file_info.programming_language.as_deref() == Some("Go")
452 && is_go_non_production_source(path).unwrap_or(false)
453 {
454 file_info.is_source = Some(false);
455 }
456
457 if text_options.detect_generated {
458 file_info.is_generated = Some(generated_flag.unwrap_or(false));
459 }
460
461 if file_info.percentage_of_license_text.is_none() && license_enabled {
462 file_info.percentage_of_license_text = Some(0.0);
463 }
464
465 file_info
466}
467
468fn extract_information_from_content(
469 file_info_builder: &mut FileInfoBuilder,
470 scan_errors: &mut Vec<String>,
471 path: &Path,
472 progress: &ScanProgress,
473 license_engine: Option<Arc<LicenseDetectionEngine>>,
474 license_options: LicenseScanOptions,
475 text_options: &TextDetectionOptions,
476) -> Result<(Option<bool>, Sha256Digest, bool), Error> {
477 let started = Instant::now();
478 let filesystem_path = absolute_filesystem_path(path);
479 let buffer = fs::read(&filesystem_path)?;
480 let license_enabled = license_engine.is_some();
481
482 if is_timeout_exceeded(started, text_options.timeout_seconds) {
483 return Err(Error::msg(format!(
484 "Timeout while reading file content (> {:.2}s)",
485 text_options.timeout_seconds
486 )));
487 }
488
489 let sha256 = calculate_sha256(&buffer);
490 let is_generated = text_options
491 .detect_generated
492 .then(|| !generated_code_hints_from_bytes(&buffer).is_empty());
493 let classification = classify_file_info(&filesystem_path, &buffer);
494
495 if text_options.collect_info {
496 file_info_builder
497 .sha1(Some(calculate_sha1(&buffer)))
498 .md5(Some(calculate_md5(&buffer)))
499 .sha256(Some(sha256))
500 .programming_language(classification.programming_language.clone())
501 .mime_type(Some(classification.mime_type.clone()))
502 .file_type_label(Some(classification.file_type.clone()))
503 .sha1_git(Some(calculate_sha1_git(&buffer)))
504 .is_binary(Some(classification.is_binary))
505 .is_text(Some(classification.is_text))
506 .is_archive(Some(classification.is_archive))
507 .is_media(Some(classification.is_media))
508 .is_source(Some(classification.is_source))
509 .is_script(Some(classification.is_script))
510 .files_count(Some(0))
511 .dirs_count(Some(0))
512 .size_count(Some(0));
513 }
514
515 if should_skip_text_detection(&filesystem_path, &buffer) {
516 return Ok((is_generated, sha256, classification.is_source));
517 }
518
519 if text_options.detect_packages {
522 let started = Instant::now();
523 let parse_result = try_parse_file(&filesystem_path)
524 .or_else(|| {
525 text_options
526 .detect_application_packages
527 .then(|| try_parse_windows_executable_bytes(&filesystem_path, &buffer))
528 .flatten()
529 })
530 .or_else(|| {
531 text_options
532 .detect_packages_in_compiled
533 .then(|| {
534 (classification.is_binary && is_supported_compiled_binary_format(&buffer))
535 .then(|| try_parse_compiled_bytes(&buffer))
536 .flatten()
537 })
538 .flatten()
539 });
540
541 if let Some(parse_result) = parse_result {
542 let packages = parse_result
543 .packages
544 .into_iter()
545 .filter(|package| {
546 let is_compiled_package = package
547 .datasource_id
548 .as_ref()
549 .is_some_and(is_compiled_datasource);
550 let is_system_package = package
551 .datasource_id
552 .as_ref()
553 .is_some_and(is_system_datasource);
554 if is_compiled_package {
555 text_options.detect_packages_in_compiled
556 } else if is_system_package {
557 text_options.detect_system_packages
558 } else {
559 text_options.detect_application_packages
560 }
561 })
562 .collect();
563 file_info_builder.package_data(packages);
564 scan_errors.extend(parse_result.scan_errors);
565 }
566 progress.record_detail_timing("scan:packages", started.elapsed().as_secs_f64());
567 }
568
569 if is_timeout_exceeded(started, text_options.timeout_seconds) {
570 return Err(Error::msg(format!(
571 "Timeout while extracting package/text metadata (> {:.2}s)",
572 text_options.timeout_seconds
573 )));
574 }
575
576 let (text_content, text_kind, text_scan_error) =
577 extract_text_for_detection_with_diagnostics(&filesystem_path, &buffer);
578 if let Some(text_scan_error) = text_scan_error {
579 scan_errors.push(text_scan_error);
580 }
581 let from_binary_strings = matches!(text_kind, ExtractedTextKind::BinaryStrings);
582
583 if is_timeout_exceeded(started, text_options.timeout_seconds) {
584 return Err(Error::msg(format!(
585 "Timeout while extracting text content (> {:.2}s)",
586 text_options.timeout_seconds
587 )));
588 }
589
590 if text_content.is_empty() {
591 return Ok((is_generated, sha256, classification.is_source));
592 }
593
594 if text_options.detect_copyrights {
595 extract_copyright_information(
596 file_info_builder,
597 path,
598 &text_content,
599 text_options.timeout_seconds,
600 from_binary_strings,
601 );
602 }
603 extract_email_url_information(
604 file_info_builder,
605 &text_content,
606 text_options,
607 from_binary_strings,
608 );
609
610 if is_timeout_exceeded(started, text_options.timeout_seconds) {
611 return Err(Error::msg(format!(
612 "Timeout before license scan (> {:.2}s)",
613 text_options.timeout_seconds
614 )));
615 }
616 let text_content_for_license_detection = if crate::utils::sourcemap::is_sourcemap(path) {
618 if let Some(sourcemap_content) =
619 crate::utils::sourcemap::extract_sourcemap_content(&text_content)
620 {
621 sourcemap_content
622 } else {
623 text_content
624 }
625 } else if should_remove_verbatim_escape_sequences(path, classification.is_source) {
626 remove_verbatim_escape_sequences(&text_content)
627 } else {
628 text_content
629 };
630 let text_content_for_license_detection =
631 augment_license_detection_text(path, &text_content_for_license_detection);
632 let text_content_for_license_detection = cap_non_source_json_license_text(
633 path,
634 &classification,
635 text_content_for_license_detection.as_ref(),
636 )
637 .into_owned();
638
639 if license_enabled {
640 let started = Instant::now();
641 extract_license_information(
642 file_info_builder,
643 scan_errors,
644 &filesystem_path,
645 text_content_for_license_detection.clone(),
646 license_engine,
647 license_options,
648 from_binary_strings,
649 )?;
650 progress.record_detail_timing("scan:licenses", started.elapsed().as_secs_f64());
651 } else {
652 extract_license_information(
653 file_info_builder,
654 scan_errors,
655 &filesystem_path,
656 text_content_for_license_detection,
657 license_engine,
658 license_options,
659 from_binary_strings,
660 )?;
661 }
662
663 if is_timeout_exceeded(started, text_options.timeout_seconds) {
664 return Err(Error::msg(format!(
665 "Timeout during license scan (> {:.2}s)",
666 text_options.timeout_seconds
667 )));
668 }
669
670 Ok((is_generated, sha256, classification.is_source))
671}
672
/// Resolves `path` against the current working directory when it is relative.
///
/// Absolute paths are returned unchanged. If the working directory cannot be
/// determined, the path is also returned unchanged.
fn absolute_filesystem_path(path: &Path) -> std::path::PathBuf {
    if !path.is_absolute() {
        if let Ok(cwd) = std::env::current_dir() {
            return cwd.join(path);
        }
    }
    path.to_path_buf()
}
682
/// Returns `true` when more than `timeout_seconds` have elapsed since
/// `started`.
///
/// A timeout that is not finite or not strictly positive disables the check.
fn is_timeout_exceeded(started: Instant, timeout_seconds: f64) -> bool {
    let timeout_enabled = timeout_seconds.is_finite() && timeout_seconds > 0.0;
    if !timeout_enabled {
        return false;
    }
    started.elapsed().as_secs_f64() > timeout_seconds
}
688
689fn maybe_record_processing_timeout(
690 scan_errors: &mut Vec<String>,
691 started: Instant,
692 timeout_seconds: f64,
693) {
694 if is_timeout_exceeded(started, timeout_seconds)
695 && !scan_errors.iter().any(|error| is_timeout_scan_error(error))
696 {
697 scan_errors.push(format!(
698 "Processing interrupted due to timeout after {:.2} seconds",
699 timeout_seconds
700 ));
701 }
702}
703
/// Returns `true` when `error` contains one of the timeout message fragments
/// produced by this module.
fn is_timeout_scan_error(error: &str) -> bool {
    const TIMEOUT_FRAGMENTS: &[&str] = &[
        "Timeout while ",
        "Timeout before ",
        "Timeout during ",
        "Processing interrupted due to timeout",
    ];

    TIMEOUT_FRAGMENTS
        .iter()
        .any(|fragment| error.contains(*fragment))
}
710
711fn cap_non_source_json_license_text<'a>(
712 path: &Path,
713 classification: &crate::utils::file::FileInfoClassification,
714 text: &'a str,
715) -> std::borrow::Cow<'a, str> {
716 if classification.is_source
717 || crate::utils::sourcemap::is_sourcemap(path)
718 || !is_json_like_text(classification, path)
719 || text.len() <= LARGE_NON_SOURCE_JSON_LICENSE_TEXT_BYTES
720 {
721 return std::borrow::Cow::Borrowed(text);
722 }
723
724 std::borrow::Cow::Owned(
725 truncate_at_char_boundary(text, LARGE_NON_SOURCE_JSON_LICENSE_TEXT_BYTES).to_string(),
726 )
727}
728
729fn is_json_like_text(
730 classification: &crate::utils::file::FileInfoClassification,
731 path: &Path,
732) -> bool {
733 classification.file_type == "JSON text data"
734 || classification.mime_type == "application/json"
735 || classification.mime_type.ends_with("+json")
736 || path
737 .extension()
738 .and_then(|ext| ext.to_str())
739 .is_some_and(|ext| ext.eq_ignore_ascii_case("json"))
740}
741
/// Returns the longest prefix of `text` that is at most `max_bytes` long and
/// ends on a UTF-8 character boundary.
fn truncate_at_char_boundary(text: &str, max_bytes: usize) -> &str {
    if text.len() <= max_bytes {
        return text;
    }

    // Index 0 is always a boundary, so the search cannot fail.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&index| text.is_char_boundary(index))
        .unwrap_or(0);
    &text[..cut]
}
753
754fn is_system_datasource(datasource_id: &DatasourceId) -> bool {
755 matches!(
756 datasource_id,
757 DatasourceId::AlpineInstalledDb
758 | DatasourceId::DebianDistrolessInstalledDb
759 | DatasourceId::DebianInstalledFilesList
760 | DatasourceId::DebianInstalledMd5Sums
761 | DatasourceId::DebianInstalledStatusDb
762 | DatasourceId::FreebsdCompactManifest
763 | DatasourceId::RpmInstalledDatabaseBdb
764 | DatasourceId::RpmInstalledDatabaseNdb
765 | DatasourceId::RpmInstalledDatabaseSqlite
766 | DatasourceId::RpmYumdb
767 )
768}
769
770fn is_compiled_datasource(datasource_id: &DatasourceId) -> bool {
771 matches!(
772 datasource_id,
773 DatasourceId::GoBinary | DatasourceId::RustBinary
774 )
775}
776
777fn extract_copyright_information(
778 file_info_builder: &mut FileInfoBuilder,
779 path: &Path,
780 text_content: &str,
781 timeout_seconds: f64,
782 from_binary_strings: bool,
783) {
784 if copyright::is_credits_file(path) {
786 let author_detections = copyright::detect_credits_authors(text_content);
787 if !author_detections.is_empty() {
788 file_info_builder.authors(
789 author_detections
790 .into_iter()
791 .map(|a| Author {
792 author: a.author,
793 start_line: a.start_line,
794 end_line: a.end_line,
795 })
796 .collect(),
797 );
798 return;
799 }
800 }
801
802 let copyright_options = CopyrightDetectionOptions {
803 max_runtime: if timeout_seconds.is_finite() && timeout_seconds > 0.0 {
804 Some(Duration::from_secs_f64(timeout_seconds))
805 } else {
806 None
807 },
808 ..CopyrightDetectionOptions::default()
809 };
810
811 let (copyrights, holders, authors) =
812 copyright::detect_copyrights_with_options(text_content, ©right_options);
813 let (copyrights, holders, authors) = if from_binary_strings {
814 prune_binary_string_detections(text_content, copyrights, holders, authors)
815 } else {
816 (copyrights, holders, authors)
817 };
818
819 file_info_builder.copyrights(
820 copyrights
821 .into_iter()
822 .map(|c| Copyright {
823 copyright: c.copyright,
824 start_line: c.start_line,
825 end_line: c.end_line,
826 })
827 .collect::<Vec<Copyright>>(),
828 );
829 file_info_builder.holders(
830 holders
831 .into_iter()
832 .map(|h| Holder {
833 holder: h.holder,
834 start_line: h.start_line,
835 end_line: h.end_line,
836 })
837 .collect::<Vec<Holder>>(),
838 );
839 file_info_builder.authors(
840 authors
841 .into_iter()
842 .map(|a| Author {
843 author: a.author,
844 start_line: a.start_line,
845 end_line: a.end_line,
846 })
847 .collect::<Vec<Author>>(),
848 );
849}
850
851fn prune_binary_string_detections(
852 text_content: &str,
853 copyrights: Vec<CopyrightDetection>,
854 holders: Vec<HolderDetection>,
855 authors: Vec<AuthorDetection>,
856) -> (
857 Vec<CopyrightDetection>,
858 Vec<HolderDetection>,
859 Vec<AuthorDetection>,
860) {
861 let kept_copyrights: Vec<CopyrightDetection> = copyrights
862 .into_iter()
863 .filter(|c| is_binary_string_copyright_candidate(&c.copyright))
864 .collect();
865
866 let kept_holders: Vec<HolderDetection> = holders
867 .into_iter()
868 .filter(|holder| {
869 kept_copyrights.iter().any(|copyright| {
870 ranges_overlap(
871 holder.start_line,
872 holder.end_line,
873 copyright.start_line,
874 copyright.end_line,
875 )
876 })
877 })
878 .collect();
879
880 let kept_authors = authors
881 .into_iter()
882 .filter(|author| is_binary_string_author_candidate(&author.author))
883 .chain(extract_binary_string_author_supplements(text_content))
884 .filter({
885 let mut seen = HashSet::new();
886 move |author| seen.insert(author.author.clone())
887 })
888 .collect();
889
890 (kept_copyrights, kept_holders, kept_authors)
891}
892
893fn ranges_overlap(
894 a_start: LineNumber,
895 a_end: LineNumber,
896 b_start: LineNumber,
897 b_end: LineNumber,
898) -> bool {
899 a_start <= b_end && b_start <= a_end
900}
901
902fn is_binary_string_copyright_candidate(text: &str) -> bool {
903 if contains_year(text) {
904 return true;
905 }
906
907 let trimmed = text.trim();
908 let lower = trimmed.to_ascii_lowercase();
909 let tail = if let Some(tail) = lower.strip_prefix("copyright") {
910 tail.trim()
911 } else {
912 lower.trim()
913 };
914 let original_tail = if lower.starts_with("copyright") {
915 trimmed["copyright".len()..].trim()
916 } else {
917 trimmed
918 };
919
920 if tail.is_empty() || !has_sufficient_alphabetic_content(tail) || has_excessive_at_noise(tail) {
921 return false;
922 }
923
924 let alpha_tokens: Vec<&str> = tail
925 .split_whitespace()
926 .filter(|token| token.chars().any(|c| c.is_alphabetic()))
927 .collect();
928
929 if alpha_tokens.len() <= 1 {
930 return has_explicit_copyright_marker(text)
931 && alpha_tokens.iter().any(|token| {
932 is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric()))
933 });
934 }
935
936 if !has_explicit_copyright_marker(text) {
937 return false;
938 }
939
940 has_binary_name_like_shape(original_tail)
941}
942
943fn extract_binary_string_author_supplements(text_content: &str) -> Vec<AuthorDetection> {
944 let mut authors = Vec::new();
945
946 for (line_index, line) in text_content.lines().enumerate() {
947 if let Some(author) = extract_named_author_from_binary_line(line) {
948 authors.push(AuthorDetection {
949 author,
950 start_line: LineNumber::from_0_indexed(line_index),
951 end_line: LineNumber::from_0_indexed(line_index),
952 });
953 }
954 }
955
956 authors
957}
958
959fn extract_named_author_from_binary_line(line: &str) -> Option<String> {
960 let line = line.trim();
961 if line.is_empty() {
962 return None;
963 }
964
965 let emails = finder::find_emails(
966 line,
967 &DetectionConfig {
968 max_emails: 4,
969 max_urls: 0,
970 unique: false,
971 },
972 );
973 let email = emails.first()?.email.as_str();
974 if !is_binary_string_email_candidate(email) {
975 return None;
976 }
977
978 let lower_line = line.to_ascii_lowercase();
979 let email_start = lower_line.find(email)?;
980 let raw_prefix = &line[..email_start];
981 let has_author_marker = contains_binary_author_marker(raw_prefix);
982 let prefix = take_suffix_after_last_author_marker(raw_prefix)?;
983 let prefix = prefix
984 .trim_start_matches(['*', '-', ':', ';', ',', '.', ' '])
985 .trim_end_matches(['<', '(', '[', ' ', ':', '-'])
986 .trim();
987
988 let (name, _) = split_name_email(prefix);
989 let name = name.or_else(|| {
990 let trimmed = prefix.trim_matches(|c: char| c == '<' || c == '(' || c == '[' || c == ' ');
991 (!trimmed.is_empty()).then(|| trimmed.to_string())
992 });
993
994 let Some(name) = name.map(|name| name.trim().to_string()) else {
995 if has_author_marker {
996 return Some(email.to_string());
997 }
998 return None;
999 };
1000
1001 if name.is_empty() && has_author_marker {
1002 return Some(email.to_string());
1003 }
1004
1005 if !has_binary_name_like_shape(&name) {
1006 return None;
1007 }
1008
1009 if line.contains(&format!("<{email}>")) {
1010 Some(format!("{name} <{email}>"))
1011 } else if line.contains(&format!("({email})")) {
1012 Some(format!("{name} ({email})"))
1013 } else {
1014 Some(format!("{name} {email}"))
1015 }
1016}
1017
/// Returns the trimmed text after the last ASCII-case-insensitive occurrence
/// of `marker` in `text`, or `None` when the marker is absent.
///
/// `marker` is compared against an ASCII-lowercased copy of `text`, so it
/// must itself be lowercase to match.
fn take_suffix_after_last_ascii_marker<'a>(text: &'a str, marker: &str) -> Option<&'a str> {
    // ASCII lowercasing preserves byte offsets, so the index applies to `text`.
    text.to_ascii_lowercase()
        .rfind(marker)
        .map(|marker_start| text[marker_start + marker.len()..].trim())
}
1023
1024fn take_suffix_after_last_author_marker(text: &str) -> Option<&str> {
1025 const MARKERS: &[&str] = &[
1026 " patch author: ",
1027 " patch author ",
1028 " written by ",
1029 " contributed by ",
1030 " original work done by ",
1031 " work done by ",
1032 " thanks to ",
1033 " review by ",
1034 " by ",
1035 " from ",
1036 ];
1037
1038 MARKERS
1039 .iter()
1040 .filter_map(|marker| take_suffix_after_last_ascii_marker(text, marker))
1041 .next()
1042}
1043
1044fn contains_binary_author_marker(text: &str) -> bool {
1045 take_suffix_after_last_author_marker(text).is_some()
1046}
1047
1048fn has_binary_name_like_shape(text: &str) -> bool {
1049 let trimmed = text.trim();
1050 if trimmed.is_empty() || trimmed.contains(" - ") || trimmed.chars().any(|c| c.is_ascii_digit())
1051 {
1052 return false;
1053 }
1054
1055 let tokens: Vec<&str> = trimmed
1056 .split(|c: char| !c.is_ascii_alphabetic() && c != '.' && c != '\'')
1057 .filter(|segment| segment.chars().any(|c| c.is_ascii_alphabetic()))
1058 .collect();
1059 if tokens.is_empty() {
1060 return false;
1061 }
1062
1063 let uppercase_like = tokens
1064 .iter()
1065 .filter(|token| {
1066 let token = token.trim_matches('.');
1067 token
1068 .chars()
1069 .find(|c| c.is_ascii_alphabetic())
1070 .is_some_and(|c| c.is_ascii_uppercase())
1071 })
1072 .count();
1073
1074 uppercase_like >= 2 && uppercase_like * 2 >= tokens.len()
1075 || tokens
1076 .iter()
1077 .any(|token| is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric())))
1078}
1079
/// Returns `true` when `text` has at least one ASCII alphanumeric character
/// and at least half of its ASCII alphanumerics are letters.
fn has_sufficient_alphabetic_content(text: &str) -> bool {
    let (letters, alphanumerics) =
        text.chars()
            .fold((0usize, 0usize), |(letters, alphanumerics), c| {
                (
                    letters + usize::from(c.is_ascii_alphabetic()),
                    alphanumerics + usize::from(c.is_ascii_alphanumeric()),
                )
            });

    alphanumerics != 0 && letters * 2 >= alphanumerics
}
1089
/// Returns `true` when `text` contains three or more `@` characters.
fn has_excessive_at_noise(text: &str) -> bool {
    const MAX_TOLERATED_AT_SIGNS: usize = 2;
    text.matches('@').count() > MAX_TOLERATED_AT_SIGNS
}
1093
/// Returns `true` when `text` contains `(c)` or `copr` (ASCII
/// case-insensitive) or the `©` sign.
fn has_explicit_copyright_marker(text: &str) -> bool {
    const MARKERS: [&str; 3] = ["(c)", "©", "copr"];

    let lowered = text.to_ascii_lowercase();
    MARKERS.iter().any(|marker| lowered.contains(*marker))
}
1098
/// Returns `true` when `text` contains four consecutive ASCII digits that
/// form a year in the range 1900–2099 (`19xx` or `20xx`).
///
/// The century digits are matched as a pair, so sequences such as `10xx` or
/// `29xx` are not treated as years. The four digits may be part of a longer
/// digit run.
fn contains_year(text: &str) -> bool {
    text.as_bytes().windows(4).any(|window| {
        matches!(window, [b'1', b'9', _, _] | [b'2', b'0', _, _])
            && window[2..].iter().all(u8::is_ascii_digit)
    })
}
1107
/// Returns `true` when `token`, compared ASCII-case-insensitively, is one of
/// the known organization words or suffixes.
fn is_company_like_suffix(token: &str) -> bool {
    const COMPANY_SUFFIXES: &[&str] = &[
        "inc",
        "corp",
        "corporation",
        "co",
        "company",
        "ltd",
        "llc",
        "gmbh",
        "foundation",
        "project",
        "systems",
        "software",
        "technologies",
        "technology",
    ];

    let lowered = token.to_ascii_lowercase();
    COMPANY_SUFFIXES.contains(&lowered.as_str())
}
1127
1128fn extract_email_url_information(
1129 file_info_builder: &mut FileInfoBuilder,
1130 text_content: &str,
1131 text_options: &TextDetectionOptions,
1132 from_binary_strings: bool,
1133) {
1134 if !text_options.detect_emails && !text_options.detect_urls {
1135 return;
1136 }
1137
1138 if text_options.detect_emails {
1139 let config = DetectionConfig {
1140 max_emails: text_options.max_emails,
1141 max_urls: text_options.max_urls,
1142 unique: from_binary_strings,
1143 };
1144 let emails = finder::find_emails(text_content, &config)
1145 .into_iter()
1146 .filter(|d| !from_binary_strings || is_binary_string_email_candidate(&d.email))
1147 .map(|d| OutputEmail {
1148 email: d.email,
1149 start_line: d.start_line,
1150 end_line: d.end_line,
1151 })
1152 .collect::<Vec<_>>();
1153 file_info_builder.emails(emails);
1154 }
1155
1156 if text_options.detect_urls {
1157 let config = DetectionConfig {
1158 max_emails: text_options.max_emails,
1159 max_urls: if from_binary_strings {
1160 0
1161 } else {
1162 text_options.max_urls
1163 },
1164 unique: !from_binary_strings,
1165 };
1166 let mut urls = finder::find_urls(text_content, &config)
1167 .into_iter()
1168 .filter_map(|d| {
1169 let url = if from_binary_strings {
1170 normalize_binary_string_url(&d.url)?
1171 } else {
1172 d.url
1173 };
1174 Some(OutputURL {
1175 url,
1176 start_line: d.start_line,
1177 end_line: d.end_line,
1178 })
1179 })
1180 .collect::<Vec<_>>();
1181 if from_binary_strings {
1182 let mut seen = HashSet::new();
1183 urls.retain(|url| seen.insert(url.url.clone()));
1184 if text_options.max_urls > 0 && urls.len() > text_options.max_urls {
1185 urls.truncate(text_options.max_urls);
1186 }
1187 }
1188 file_info_builder.urls(urls);
1189 }
1190}
1191
1192fn is_binary_string_email_candidate(email: &str) -> bool {
1193 let Some((local, domain)) = email.rsplit_once('@') else {
1194 return false;
1195 };
1196
1197 if !has_strong_binary_local_part(local) {
1198 return false;
1199 }
1200
1201 has_strong_binary_host_shape(domain)
1202}
1203
1204fn is_binary_string_url_candidate(url: &str) -> bool {
1205 let parsed = url::Url::parse(url).ok();
1206 let Some(parsed) = parsed else {
1207 return false;
1208 };
1209 let Some(host) = parsed.host_str() else {
1210 return false;
1211 };
1212
1213 has_strong_binary_host_shape(host) && has_meaningful_binary_url_context(&parsed)
1214}
1215
1216fn normalize_binary_string_url(url: &str) -> Option<String> {
1217 let mut parsed = url::Url::parse(url).ok()?;
1218
1219 if let Some(host) = parsed.host_str() {
1220 let normalized_host = normalize_binary_url_host(host);
1221 if normalized_host != host {
1222 parsed.set_host(Some(&normalized_host)).ok()?;
1223 }
1224 }
1225
1226 let normalized_path = normalize_binary_url_path(parsed.path());
1227 if normalized_path != parsed.path() {
1228 parsed.set_path(&normalized_path);
1229 }
1230
1231 let normalized = parsed.to_string();
1232 is_binary_string_url_candidate(&normalized).then_some(normalized)
1233}
1234
1235fn normalize_binary_url_host(host: &str) -> String {
1236 let mut labels = host.split('.').map(ToOwned::to_owned).collect::<Vec<_>>();
1237 if let Some(last_label) = labels.last_mut() {
1238 *last_label = trim_binary_tld_tail(last_label);
1239 }
1240 labels.join(".")
1241}
1242
/// Strips short trailing noise from a top-level-domain label.
///
/// If `label` starts (ignoring ASCII case) with one of the known TLDs and the
/// rest is 1 to 3 bytes long, begins with an ASCII digit and consists only of
/// ASCII alphanumerics, `!` or `$`, the bare lowercase TLD is returned
/// (e.g. `com1!0` becomes `com`). Otherwise `label` is returned unchanged.
fn trim_binary_tld_tail(label: &str) -> String {
    const KNOWN_TLDS: &[&str] = &["com", "org", "net", "edu", "gov", "mil", "io", "dev"];

    fn is_noise_tail(tail: &str) -> bool {
        tail.len() <= 3
            && tail.starts_with(|ch: char| ch.is_ascii_digit())
            && tail
                .chars()
                .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '!' | '$'))
    }

    KNOWN_TLDS
        .iter()
        .copied()
        .find(|tld| {
            // `get` yields `None` when the split point is out of range or not
            // on a character boundary, so no slicing here can panic.
            match (label.get(..tld.len()), label.get(tld.len()..)) {
                (Some(head), Some(tail)) => head.eq_ignore_ascii_case(tld) && is_noise_tail(tail),
                _ => false,
            }
        })
        .map_or_else(|| label.to_string(), |tld| tld.to_string())
}
1262
/// Drops one trailing `_`, `!` or `$` from `path` when the character just
/// before it is an ASCII digit; otherwise returns `path` unchanged.
fn normalize_binary_url_path(path: &str) -> String {
    path.strip_suffix(|ch: char| matches!(ch, '_' | '!' | '$'))
        .filter(|rest| rest.ends_with(|ch: char| ch.is_ascii_digit()))
        .unwrap_or(path)
        .to_string()
}
1277
1278fn is_binary_string_author_candidate(author: &str) -> bool {
1279 let trimmed = author.trim();
1280 if trimmed.is_empty()
1281 || !has_sufficient_alphabetic_content(trimmed)
1282 || has_excessive_at_noise(trimmed)
1283 {
1284 return false;
1285 }
1286
1287 if trimmed.contains('@') {
1288 let emails = finder::find_emails(
1289 trimmed,
1290 &DetectionConfig {
1291 max_emails: 4,
1292 max_urls: 0,
1293 unique: true,
1294 },
1295 );
1296 if emails.len() > 1 {
1297 return false;
1298 }
1299
1300 if let Some(extracted) = extract_named_author_from_binary_line(trimmed) {
1301 return !extracted.is_empty();
1302 }
1303
1304 let Some(email) = emails.first().map(|d| d.email.as_str()) else {
1305 return false;
1306 };
1307 if !is_binary_string_email_candidate(email) {
1308 return false;
1309 }
1310
1311 let (name, _) = split_name_email(trimmed);
1312 return name.as_deref().is_some_and(has_binary_name_like_shape);
1313 }
1314
1315 has_binary_name_like_shape(trimmed)
1316}
1317
1318fn has_meaningful_binary_url_context(parsed: &url::Url) -> bool {
1319 if parsed.path() != "/"
1320 && parsed
1321 .path()
1322 .split('/')
1323 .any(|segment| segment.chars().any(|c| c.is_ascii_alphabetic()) && segment.len() >= 2)
1324 {
1325 return true;
1326 }
1327
1328 if parsed.query().is_some() || parsed.fragment().is_some() {
1329 return true;
1330 }
1331
1332 let Some(host) = parsed.host_str() else {
1333 return false;
1334 };
1335
1336 let labels: Vec<&str> = host.split('.').collect();
1337 if labels.len() > 2 {
1338 return labels[..labels.len() - 1].iter().any(|label| {
1339 label.len() >= 3 && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 3
1340 });
1341 }
1342
1343 if matches!(labels.first(), Some(&"www")) {
1344 return true;
1345 }
1346
1347 if labels.len() == 2 {
1348 let domain = labels[0];
1349 let tld = labels[1];
1350 if domain.len() >= 8 && matches!(tld, "org" | "edu" | "gov" | "mil" | "io" | "dev") {
1351 return true;
1352 }
1353 }
1354
1355 labels
1356 .iter()
1357 .take(labels.len().saturating_sub(1))
1358 .any(|label| {
1359 label.contains('-') && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 4
1360 })
1361}
1362
/// Reports whether `local` contains a run of at least three consecutive ASCII
/// letters.
fn has_strong_binary_local_part(local: &str) -> bool {
    let mut run_length = 0usize;
    for ch in local.chars() {
        if ch.is_ascii_alphabetic() {
            run_length += 1;
            if run_length >= 3 {
                return true;
            }
        } else {
            run_length = 0;
        }
    }
    false
}
1368
/// Reports whether `host` looks like a real host name.
///
/// A leading `www` or `ftp` label is ignored. Of what remains, at least two
/// labels are required, and some label other than the last must be at least
/// 3 bytes long and contain at least 3 ASCII letters.
fn has_strong_binary_host_shape(host: &str) -> bool {
    let labels: Vec<&str> = host.split('.').collect();

    let relevant = match labels.split_first() {
        Some((&("www" | "ftp"), rest)) => rest,
        _ => labels.as_slice(),
    };

    // With fewer than two relevant labels there is nothing before the last
    // label, so `leading` is empty (or absent) and the answer is `false`.
    let Some((_, leading)) = relevant.split_last() else {
        return false;
    };

    leading.iter().any(|label| {
        label.len() >= 3 && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 3
    })
}
1389
1390fn extract_license_information(
1391 file_info_builder: &mut FileInfoBuilder,
1392 scan_errors: &mut Vec<String>,
1393 path: &Path,
1394 text_content: String,
1395 license_engine: Option<Arc<LicenseDetectionEngine>>,
1396 license_options: LicenseScanOptions,
1397 from_binary_strings: bool,
1398) -> Result<(), Error> {
1399 let Some(engine) = license_engine else {
1400 return Ok(());
1401 };
1402
1403 let detection_result = if license_options.min_score == 0 {
1404 engine.detect_with_kind_and_source(
1405 &text_content,
1406 license_options.unknown_licenses,
1407 from_binary_strings,
1408 &path.to_string_lossy(),
1409 )
1410 } else {
1411 engine.detect_with_kind_and_source_with_score(
1412 &text_content,
1413 license_options.unknown_licenses,
1414 from_binary_strings,
1415 &path.to_string_lossy(),
1416 f32::from(license_options.min_score),
1417 )
1418 };
1419
1420 match detection_result {
1421 Ok(detections) => {
1422 let query =
1423 Query::from_extracted_text(&text_content, engine.index(), from_binary_strings).ok();
1424 let mut model_detections = Vec::new();
1425 let mut model_clues = Vec::new();
1426
1427 for detection in &detections {
1428 let (public_detection, clue_matches) = convert_detection_to_model(
1429 detection,
1430 license_options,
1431 &text_content,
1432 query.as_ref(),
1433 );
1434
1435 if let Some(public_detection) = public_detection {
1436 model_detections.push(public_detection);
1437 }
1438
1439 model_clues.extend(clue_matches);
1440 }
1441
1442 if !model_detections.is_empty() {
1443 let expressions: Vec<String> = model_detections
1444 .iter()
1445 .filter(|d| !d.license_expression_spdx.is_empty())
1446 .map(|d| d.license_expression_spdx.clone())
1447 .collect();
1448
1449 if !expressions.is_empty() {
1450 let combined = crate::utils::spdx::combine_license_expressions(expressions);
1451 if let Some(expr) = combined {
1452 file_info_builder.license_expression(Some(expr));
1453 }
1454 }
1455 }
1456
1457 file_info_builder.license_detections(model_detections);
1458 file_info_builder.license_clues(model_clues);
1459 file_info_builder.percentage_of_license_text(
1460 query
1461 .as_ref()
1462 .map(|query| compute_percentage_of_license_text(query, &detections)),
1463 );
1464 }
1465 Err(e) => {
1466 scan_errors.push(format!("License detection failed: {}", e));
1467 }
1468 }
1469
1470 Ok(())
1471}
1472
1473fn convert_detection_to_model(
1474 detection: &crate::license_detection::LicenseDetection,
1475 license_options: LicenseScanOptions,
1476 text_content: &str,
1477 query: Option<&Query<'_>>,
1478) -> (Option<LicenseDetection>, Vec<Match>) {
1479 let matches: Vec<Match> = detection
1480 .matches
1481 .iter()
1482 .map(|m| convert_match_to_model(m, license_options, text_content, query))
1483 .collect();
1484
1485 if let Some(license_expression) = detection.license_expression.clone() {
1486 (
1487 Some(LicenseDetection {
1488 license_expression,
1489 license_expression_spdx: detection
1490 .license_expression_spdx
1491 .clone()
1492 .unwrap_or_default(),
1493 matches,
1494 detection_log: if license_options.include_diagnostics {
1495 detection.detection_log.clone()
1496 } else {
1497 Vec::new()
1498 },
1499 identifier: detection.identifier.clone(),
1500 }),
1501 Vec::new(),
1502 )
1503 } else {
1504 (None, matches)
1505 }
1506}
1507
1508fn convert_match_to_model(
1509 m: &crate::license_detection::models::LicenseMatch,
1510 license_options: LicenseScanOptions,
1511 text_content: &str,
1512 query: Option<&Query<'_>>,
1513) -> Match {
1514 let rule_url = if m.rule_url.is_empty() {
1515 None
1516 } else {
1517 Some(m.rule_url.clone())
1518 };
1519 let matched_text = if license_options.include_text {
1520 m.matched_text.clone().or_else(|| {
1521 Some(crate::license_detection::query::matched_text_from_text(
1522 text_content,
1523 m.start_line.get(),
1524 m.end_line.get(),
1525 ))
1526 })
1527 } else {
1528 None
1529 };
1530 let matched_text_diagnostics = if license_options.include_text_diagnostics {
1531 query.map(|query| matched_text_diagnostics_from_match(query, m))
1532 } else {
1533 None
1534 };
1535 Match {
1536 license_expression: m.license_expression.clone(),
1537 license_expression_spdx: m.license_expression_spdx.clone().unwrap_or_default(),
1538 from_file: m.from_file.clone(),
1539 start_line: m.start_line,
1540 end_line: m.end_line,
1541 matcher: Some(m.matcher.to_string()),
1542 score: m.score,
1543 matched_length: Some(m.matched_length),
1544 match_coverage: Some((f64::from(m.coverage()) * 100.0).round() / 100.0),
1545 rule_relevance: Some(m.rule_relevance),
1546 rule_identifier: Some(m.rule_identifier.clone()),
1547 rule_url,
1548 matched_text,
1549 referenced_filenames: m.referenced_filenames.clone(),
1550 matched_text_diagnostics,
1551 }
1552}
1553
1554fn compute_percentage_of_license_text(
1555 query: &Query<'_>,
1556 detections: &[crate::license_detection::LicenseDetection],
1557) -> f64 {
1558 let matched_positions: std::collections::HashSet<usize> = detections
1559 .iter()
1560 .flat_map(|detection| detection.matches.iter())
1561 .flat_map(|m| m.query_span().iter())
1562 .collect();
1563
1564 let query_tokens_length = query.tokens.len() + query.unknowns_by_pos.values().sum::<usize>();
1565 if query_tokens_length == 0 {
1566 return 0.0;
1567 }
1568
1569 let percentage = (matched_positions.len() as f64 / query_tokens_length as f64) * 100.0;
1570 (percentage * 100.0).round() / 100.0
1571}
1572
1573fn matched_text_diagnostics_from_match(
1574 query: &Query<'_>,
1575 license_match: &InternalLicenseMatch,
1576) -> String {
1577 let matched_positions: PositionSet = license_match.query_span().iter().collect();
1578 let Some(start_pos) = matched_positions.iter().min() else {
1579 return crate::license_detection::query::matched_text_from_text(
1580 &query.text,
1581 license_match.start_line.get(),
1582 license_match.end_line.get(),
1583 );
1584 };
1585 let Some(end_pos) = matched_positions.iter().max() else {
1586 return crate::license_detection::query::matched_text_from_text(
1587 &query.text,
1588 license_match.start_line.get(),
1589 license_match.end_line.get(),
1590 );
1591 };
1592
1593 crate::license_detection::query::matched_text_diagnostics_from_text(
1594 &query.text,
1595 query,
1596 &matched_positions,
1597 start_pos,
1598 end_pos,
1599 license_match.start_line.get(),
1600 license_match.end_line.get(),
1601 )
1602}
1603
1604fn should_skip_text_detection(path: &Path, buffer: &[u8]) -> bool {
1605 is_pem_certificate_file(path, buffer)
1606}
1607
/// Reports whether `path` is a Go source file that is not production code.
///
/// Returns `Ok(false)` for anything without a `.go` extension and `Ok(true)`
/// for names ending in `_test.go`; neither case touches the file system.
/// Otherwise the first 10 lines are inspected for a `//go:build` or
/// `// +build` line that has `test` as a whitespace-separated token.
///
/// Only the header lines are read, so large files are not loaded into memory
/// and invalid UTF-8 after the inspected lines does not cause an error.
///
/// # Errors
///
/// Returns the I/O error if the file cannot be opened or read, or if one of
/// the inspected lines is not valid UTF-8.
fn is_go_non_production_source(path: &Path) -> std::io::Result<bool> {
    const HEADER_LINES: usize = 10;

    if path.extension().and_then(|ext| ext.to_str()) != Some("go") {
        return Ok(false);
    }

    let is_test_file = path
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| name.ends_with("_test.go"));
    if is_test_file {
        return Ok(true);
    }

    // Fully qualified so the block needs no extra `use` for `BufRead`.
    let reader = std::io::BufReader::new(File::open(path)?);
    for line in std::io::BufRead::lines(reader).take(HEADER_LINES) {
        let line = line?;
        let trimmed = line.trim();
        let is_build_constraint =
            trimmed.starts_with("//go:build") || trimmed.starts_with("// +build");
        if is_build_constraint && trimmed.split_whitespace().any(|token| token == "test") {
            return Ok(true);
        }
    }

    Ok(false)
}
1628
1629fn is_pem_certificate_file(_path: &Path, buffer: &[u8]) -> bool {
1630 let prefix_len = buffer.len().min(8192);
1631 let prefix = String::from_utf8_lossy(&buffer[..prefix_len]);
1632 let trimmed_lines: Vec<&str> = prefix
1633 .lines()
1634 .map(str::trim)
1635 .filter(|line| !line.is_empty())
1636 .take(64)
1637 .collect();
1638
1639 let Some(first_line) = trimmed_lines.first().copied() else {
1640 return false;
1641 };
1642
1643 PEM_CERTIFICATE_HEADERS
1644 .iter()
1645 .any(|(begin, end)| first_line == *begin && trimmed_lines.iter().any(|line| line == end))
1646}
1647
1648fn process_directory(
1649 path: &Path,
1650 _metadata: &fs::Metadata,
1651 collect_info: bool,
1652 license_enabled: bool,
1653) -> FileInfo {
1654 let name = path
1655 .file_name()
1656 .unwrap_or_default()
1657 .to_string_lossy()
1658 .to_string();
1659 let base_name = name.clone(); FileInfo {
1662 name,
1663 base_name,
1664 extension: "".to_string(),
1665 path: path.to_string_lossy().to_string(),
1666 file_type: FileType::Directory,
1667 mime_type: None,
1668 file_type_label: None,
1669 size: 0,
1670 date: None,
1671 sha1: None,
1672 md5: None,
1673 sha256: None,
1674 sha1_git: None,
1675 programming_language: None,
1676 package_data: Vec::new(),
1677 license_expression: None,
1678 license_detections: Vec::new(),
1679 license_clues: Vec::new(),
1680 percentage_of_license_text: license_enabled.then_some(0.0),
1681 copyrights: Vec::new(),
1682 holders: Vec::new(),
1683 authors: Vec::new(),
1684 emails: Vec::new(),
1685 urls: Vec::new(),
1686 for_packages: Vec::new(),
1687 scan_errors: Vec::new(),
1688 license_policy: None,
1689 is_binary: collect_info.then_some(false),
1690 is_text: collect_info.then_some(false),
1691 is_archive: collect_info.then_some(false),
1692 is_media: collect_info.then_some(false),
1693 is_source: collect_info.then_some(false),
1694 is_script: collect_info.then_some(false),
1695 files_count: collect_info.then_some(0),
1696 dirs_count: collect_info.then_some(0),
1697 size_count: collect_info.then_some(0),
1698 source_count: None,
1699 is_legal: false,
1700 is_manifest: false,
1701 is_readme: false,
1702 is_top_level: false,
1703 is_key_file: false,
1704 is_community: false,
1705 is_generated: None,
1706 facets: vec![],
1707 tallies: None,
1708 }
1709}
1710
1711#[cfg(test)]
1712mod tests {
1713 use super::{
1714 LARGE_NON_SOURCE_JSON_LICENSE_TEXT_BYTES, cap_non_source_json_license_text,
1715 compute_percentage_of_license_text, convert_detection_to_model,
1716 extract_email_url_information, extract_named_author_from_binary_line,
1717 is_binary_string_author_candidate, is_binary_string_copyright_candidate,
1718 is_binary_string_email_candidate, is_binary_string_url_candidate,
1719 is_go_non_production_source, normalize_binary_string_url, process_file,
1720 };
1721 use crate::license_detection::LicenseDetection as InternalLicenseDetection;
1722 use crate::license_detection::index::LicenseIndex;
1723 use crate::license_detection::index::dictionary::TokenDictionary;
1724 use crate::license_detection::models::position_span::PositionSpan;
1725 use crate::license_detection::models::{LicenseMatch, MatchCoordinates, MatcherKind, RuleKind};
1726 use crate::license_detection::query::Query;
1727 use crate::models::{FileInfoBuilder, FileType, MatchScore};
1728 use crate::progress::{ProgressMode, ScanProgress};
1729 use crate::scanner::scan_options_fingerprint;
1730 use crate::scanner::{LicenseScanOptions, TextDetectionOptions};
1731 use crate::utils::file::FileInfoClassification;
1732 use std::fs;
1733 use std::path::Path;
1734 use std::time::{Duration, Instant};
1735 use tempfile::tempdir;
1736
1737 use super::maybe_record_processing_timeout;
1738
1739 use crate::models::LineNumber;
1740
1741 fn make_internal_match(rule_url: &str) -> LicenseMatch {
1742 LicenseMatch {
1743 rid: 0,
1744 license_expression: "mit".to_string(),
1745 license_expression_spdx: Some("MIT".to_string()),
1746 from_file: None,
1747 start_line: LineNumber::ONE,
1748 end_line: LineNumber::ONE,
1749 start_token: 0,
1750 end_token: 1,
1751 matcher: MatcherKind::Hash,
1752 score: MatchScore::from_percentage(1.0),
1753 matched_length: 3,
1754 rule_length: 3,
1755 match_coverage: 100.0,
1756 rule_relevance: 100,
1757 rule_identifier: "mit.LICENSE".to_string(),
1758 rule_url: rule_url.to_string(),
1759 matched_text: Some("MIT".to_string()),
1760 referenced_filenames: None,
1761 rule_kind: RuleKind::Text,
1762 is_from_license: true,
1763 rule_start_token: 0,
1764 coordinates: MatchCoordinates::query_region(PositionSpan::empty()),
1765 candidate_resemblance: 0.0,
1766 candidate_containment: 0.0,
1767 }
1768 }
1769
1770 fn make_detection(rule_url: &str) -> InternalLicenseDetection {
1771 InternalLicenseDetection {
1772 license_expression: Some("mit".to_string()),
1773 license_expression_spdx: Some("MIT".to_string()),
1774 matches: vec![make_internal_match(rule_url)],
1775 detection_log: vec![],
1776 identifier: Some("mit-test".to_string()),
1777 file_regions: Vec::new(),
1778 }
1779 }
1780
1781 fn create_test_index(entries: &[(&str, u16)], len_legalese: usize) -> LicenseIndex {
1782 let dictionary = TokenDictionary::new_with_legalese(entries);
1783 let mut index = LicenseIndex::new(dictionary);
1784 index.len_legalese = len_legalese;
1785 index
1786 }
1787
1788 #[test]
1789 fn test_convert_detection_to_model_preserves_rule_url() {
1790 let detection = make_detection(
1791 "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE",
1792 );
1793
1794 let (converted, clues) =
1795 convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
1796 let converted = converted.expect("detection should convert");
1797
1798 assert_eq!(
1799 converted.matches[0].rule_url.as_deref(),
1800 Some(
1801 "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE"
1802 )
1803 );
1804 assert!(clues.is_empty());
1805 }
1806
1807 #[test]
1808 fn test_convert_detection_to_model_emits_null_for_empty_rule_url() {
1809 let detection = make_detection("");
1810
1811 let (converted, clues) =
1812 convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
1813 let converted = converted.expect("detection should convert");
1814
1815 assert_eq!(converted.matches[0].rule_url, None);
1816 assert!(clues.is_empty());
1817 }
1818
1819 #[test]
1820 fn test_convert_detection_to_model_rounds_match_coverage() {
1821 let mut detection = make_detection("");
1822 detection.matches[0].score = MatchScore::from_percentage(81.82);
1823 detection.matches[0].match_coverage = 33.334;
1824
1825 let (converted, clues) =
1826 convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
1827 let converted = converted.expect("detection should convert");
1828
1829 assert_eq!(
1830 converted.matches[0].score,
1831 MatchScore::from_percentage(81.82)
1832 );
1833 assert_eq!(converted.matches[0].match_coverage, Some(33.33));
1834 assert!(clues.is_empty());
1835 }
1836
1837 #[test]
1838 fn test_convert_detection_to_model_routes_expressionless_detection_to_license_clues() {
1839 let mut detection = make_detection(
1840 "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/rules/license-clue_1.RULE",
1841 );
1842 detection.license_expression = None;
1843 detection.license_expression_spdx = None;
1844 detection.identifier = None;
1845 detection.matches[0].license_expression = "unknown-license-reference".to_string();
1846 detection.matches[0].license_expression_spdx =
1847 Some("LicenseRef-scancode-unknown-license-reference".to_string());
1848 detection.matches[0].rule_identifier = "license-clue_1.RULE".to_string();
1849 detection.matches[0].rule_kind = RuleKind::Clue;
1850
1851 let (converted, clues) = convert_detection_to_model(
1852 &detection,
1853 LicenseScanOptions {
1854 include_text: true,
1855 min_score: 0,
1856 ..LicenseScanOptions::default()
1857 },
1858 "clue text",
1859 None,
1860 );
1861
1862 assert!(converted.is_none());
1863 assert_eq!(clues.len(), 1);
1864 assert_eq!(clues[0].license_expression, "unknown-license-reference");
1865 assert_eq!(
1866 clues[0].license_expression_spdx,
1867 "LicenseRef-scancode-unknown-license-reference"
1868 );
1869 assert_eq!(
1870 clues[0].rule_identifier.as_deref(),
1871 Some("license-clue_1.RULE")
1872 );
1873 assert_eq!(clues[0].matched_text.as_deref(), Some("MIT"));
1874 assert_eq!(clues[0].matched_text_diagnostics, None);
1875 }
1876
1877 #[test]
1878 fn test_process_file_suppresses_non_actionable_pdf_extraction_failure() {
1879 let dir = tempdir().expect("tempdir");
1880 let path = dir.path().join("broken.pdf");
1881 fs::write(&path, b"%PDF-1.7\nthis is not a valid pdf object graph\n")
1882 .expect("write malformed pdf");
1883 let metadata = fs::metadata(&path).expect("metadata");
1884 let progress = ScanProgress::new(ProgressMode::Quiet);
1885
1886 let file_info = process_file(
1887 &path,
1888 &metadata,
1889 &progress,
1890 None,
1891 LicenseScanOptions::default(),
1892 &TextDetectionOptions::default(),
1893 );
1894
1895 assert!(file_info.scan_errors.is_empty());
1896 }
1897
1898 #[test]
1899 fn test_processing_timeout_is_not_duplicated_after_stage_specific_timeout() {
1900 let started = Instant::now() - Duration::from_secs(2);
1901 let mut scan_errors = vec!["Timeout before license scan (> 1.00s)".to_string()];
1902
1903 maybe_record_processing_timeout(&mut scan_errors, started, 1.0);
1904
1905 assert_eq!(scan_errors, vec!["Timeout before license scan (> 1.00s)"]);
1906 }
1907
1908 #[test]
1909 fn test_processing_timeout_is_recorded_when_no_timeout_error_exists() {
1910 let started = Instant::now() - Duration::from_secs(2);
1911 let mut scan_errors = Vec::new();
1912
1913 maybe_record_processing_timeout(&mut scan_errors, started, 1.0);
1914
1915 assert_eq!(
1916 scan_errors,
1917 vec!["Processing interrupted due to timeout after 1.00 seconds"]
1918 );
1919 }
1920
1921 #[test]
1922 fn test_cap_non_source_json_license_text_truncates_large_json() {
1923 let classification = FileInfoClassification {
1924 mime_type: "application/json".to_string(),
1925 file_type: "JSON text data".to_string(),
1926 programming_language: None,
1927 is_binary: false,
1928 is_text: true,
1929 is_archive: false,
1930 is_media: false,
1931 is_source: false,
1932 is_script: false,
1933 };
1934 let large_json = format!("{{\"items\":\"{}\"}}", "x".repeat(200_000));
1935
1936 let capped = cap_non_source_json_license_text(
1937 Path::new("resolution.json"),
1938 &classification,
1939 &large_json,
1940 );
1941
1942 assert!(capped.len() <= LARGE_NON_SOURCE_JSON_LICENSE_TEXT_BYTES);
1943 assert!(capped.len() < large_json.len());
1944 }
1945
1946 #[test]
1947 fn test_cap_non_source_json_license_text_keeps_sourcemaps_intact() {
1948 let classification = FileInfoClassification {
1949 mime_type: "application/json".to_string(),
1950 file_type: "JSON text data".to_string(),
1951 programming_language: None,
1952 is_binary: false,
1953 is_text: true,
1954 is_archive: false,
1955 is_media: false,
1956 is_source: false,
1957 is_script: false,
1958 };
1959 let large_json = format!("{{\"mappings\":\"{}\"}}", "x".repeat(200_000));
1960
1961 let capped = cap_non_source_json_license_text(
1962 Path::new("bundle.js.map"),
1963 &classification,
1964 &large_json,
1965 );
1966
1967 assert_eq!(capped.as_ref(), large_json);
1968 }
1969
1970 #[test]
1971 fn test_convert_detection_to_model_includes_diagnostics_when_enabled() {
1972 let text = concat!(
1973 "Reproduction and distribution of this file, with or without modification, are\n",
1974 "permitted in any medium without royalties provided the copyright notice\n",
1975 "and this notice are preserved. This file is offered as-is, without any warranties.\n",
1976 );
1977 let index = create_test_index(
1978 &[
1979 ("reproduction", 0),
1980 ("distribution", 1),
1981 ("file", 2),
1982 ("without", 3),
1983 ("modification", 4),
1984 ("permitted", 5),
1985 ("medium", 6),
1986 ("royalties", 7),
1987 ("provided", 8),
1988 ("copyright", 9),
1989 ("notice", 10),
1990 ("preserved", 11),
1991 ("offered", 12),
1992 ("warranties", 13),
1993 ],
1994 14,
1995 );
1996 let query = Query::from_extracted_text(text, &index, false).expect("query should build");
1997 let mut detection = make_detection(
1998 "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/fsf-ap.LICENSE",
1999 );
2000 detection.detection_log = vec!["imperfect-match-coverage".to_string()];
2001 detection.matches[0].license_expression = "fsf-ap".to_string();
2002 detection.matches[0].license_expression_spdx = Some("FSFAP".to_string());
2003 detection.matches[0].rule_identifier = "fsf-ap.LICENSE".to_string();
2004 detection.matches[0].matched_text = None;
2005 detection.matches[0].start_line = LineNumber::ONE;
2006 detection.matches[0].end_line = LineNumber::new(3).unwrap();
2007 detection.matches[0].start_token = 0;
2008 detection.matches[0].end_token = query.tokens.len();
2009 detection.matches[0].coordinates =
2010 MatchCoordinates::query_region(PositionSpan::from_positions(
2011 query
2012 .tokens
2013 .iter()
2014 .enumerate()
2015 .filter_map(|(idx, _)| (idx != 9).then_some(idx))
2016 .collect::<Vec<_>>(),
2017 ));
2018 detection.identifier = Some("fsf_ap-test".to_string());
2019
2020 let (converted, clues) = convert_detection_to_model(
2021 &detection,
2022 LicenseScanOptions {
2023 include_text: true,
2024 include_text_diagnostics: true,
2025 include_diagnostics: true,
2026 unknown_licenses: false,
2027 min_score: 0,
2028 },
2029 text,
2030 Some(&query),
2031 );
2032 let converted = converted.expect("detection should convert");
2033
2034 assert!(clues.is_empty());
2035 assert_eq!(converted.detection_log, vec!["imperfect-match-coverage"]);
2036 assert_eq!(
2037 converted.matches[0].matched_text.as_deref(),
2038 Some(text.trim_end())
2039 );
2040 let diagnostics = converted.matches[0]
2041 .matched_text_diagnostics
2042 .as_deref()
2043 .expect("diagnostics should be present");
2044 assert!(diagnostics.contains('['));
2045 assert!(diagnostics.contains(']'));
2046 assert_ne!(diagnostics, text.trim_end());
2047 }
2048
2049 #[test]
2050 fn test_extract_email_url_information_skips_binary_string_text() {
2051 let mut builder = FileInfoBuilder::default();
2052 let options = TextDetectionOptions {
2053 collect_info: false,
2054 detect_packages: false,
2055 detect_application_packages: false,
2056 detect_system_packages: false,
2057 detect_packages_in_compiled: false,
2058 detect_copyrights: false,
2059 detect_generated: false,
2060 detect_emails: true,
2061 detect_urls: true,
2062 max_emails: 50,
2063 max_urls: 50,
2064 timeout_seconds: 120.0,
2065 };
2066
2067 extract_email_url_information(
2068 &mut builder,
2069 "contact 6h@fo.lwft and visit http://gmail.com/",
2070 &options,
2071 true,
2072 );
2073
2074 let file = builder
2075 .name("binary.bin".to_string())
2076 .base_name("binary".to_string())
2077 .extension(".bin".to_string())
2078 .path("binary.bin".to_string())
2079 .file_type(FileType::File)
2080 .size(1)
2081 .build()
2082 .expect("builder should produce file info");
2083
2084 assert!(file.emails.is_empty(), "emails: {:?}", file.emails);
2085 assert!(file.urls.is_empty(), "urls: {:?}", file.urls);
2086 }
2087
2088 #[test]
2089 fn test_extract_email_url_information_keeps_good_binary_contacts() {
2090 let mut builder = FileInfoBuilder::default();
2091 let options = TextDetectionOptions {
2092 collect_info: false,
2093 detect_packages: false,
2094 detect_application_packages: false,
2095 detect_system_packages: false,
2096 detect_packages_in_compiled: false,
2097 detect_copyrights: false,
2098 detect_generated: false,
2099 detect_emails: true,
2100 detect_urls: true,
2101 max_emails: 50,
2102 max_urls: 50,
2103 timeout_seconds: 120.0,
2104 };
2105
2106 extract_email_url_information(
2107 &mut builder,
2108 "report bugs to bug-coreutils@gnu.org and see https://www.gnu.org/software/coreutils/",
2109 &options,
2110 true,
2111 );
2112
2113 let file = builder
2114 .name("binary.bin".to_string())
2115 .base_name("binary".to_string())
2116 .extension(".bin".to_string())
2117 .path("binary.bin".to_string())
2118 .file_type(FileType::File)
2119 .size(1)
2120 .build()
2121 .expect("builder should produce file info");
2122
2123 assert_eq!(file.emails.len(), 1, "emails: {:?}", file.emails);
2124 assert_eq!(file.emails[0].email, "bug-coreutils@gnu.org");
2125 assert_eq!(file.urls.len(), 1, "urls: {:?}", file.urls);
2126 assert_eq!(file.urls[0].url, "https://www.gnu.org/software/coreutils/");
2127 }
2128
2129 #[test]
2130 fn test_extract_email_url_information_deduplicates_binary_emails_before_cap() {
2131 let mut builder = FileInfoBuilder::default();
2132 let options = TextDetectionOptions {
2133 collect_info: false,
2134 detect_packages: false,
2135 detect_application_packages: false,
2136 detect_system_packages: false,
2137 detect_packages_in_compiled: false,
2138 detect_copyrights: false,
2139 detect_generated: false,
2140 detect_emails: true,
2141 detect_urls: false,
2142 max_emails: 2,
2143 max_urls: 50,
2144 timeout_seconds: 120.0,
2145 };
2146
2147 extract_email_url_information(
2148 &mut builder,
2149 "first jakub@redhat.com second jakub@redhat.com third contyk@redhat.com",
2150 &options,
2151 true,
2152 );
2153
2154 let file = builder
2155 .name("binary.bin".to_string())
2156 .base_name("binary".to_string())
2157 .extension(".bin".to_string())
2158 .path("binary.bin".to_string())
2159 .file_type(FileType::File)
2160 .size(1)
2161 .build()
2162 .expect("builder should produce file info");
2163
2164 assert_eq!(file.emails.len(), 2, "emails: {:?}", file.emails);
2165 assert_eq!(file.emails[0].email, "jakub@redhat.com");
2166 assert_eq!(file.emails[1].email, "contyk@redhat.com");
2167 }
2168
2169 #[test]
2170 fn test_binary_string_copyright_candidate_rejects_gibberish_holder_text() {
2171 let gibberish = "(c) S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9) M0@9s J'@y DH@9Ih@y";
2172 assert!(!is_binary_string_copyright_candidate(gibberish));
2173 }
2174
2175 #[test]
2176 fn test_binary_string_copyright_candidate_keeps_real_notice() {
2177 let notice = "Copyright nexB and others (c) 2012";
2178 assert!(is_binary_string_copyright_candidate(notice));
2179 }
2180
2181 #[test]
2182 fn test_binary_string_copyright_candidate_rejects_changelog_phrase() {
2183 assert!(!is_binary_string_copyright_candidate(
2184 "Copyright - split out libs"
2185 ));
2186 }
2187
2188 #[test]
2189 fn test_binary_string_email_candidate_rejects_gibberish() {
2190 assert!(!is_binary_string_email_candidate("6h@fo.lwft"));
2191 }
2192
2193 #[test]
2194 fn test_binary_string_email_candidate_keeps_gnu_bug_address() {
2195 assert!(is_binary_string_email_candidate("bug-coreutils@gnu.org"));
2196 }
2197
2198 #[test]
2199 fn test_binary_string_url_candidate_rejects_short_fake_host() {
2200 assert!(!is_binary_string_url_candidate("http://ftp.so/"));
2201 }
2202
2203 #[test]
2204 fn test_binary_string_url_candidate_keeps_gnu_help_url() {
2205 assert!(is_binary_string_url_candidate(
2206 "https://www.gnu.org/software/coreutils/"
2207 ));
2208 }
2209
2210 #[test]
2211 fn test_binary_string_url_candidate_rejects_bare_root_domain() {
2212 assert!(!is_binary_string_url_candidate("http://gmail.com/"));
2213 }
2214
2215 #[test]
2216 fn test_binary_string_url_candidate_keeps_project_subdomain_root() {
2217 assert!(is_binary_string_url_candidate("http://gcc.gnu.org"));
2218 }
2219
2220 #[test]
2221 fn test_binary_string_url_candidate_keeps_long_org_root_domain() {
2222 assert!(is_binary_string_url_candidate("https://publicsuffix.org/"));
2223 }
2224
2225 #[test]
2226 fn test_binary_string_url_candidate_keeps_short_project_path() {
2227 assert!(is_binary_string_url_candidate("http://tukaani.org/xz/"));
2228 }
2229
2230 #[test]
2231 fn test_normalize_binary_string_url_trims_certificate_host_tail_noise() {
2232 assert_eq!(
2233 normalize_binary_string_url("http://ocsp.digicert.com0/"),
2234 Some("http://ocsp.digicert.com/".to_string())
2235 );
2236 assert_eq!(
2237 normalize_binary_string_url("http://www.digicert.com1!0/"),
2238 Some("http://www.digicert.com/".to_string())
2239 );
2240 }
2241
2242 #[test]
2243 fn test_normalize_binary_string_url_trims_trailing_path_noise() {
2244 assert_eq!(
2245 normalize_binary_string_url(
2246 "http://cacerts.digicert.com/DigiCertTrustedG4TimeStampingRSA4096SHA2562025CA1.crt0_"
2247 ),
2248 Some(
2249 "http://cacerts.digicert.com/DigiCertTrustedG4TimeStampingRSA4096SHA2562025CA1.crt0".to_string()
2250 )
2251 );
2252 }
2253
2254 #[test]
2255 fn test_normalize_binary_string_url_preserves_clean_certificate_urls() {
2256 assert_eq!(
2257 normalize_binary_string_url("http://ocsp.digicert.com/"),
2258 Some("http://ocsp.digicert.com/".to_string())
2259 );
2260 assert_eq!(
2261 normalize_binary_string_url(
2262 "http://cacerts.digicert.com/DigiCertTrustedG4TimeStampingRSA4096SHA2562025CA1.crt0"
2263 ),
2264 Some(
2265 "http://cacerts.digicert.com/DigiCertTrustedG4TimeStampingRSA4096SHA2562025CA1.crt0".to_string()
2266 )
2267 );
2268 }
2269
2270 #[test]
2271 fn test_normalize_binary_string_url_does_not_trim_long_host_suffixes() {
2272 assert_eq!(
2273 normalize_binary_string_url("http://example.com0evil/"),
2274 None
2275 );
2276 }
2277
2278 #[test]
2279 fn test_normalize_binary_string_url_does_not_trim_legitimate_path_suffix() {
2280 assert_eq!(
2281 normalize_binary_string_url("http://example.com/path_/"),
2282 Some("http://example.com/path_/".to_string())
2283 );
2284 }
2285
2286 #[test]
2287 fn test_extract_email_url_information_caps_after_binary_normalization() {
2288 let mut builder = FileInfoBuilder::default();
2289 let text = [
2290 "http://ocsp.digicert.com0/",
2291 "http://ocsp.digicert.com0a/",
2292 "http://www.digicert.com1!0/",
2293 ]
2294 .join("\n");
2295 let options = TextDetectionOptions {
2296 detect_urls: true,
2297 max_urls: 2,
2298 ..TextDetectionOptions::default()
2299 };
2300
2301 extract_email_url_information(&mut builder, &text, &options, true);
2302 let file_info = builder
2303 .name("binary.txt".to_string())
2304 .base_name("binary".to_string())
2305 .extension(".txt".to_string())
2306 .path("binary.txt".to_string())
2307 .file_type(FileType::File)
2308 .size(0)
2309 .build()
2310 .expect("file info");
2311
2312 assert_eq!(file_info.urls.len(), 2);
2313 assert_eq!(file_info.urls[0].url, "http://ocsp.digicert.com/");
2314 assert_eq!(file_info.urls[1].url, "http://www.digicert.com/");
2315 }
2316
2317 #[test]
2318 fn test_binary_string_author_candidate_keeps_named_author_with_email() {
2319 assert!(is_binary_string_author_candidate(
2320 "Andreas Schneider <asn@redhat.com>"
2321 ));
2322 }
2323
2324 #[test]
2325 fn test_binary_string_author_candidate_rejects_gibberish() {
2326 assert!(!is_binary_string_author_candidate(
2327 "S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9"
2328 ));
2329 }
2330
2331 #[test]
2332 fn test_binary_string_author_candidate_rejects_changelog_phrase() {
2333 assert!(!is_binary_string_author_candidate(
2334 "Developers can enable them. - revert news user back to"
2335 ));
2336 }
2337
2338 #[test]
2339 fn test_extract_named_author_from_binary_line_recovers_by_prefix() {
2340 assert_eq!(
2341 extract_named_author_from_binary_line("Patch by Andreas Schneider <asn@redhat.com>"),
2342 Some("Andreas Schneider <asn@redhat.com>".to_string())
2343 );
2344 }
2345
2346 #[test]
2347 fn test_extract_named_author_from_binary_line_recovers_parenthesized_email() {
2348 assert_eq!(
2349 extract_named_author_from_binary_line(
2350 "same for both OpenSSL and NSS by Rob Crittenden (rcritten@redhat.com)"
2351 ),
2352 Some("Rob Crittenden (rcritten@redhat.com)".to_string())
2353 );
2354 }
2355
2356 #[test]
2357 fn test_extract_named_author_from_binary_line_rejects_plain_changelog_packager_line() {
2358 assert_eq!(
2359 extract_named_author_from_binary_line(
2360 "Rob Crittenden <rcritten@redhat.com> - 3.11.7-9"
2361 ),
2362 None
2363 );
2364 }
2365
2366 #[test]
2367 fn test_extract_named_author_from_binary_line_keeps_email_only_review_author() {
2368 assert_eq!(
2369 extract_named_author_from_binary_line(
2370 "Changes as per initial review by panemade@gmail.com"
2371 ),
2372 Some("panemade@gmail.com".to_string())
2373 );
2374 }
2375
2376 #[test]
2377 fn test_binary_string_author_candidate_rejects_multiple_emails_on_one_line() {
2378 assert!(!is_binary_string_author_candidate(
2379 "Rob Crittenden (rcritten@redhat.com) jakub@redhat.com"
2380 ));
2381 }
2382
2383 #[test]
2384 fn test_compute_percentage_of_license_text_counts_unknown_tokens() {
2385 let index = create_test_index(&[("alpha", 0), ("mit", 1)], 2);
2386 let text = "alpha MIT omega";
2387 let query = Query::from_extracted_text(text, &index, false).expect("query should build");
2388 let mut detection = make_detection("");
2389 detection.matches[0].coordinates =
2390 MatchCoordinates::query_region(PositionSpan::from_positions(vec![1]));
2391 detection.matches[0].start_token = 1;
2392 detection.matches[0].end_token = 2;
2393
2394 let percentage = compute_percentage_of_license_text(&query, &[detection]);
2395
2396 assert_eq!(percentage, 33.33);
2397 }
2398
2399 #[test]
2400 fn test_scan_options_fingerprint_changes_with_license_score() {
2401 let text_options = crate::scanner::TextDetectionOptions::default();
2402 let default_fingerprint = scan_options_fingerprint(
2403 &text_options,
2404 LicenseScanOptions {
2405 min_score: 0,
2406 ..LicenseScanOptions::default()
2407 },
2408 None,
2409 );
2410 let filtered_fingerprint = scan_options_fingerprint(
2411 &text_options,
2412 LicenseScanOptions {
2413 min_score: 70,
2414 ..LicenseScanOptions::default()
2415 },
2416 None,
2417 );
2418
2419 assert_ne!(default_fingerprint, filtered_fingerprint);
2420 }
2421
2422 #[test]
2423 fn test_is_go_non_production_source_for_test_filename() {
2424 let temp_dir = tempdir().unwrap();
2425 let path = temp_dir.path().join("scanner_test.go");
2426 fs::write(&path, "package scanner\n").unwrap();
2427
2428 assert!(is_go_non_production_source(&path).unwrap());
2429 }
2430
2431 #[test]
2432 fn test_is_go_non_production_source_for_build_tag() {
2433 let temp_dir = tempdir().unwrap();
2434 let path = temp_dir.path().join("scanner.go");
2435 fs::write(&path, "//go:build test\n\npackage scanner\n").unwrap();
2436
2437 assert!(is_go_non_production_source(&path).unwrap());
2438 }
2439
2440 #[test]
2441 fn test_is_go_non_production_source_for_regular_go_file() {
2442 let temp_dir = tempdir().unwrap();
2443 let path = temp_dir.path().join("scanner.go");
2444 fs::write(&path, "package scanner\n").unwrap();
2445
2446 assert!(!is_go_non_production_source(&path).unwrap());
2447 }
2448}