1use crate::license_detection::LicenseDetectionEngine;
2use crate::parsers::compiled_binary::{
3 is_supported_compiled_binary_format, try_parse_compiled_bytes,
4};
5use crate::parsers::try_parse_file;
6use crate::parsers::windows_executable::try_parse_windows_executable_bytes;
7use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha1_git, calculate_sha256};
8use crate::utils::text::{
9 remove_verbatim_escape_sequences, should_remove_verbatim_escape_sequences,
10};
11use anyhow::Error;
12use rayon::prelude::*;
13use std::collections::HashSet;
14use std::fs::{self, File};
15use std::io::{Read, Write};
16use std::path::Path;
17use std::sync::Arc;
18use std::time::{Duration, Instant};
19
20use crate::copyright::{
21 self, AuthorDetection, CopyrightDetection, CopyrightDetectionOptions, HolderDetection,
22};
23use crate::finder::{self, DetectionConfig};
24use crate::license_detection::PositionSet;
25use crate::license_detection::models::LicenseMatch as InternalLicenseMatch;
26use crate::license_detection::query::Query;
27use crate::models::{
28 Author, Copyright, DatasourceId, FileInfo, FileInfoBuilder, FileType, Holder, LicenseDetection,
29 Match, OutputEmail, OutputURL,
30};
31use crate::parsers::utils::split_name_email;
32use crate::progress::ScanProgress;
33use crate::scanner::collect::CollectedPaths;
34use crate::scanner::{LicenseScanOptions, ProcessResult, TextDetectionOptions};
35use crate::utils::file::{
36 ExtractedTextKind, augment_license_detection_text, classify_file_info,
37 extract_text_for_detection_with_diagnostics, get_creation_date,
38};
39use crate::utils::generated::generated_code_hints_from_bytes;
40use tempfile::TempDir;
41
/// Begin/end marker pairs for PEM-armored certificate blocks.
///
/// Each entry is `(begin_marker, end_marker)`; both the plain `CERTIFICATE` and the
/// `TRUSTED CERTIFICATE` variants are listed.
// NOTE(review): the code that consumes this table is not visible in this part of the
// file — presumably it is used to recognise certificate payloads; confirm at the use site.
const PEM_CERTIFICATE_HEADERS: &[(&str, &str)] = &[
    ("-----BEGIN CERTIFICATE-----", "-----END CERTIFICATE-----"),
    (
        "-----BEGIN TRUSTED CERTIFICATE-----",
        "-----END TRUSTED CERTIFICATE-----",
    ),
];
49
50pub fn process_collected(
51 collected: &CollectedPaths,
52 progress: Arc<ScanProgress>,
53 license_engine: Option<Arc<LicenseDetectionEngine>>,
54 license_options: LicenseScanOptions,
55 text_options: &TextDetectionOptions,
56) -> ProcessResult {
57 let mut all_files: Vec<FileInfo> = collected
58 .files
59 .par_iter()
60 .map(|(path, metadata)| {
61 let file_entry = process_file(
62 path,
63 metadata,
64 progress.as_ref(),
65 license_engine.clone(),
66 license_options,
67 text_options,
68 );
69 progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
70 file_entry
71 })
72 .collect();
73
74 for (path, metadata) in &collected.directories {
75 all_files.push(process_directory(
76 path,
77 metadata,
78 text_options.collect_info,
79 license_engine.is_some(),
80 ));
81 }
82
83 ProcessResult {
84 files: all_files,
85 excluded_count: collected.excluded_count,
86 }
87}
88
89pub fn process_collected_with_memory_limit(
90 collected: &CollectedPaths,
91 progress: Arc<ScanProgress>,
92 license_engine: Option<Arc<LicenseDetectionEngine>>,
93 license_options: LicenseScanOptions,
94 text_options: &TextDetectionOptions,
95 max_in_memory: i64,
96) -> ProcessResult {
97 if max_in_memory == 0 {
98 return process_collected(
99 collected,
100 progress,
101 license_engine,
102 license_options,
103 text_options,
104 );
105 }
106
107 let memory_limit = if max_in_memory < 0 {
108 0
109 } else {
110 max_in_memory as usize
111 };
112 let chunk_size = if max_in_memory < 0 {
113 256
114 } else {
115 memory_limit.max(1)
116 };
117
118 let mut retained_files = Vec::new();
119 let mut spill_store = None;
120
121 for chunk in collected.files.chunks(chunk_size) {
122 let processed_chunk: Vec<FileInfo> = chunk
123 .par_iter()
124 .map(|(path, metadata)| {
125 let file_entry = process_file(
126 path,
127 metadata,
128 progress.as_ref(),
129 license_engine.clone(),
130 license_options,
131 text_options,
132 );
133 progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
134 file_entry
135 })
136 .collect();
137
138 retain_or_spill_chunk(
139 processed_chunk,
140 &mut retained_files,
141 &mut spill_store,
142 memory_limit,
143 );
144 }
145
146 for (path, metadata) in &collected.directories {
147 let entry = process_directory(
148 path,
149 metadata,
150 text_options.collect_info,
151 license_engine.is_some(),
152 );
153 retain_or_spill_chunk(
154 vec![entry],
155 &mut retained_files,
156 &mut spill_store,
157 memory_limit,
158 );
159 }
160
161 if let Some(spill_store) = spill_store {
162 retained_files.extend(spill_store.load_all());
163 }
164
165 ProcessResult {
166 files: retained_files,
167 excluded_count: collected.excluded_count,
168 }
169}
170
171fn retain_or_spill_chunk(
172 chunk: Vec<FileInfo>,
173 retained_files: &mut Vec<FileInfo>,
174 spill_store: &mut Option<FileInfoSpillStore>,
175 memory_limit: usize,
176) {
177 if memory_limit == 0 {
178 spill_store
179 .get_or_insert_with(FileInfoSpillStore::new)
180 .spill(chunk);
181 return;
182 }
183
184 let remaining_capacity = memory_limit.saturating_sub(retained_files.len());
185 if remaining_capacity >= chunk.len() && spill_store.is_none() {
186 retained_files.extend(chunk);
187 return;
188 }
189
190 let mut chunk_iter = chunk.into_iter();
191 retained_files.extend(chunk_iter.by_ref().take(remaining_capacity));
192 let overflow: Vec<FileInfo> = chunk_iter.collect();
193 if !overflow.is_empty() {
194 spill_store
195 .get_or_insert_with(FileInfoSpillStore::new)
196 .spill(overflow);
197 }
198}
199
200struct FileInfoSpillStore {
201 temp_dir: TempDir,
202 batch_index: usize,
203}
204
205impl FileInfoSpillStore {
206 fn new() -> Self {
207 Self {
208 temp_dir: TempDir::new().expect("create spill dir"),
209 batch_index: 0,
210 }
211 }
212
213 fn spill(&mut self, files: Vec<FileInfo>) {
214 let path = self
215 .temp_dir
216 .path()
217 .join(format!("batch-{:06}.json.zst", self.batch_index));
218 self.batch_index += 1;
219
220 let payload = serde_json::to_vec(&files).expect("encode spilled file batch");
221 let file = File::create(path).expect("create spill batch file");
222 let mut encoder = zstd::Encoder::new(file, 3).expect("create spill encoder");
223 encoder
224 .write_all(&payload)
225 .expect("write spilled file batch");
226 encoder.finish().expect("finish spill encoder");
227 }
228
229 fn load_all(self) -> Vec<FileInfo> {
230 let mut paths: Vec<_> = fs::read_dir(self.temp_dir.path())
231 .expect("read spill dir")
232 .filter_map(Result::ok)
233 .map(|entry| entry.path())
234 .collect();
235 paths.sort();
236
237 let mut files = Vec::new();
238 for path in paths {
239 let file = File::open(path).expect("open spill batch");
240 let mut decoder = zstd::Decoder::new(file).expect("create spill decoder");
241 let mut payload = Vec::new();
242 decoder.read_to_end(&mut payload).expect("read spill batch");
243 let mut batch: Vec<FileInfo> =
244 serde_json::from_slice(&payload).expect("decode spilled file batch");
245 files.append(&mut batch);
246 }
247 files
248 }
249}
250
251fn process_file(
252 path: &Path,
253 metadata: &fs::Metadata,
254 progress: &ScanProgress,
255 license_engine: Option<Arc<LicenseDetectionEngine>>,
256 license_options: LicenseScanOptions,
257 text_options: &TextDetectionOptions,
258) -> FileInfo {
259 let mut scan_errors: Vec<String> = vec![];
260 let mut file_info_builder = FileInfoBuilder::default();
261 let license_enabled = license_engine.is_some();
262
263 let started = Instant::now();
264
265 let mut generated_flag = None;
266 let mut is_source_file = false;
267 match extract_information_from_content(
268 &mut file_info_builder,
269 &mut scan_errors,
270 path,
271 progress,
272 license_engine,
273 license_options,
274 text_options,
275 ) {
276 Ok((is_generated, sha256, is_source)) => {
277 generated_flag = is_generated;
278 is_source_file = is_source;
279 let _ = sha256;
280 }
281 Err(e) => scan_errors.push(e.to_string()),
282 };
283
284 maybe_record_processing_timeout(&mut scan_errors, started, text_options.timeout_seconds);
285
286 let mut file_info = file_info_builder
287 .name(path.file_name().unwrap().to_string_lossy().to_string())
288 .base_name(
289 path.file_stem()
290 .unwrap_or_default()
291 .to_string_lossy()
292 .to_string(),
293 )
294 .extension(
295 path.extension()
296 .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
297 )
298 .path(path.to_string_lossy().to_string())
299 .file_type(FileType::File)
300 .size(metadata.len())
301 .date(
302 text_options
303 .collect_info
304 .then(|| get_creation_date(metadata))
305 .flatten(),
306 )
307 .scan_errors(scan_errors)
308 .build()
309 .expect("FileInformationBuild not completely initialized");
310
311 if text_options.collect_info {
312 file_info.is_source = Some(is_source_file);
313 }
314
315 if file_info.programming_language.as_deref() == Some("Go")
316 && is_go_non_production_source(path).unwrap_or(false)
317 {
318 file_info.is_source = Some(false);
319 }
320
321 if text_options.detect_generated {
322 file_info.is_generated = Some(generated_flag.unwrap_or(false));
323 }
324
325 if file_info.percentage_of_license_text.is_none() && license_enabled {
326 file_info.percentage_of_license_text = Some(0.0);
327 }
328
329 file_info
330}
331
332fn extract_information_from_content(
333 file_info_builder: &mut FileInfoBuilder,
334 scan_errors: &mut Vec<String>,
335 path: &Path,
336 progress: &ScanProgress,
337 license_engine: Option<Arc<LicenseDetectionEngine>>,
338 license_options: LicenseScanOptions,
339 text_options: &TextDetectionOptions,
340) -> Result<(Option<bool>, String, bool), Error> {
341 let started = Instant::now();
342 let buffer = fs::read(path)?;
343 let license_enabled = license_engine.is_some();
344
345 if is_timeout_exceeded(started, text_options.timeout_seconds) {
346 return Err(Error::msg(format!(
347 "Timeout while reading file content (> {:.2}s)",
348 text_options.timeout_seconds
349 )));
350 }
351
352 let sha256 = calculate_sha256(&buffer);
353 let is_generated = text_options
354 .detect_generated
355 .then(|| !generated_code_hints_from_bytes(&buffer).is_empty());
356 let classification = classify_file_info(path, &buffer);
357
358 if text_options.collect_info {
359 file_info_builder
360 .sha1(Some(calculate_sha1(&buffer)))
361 .md5(Some(calculate_md5(&buffer)))
362 .sha256(Some(sha256.clone()))
363 .programming_language(classification.programming_language.clone())
364 .mime_type(Some(classification.mime_type.clone()))
365 .file_type_label(Some(classification.file_type.clone()))
366 .sha1_git(Some(calculate_sha1_git(&buffer)))
367 .is_binary(Some(classification.is_binary))
368 .is_text(Some(classification.is_text))
369 .is_archive(Some(classification.is_archive))
370 .is_media(Some(classification.is_media))
371 .is_source(Some(classification.is_source))
372 .is_script(Some(classification.is_script))
373 .files_count(Some(0))
374 .dirs_count(Some(0))
375 .size_count(Some(0));
376 }
377
378 if should_skip_text_detection(path, &buffer) {
379 return Ok((is_generated, sha256, classification.is_source));
380 }
381
382 if text_options.detect_packages {
385 let started = Instant::now();
386 let parse_result = try_parse_file(path)
387 .or_else(|| {
388 text_options
389 .detect_application_packages
390 .then(|| try_parse_windows_executable_bytes(path, &buffer))
391 .flatten()
392 })
393 .or_else(|| {
394 text_options
395 .detect_packages_in_compiled
396 .then(|| {
397 (classification.is_binary && is_supported_compiled_binary_format(&buffer))
398 .then(|| try_parse_compiled_bytes(&buffer))
399 .flatten()
400 })
401 .flatten()
402 });
403
404 if let Some(parse_result) = parse_result {
405 let packages = parse_result
406 .packages
407 .into_iter()
408 .filter(|package| {
409 let is_compiled_package = package
410 .datasource_id
411 .as_ref()
412 .is_some_and(is_compiled_datasource);
413 let is_system_package = package
414 .datasource_id
415 .as_ref()
416 .is_some_and(is_system_datasource);
417 if is_compiled_package {
418 text_options.detect_packages_in_compiled
419 } else if is_system_package {
420 text_options.detect_system_packages
421 } else {
422 text_options.detect_application_packages
423 }
424 })
425 .collect();
426 file_info_builder.package_data(packages);
427 scan_errors.extend(parse_result.scan_errors);
428 }
429 progress.record_detail_timing("scan:packages", started.elapsed().as_secs_f64());
430 }
431
432 if is_timeout_exceeded(started, text_options.timeout_seconds) {
433 return Err(Error::msg(format!(
434 "Timeout while extracting package/text metadata (> {:.2}s)",
435 text_options.timeout_seconds
436 )));
437 }
438
439 let (text_content, text_kind, text_scan_error) =
440 extract_text_for_detection_with_diagnostics(path, &buffer);
441 if let Some(text_scan_error) = text_scan_error {
442 scan_errors.push(text_scan_error);
443 }
444 let from_binary_strings = matches!(text_kind, ExtractedTextKind::BinaryStrings);
445
446 if is_timeout_exceeded(started, text_options.timeout_seconds) {
447 return Err(Error::msg(format!(
448 "Timeout while extracting text content (> {:.2}s)",
449 text_options.timeout_seconds
450 )));
451 }
452
453 if text_content.is_empty() {
454 return Ok((is_generated, sha256, classification.is_source));
455 }
456
457 if text_options.detect_copyrights {
458 extract_copyright_information(
459 file_info_builder,
460 path,
461 &text_content,
462 text_options.timeout_seconds,
463 from_binary_strings,
464 );
465 }
466 extract_email_url_information(
467 file_info_builder,
468 &text_content,
469 text_options,
470 from_binary_strings,
471 );
472
473 if is_timeout_exceeded(started, text_options.timeout_seconds) {
474 return Err(Error::msg(format!(
475 "Timeout before license scan (> {:.2}s)",
476 text_options.timeout_seconds
477 )));
478 }
479 let text_content_for_license_detection = if crate::utils::sourcemap::is_sourcemap(path) {
481 if let Some(sourcemap_content) =
482 crate::utils::sourcemap::extract_sourcemap_content(&text_content)
483 {
484 sourcemap_content
485 } else {
486 text_content
487 }
488 } else if should_remove_verbatim_escape_sequences(path, classification.is_source) {
489 remove_verbatim_escape_sequences(&text_content)
490 } else {
491 text_content
492 };
493 let text_content_for_license_detection =
494 augment_license_detection_text(path, &text_content_for_license_detection);
495 let text_content_for_license_detection = text_content_for_license_detection.into_owned();
496
497 if license_enabled {
498 let started = Instant::now();
499 extract_license_information(
500 file_info_builder,
501 scan_errors,
502 path,
503 text_content_for_license_detection.clone(),
504 license_engine,
505 license_options,
506 from_binary_strings,
507 )?;
508 progress.record_detail_timing("scan:licenses", started.elapsed().as_secs_f64());
509 } else {
510 extract_license_information(
511 file_info_builder,
512 scan_errors,
513 path,
514 text_content_for_license_detection,
515 license_engine,
516 license_options,
517 from_binary_strings,
518 )?;
519 }
520
521 Ok((is_generated, sha256, classification.is_source))
522}
523
/// Reports whether more than `timeout_seconds` have elapsed since `started`.
///
/// A timeout that is not finite (NaN, infinity) or not strictly positive means
/// "no timeout" and never trips.
fn is_timeout_exceeded(started: Instant, timeout_seconds: f64) -> bool {
    if !timeout_seconds.is_finite() || timeout_seconds <= 0.0 {
        return false;
    }
    let elapsed_seconds = started.elapsed().as_secs_f64();
    elapsed_seconds > timeout_seconds
}
529
530fn maybe_record_processing_timeout(
531 scan_errors: &mut Vec<String>,
532 started: Instant,
533 timeout_seconds: f64,
534) {
535 if is_timeout_exceeded(started, timeout_seconds)
536 && !scan_errors.iter().any(|error| is_timeout_scan_error(error))
537 {
538 scan_errors.push(format!(
539 "Processing interrupted due to timeout after {:.2} seconds",
540 timeout_seconds
541 ));
542 }
543}
544
/// Returns `true` if `error` contains one of the timeout message fragments
/// (matching is case-sensitive).
fn is_timeout_scan_error(error: &str) -> bool {
    const TIMEOUT_FRAGMENTS: &[&str] = &[
        "Timeout while ",
        "Timeout before ",
        "Processing interrupted due to timeout",
    ];

    TIMEOUT_FRAGMENTS
        .iter()
        .any(|fragment| error.contains(*fragment))
}
550
551fn is_system_datasource(datasource_id: &DatasourceId) -> bool {
552 matches!(
553 datasource_id,
554 DatasourceId::AlpineInstalledDb
555 | DatasourceId::DebianDistrolessInstalledDb
556 | DatasourceId::DebianInstalledFilesList
557 | DatasourceId::DebianInstalledMd5Sums
558 | DatasourceId::DebianInstalledStatusDb
559 | DatasourceId::FreebsdCompactManifest
560 | DatasourceId::RpmInstalledDatabaseBdb
561 | DatasourceId::RpmInstalledDatabaseNdb
562 | DatasourceId::RpmInstalledDatabaseSqlite
563 | DatasourceId::RpmYumdb
564 )
565}
566
567fn is_compiled_datasource(datasource_id: &DatasourceId) -> bool {
568 matches!(
569 datasource_id,
570 DatasourceId::GoBinary | DatasourceId::RustBinary
571 )
572}
573
574fn extract_copyright_information(
575 file_info_builder: &mut FileInfoBuilder,
576 path: &Path,
577 text_content: &str,
578 timeout_seconds: f64,
579 from_binary_strings: bool,
580) {
581 if copyright::is_credits_file(path) {
583 let author_detections = copyright::detect_credits_authors(text_content);
584 if !author_detections.is_empty() {
585 file_info_builder.authors(
586 author_detections
587 .into_iter()
588 .map(|a| Author {
589 author: a.author,
590 start_line: a.start_line,
591 end_line: a.end_line,
592 })
593 .collect(),
594 );
595 return;
596 }
597 }
598
599 let copyright_options = CopyrightDetectionOptions {
600 max_runtime: if timeout_seconds.is_finite() && timeout_seconds > 0.0 {
601 Some(Duration::from_secs_f64(timeout_seconds))
602 } else {
603 None
604 },
605 ..CopyrightDetectionOptions::default()
606 };
607
608 let (copyrights, holders, authors) =
609 copyright::detect_copyrights_with_options(text_content, ©right_options);
610 let (copyrights, holders, authors) = if from_binary_strings {
611 prune_binary_string_detections(text_content, copyrights, holders, authors)
612 } else {
613 (copyrights, holders, authors)
614 };
615
616 file_info_builder.copyrights(
617 copyrights
618 .into_iter()
619 .map(|c| Copyright {
620 copyright: c.copyright,
621 start_line: c.start_line,
622 end_line: c.end_line,
623 })
624 .collect::<Vec<Copyright>>(),
625 );
626 file_info_builder.holders(
627 holders
628 .into_iter()
629 .map(|h| Holder {
630 holder: h.holder,
631 start_line: h.start_line,
632 end_line: h.end_line,
633 })
634 .collect::<Vec<Holder>>(),
635 );
636 file_info_builder.authors(
637 authors
638 .into_iter()
639 .map(|a| Author {
640 author: a.author,
641 start_line: a.start_line,
642 end_line: a.end_line,
643 })
644 .collect::<Vec<Author>>(),
645 );
646}
647
648fn prune_binary_string_detections(
649 text_content: &str,
650 copyrights: Vec<CopyrightDetection>,
651 holders: Vec<HolderDetection>,
652 authors: Vec<AuthorDetection>,
653) -> (
654 Vec<CopyrightDetection>,
655 Vec<HolderDetection>,
656 Vec<AuthorDetection>,
657) {
658 let kept_copyrights: Vec<CopyrightDetection> = copyrights
659 .into_iter()
660 .filter(|c| is_binary_string_copyright_candidate(&c.copyright))
661 .collect();
662
663 let kept_holders: Vec<HolderDetection> = holders
664 .into_iter()
665 .filter(|holder| {
666 kept_copyrights.iter().any(|copyright| {
667 ranges_overlap(
668 holder.start_line,
669 holder.end_line,
670 copyright.start_line,
671 copyright.end_line,
672 )
673 })
674 })
675 .collect();
676
677 let kept_authors = authors
678 .into_iter()
679 .filter(|author| is_binary_string_author_candidate(&author.author))
680 .chain(extract_binary_string_author_supplements(text_content))
681 .filter({
682 let mut seen = HashSet::new();
683 move |author| seen.insert(author.author.clone())
684 })
685 .collect();
686
687 (kept_copyrights, kept_holders, kept_authors)
688}
689
/// Returns `true` when the inclusive ranges `[a_start, a_end]` and `[b_start, b_end]`
/// share at least one value.
fn ranges_overlap(a_start: usize, a_end: usize, b_start: usize, b_end: usize) -> bool {
    let a_starts_after_b_ends = a_start > b_end;
    let b_starts_after_a_ends = b_start > a_end;
    !(a_starts_after_b_ends || b_starts_after_a_ends)
}
693
694fn is_binary_string_copyright_candidate(text: &str) -> bool {
695 if contains_year(text) {
696 return true;
697 }
698
699 let trimmed = text.trim();
700 let lower = trimmed.to_ascii_lowercase();
701 let tail = if let Some(tail) = lower.strip_prefix("copyright") {
702 tail.trim()
703 } else {
704 lower.trim()
705 };
706 let original_tail = if lower.starts_with("copyright") {
707 trimmed["copyright".len()..].trim()
708 } else {
709 trimmed
710 };
711
712 if tail.is_empty() || !has_sufficient_alphabetic_content(tail) || has_excessive_at_noise(tail) {
713 return false;
714 }
715
716 let alpha_tokens: Vec<&str> = tail
717 .split_whitespace()
718 .filter(|token| token.chars().any(|c| c.is_alphabetic()))
719 .collect();
720
721 if alpha_tokens.len() <= 1 {
722 return has_explicit_copyright_marker(text)
723 && alpha_tokens.iter().any(|token| {
724 is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric()))
725 });
726 }
727
728 if !has_explicit_copyright_marker(text) {
729 return false;
730 }
731
732 has_binary_name_like_shape(original_tail)
733}
734
735fn extract_binary_string_author_supplements(text_content: &str) -> Vec<AuthorDetection> {
736 let mut authors = Vec::new();
737
738 for (line_index, line) in text_content.lines().enumerate() {
739 if let Some(author) = extract_named_author_from_binary_line(line) {
740 authors.push(AuthorDetection {
741 author,
742 start_line: line_index + 1,
743 end_line: line_index + 1,
744 });
745 }
746 }
747
748 authors
749}
750
751fn extract_named_author_from_binary_line(line: &str) -> Option<String> {
752 let line = line.trim();
753 if line.is_empty() {
754 return None;
755 }
756
757 let emails = finder::find_emails(
758 line,
759 &DetectionConfig {
760 max_emails: 4,
761 max_urls: 0,
762 unique: false,
763 },
764 );
765 let email = emails.first()?.email.as_str();
766 if !is_binary_string_email_candidate(email) {
767 return None;
768 }
769
770 let lower_line = line.to_ascii_lowercase();
771 let email_start = lower_line.find(email)?;
772 let raw_prefix = &line[..email_start];
773 let has_author_marker = contains_binary_author_marker(raw_prefix);
774 let prefix = take_suffix_after_last_author_marker(raw_prefix)?;
775 let prefix = prefix
776 .trim_start_matches(['*', '-', ':', ';', ',', '.', ' '])
777 .trim_end_matches(['<', '(', '[', ' ', ':', '-'])
778 .trim();
779
780 let (name, _) = split_name_email(prefix);
781 let name = name.or_else(|| {
782 let trimmed = prefix.trim_matches(|c: char| c == '<' || c == '(' || c == '[' || c == ' ');
783 (!trimmed.is_empty()).then(|| trimmed.to_string())
784 });
785
786 let Some(name) = name.map(|name| name.trim().to_string()) else {
787 if has_author_marker {
788 return Some(email.to_string());
789 }
790 return None;
791 };
792
793 if name.is_empty() && has_author_marker {
794 return Some(email.to_string());
795 }
796
797 if !has_binary_name_like_shape(&name) {
798 return None;
799 }
800
801 if line.contains(&format!("<{email}>")) {
802 Some(format!("{name} <{email}>"))
803 } else if line.contains(&format!("({email})")) {
804 Some(format!("{name} ({email})"))
805 } else {
806 Some(format!("{name} {email}"))
807 }
808}
809
/// Returns the trimmed text following the last occurrence of `marker` in `text`.
///
/// The search is done on an ASCII-lowercased copy of `text`, so `marker` is expected
/// in lowercase. Returns `None` when the marker does not occur.
fn take_suffix_after_last_ascii_marker<'a>(text: &'a str, marker: &str) -> Option<&'a str> {
    // ASCII lowercasing preserves byte offsets, so the index applies to `text` as well.
    text.to_ascii_lowercase()
        .rfind(marker)
        .map(|marker_start| text[marker_start + marker.len()..].trim())
}
815
816fn take_suffix_after_last_author_marker(text: &str) -> Option<&str> {
817 const MARKERS: &[&str] = &[
818 " patch author: ",
819 " patch author ",
820 " written by ",
821 " contributed by ",
822 " original work done by ",
823 " work done by ",
824 " thanks to ",
825 " review by ",
826 " by ",
827 " from ",
828 ];
829
830 MARKERS
831 .iter()
832 .filter_map(|marker| take_suffix_after_last_ascii_marker(text, marker))
833 .next()
834}
835
836fn contains_binary_author_marker(text: &str) -> bool {
837 take_suffix_after_last_author_marker(text).is_some()
838}
839
840fn has_binary_name_like_shape(text: &str) -> bool {
841 let trimmed = text.trim();
842 if trimmed.is_empty() || trimmed.contains(" - ") || trimmed.chars().any(|c| c.is_ascii_digit())
843 {
844 return false;
845 }
846
847 let tokens: Vec<&str> = trimmed
848 .split(|c: char| !c.is_ascii_alphabetic() && c != '.' && c != '\'')
849 .filter(|segment| segment.chars().any(|c| c.is_ascii_alphabetic()))
850 .collect();
851 if tokens.is_empty() {
852 return false;
853 }
854
855 let uppercase_like = tokens
856 .iter()
857 .filter(|token| {
858 let token = token.trim_matches('.');
859 token
860 .chars()
861 .find(|c| c.is_ascii_alphabetic())
862 .is_some_and(|c| c.is_ascii_uppercase())
863 })
864 .count();
865
866 uppercase_like >= 2 && uppercase_like * 2 >= tokens.len()
867 || tokens
868 .iter()
869 .any(|token| is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric())))
870}
871
/// Returns `true` when `text` has at least one ASCII alphanumeric character and at
/// least half of its ASCII alphanumerics are letters.
fn has_sufficient_alphabetic_content(text: &str) -> bool {
    let (letters, alphanumerics) =
        text.chars()
            .fold((0usize, 0usize), |(letters, alphanumerics), c| {
                (
                    letters + usize::from(c.is_ascii_alphabetic()),
                    alphanumerics + usize::from(c.is_ascii_alphanumeric()),
                )
            });

    alphanumerics != 0 && letters * 2 >= alphanumerics
}
881
/// Returns `true` when `text` contains three or more `@` characters.
fn has_excessive_at_noise(text: &str) -> bool {
    const MAX_TOLERATED_AT_SIGNS: usize = 2;
    text.matches('@').count() > MAX_TOLERATED_AT_SIGNS
}
885
/// Returns `true` when `text` contains `(c)`, the `©` sign or `copr`
/// (ASCII case-insensitive).
fn has_explicit_copyright_marker(text: &str) -> bool {
    const MARKERS: [&str; 3] = ["(c)", "©", "copr"];

    let lowered = text.to_ascii_lowercase();
    MARKERS.iter().any(|marker| lowered.contains(*marker))
}
890
/// Returns `true` when `text` contains four consecutive ASCII digits whose first
/// digit is `1` or `2` and whose second digit is `9` or `0`.
fn contains_year(text: &str) -> bool {
    text.as_bytes().windows(4).any(|window| {
        matches!(
            window,
            [b'1' | b'2', b'9' | b'0', third, fourth]
                if third.is_ascii_digit() && fourth.is_ascii_digit()
        )
    })
}
899
/// Returns `true` when `token` equals one of the known organisation words
/// (ASCII case-insensitive), e.g. `inc`, `ltd` or `foundation`.
fn is_company_like_suffix(token: &str) -> bool {
    const COMPANY_WORDS: &[&str] = &[
        "inc",
        "corp",
        "corporation",
        "co",
        "company",
        "ltd",
        "llc",
        "gmbh",
        "foundation",
        "project",
        "systems",
        "software",
        "technologies",
        "technology",
    ];

    COMPANY_WORDS
        .iter()
        .any(|word| token.eq_ignore_ascii_case(word))
}
919
920fn extract_email_url_information(
921 file_info_builder: &mut FileInfoBuilder,
922 text_content: &str,
923 text_options: &TextDetectionOptions,
924 from_binary_strings: bool,
925) {
926 if !text_options.detect_emails && !text_options.detect_urls {
927 return;
928 }
929
930 if text_options.detect_emails {
931 let config = DetectionConfig {
932 max_emails: text_options.max_emails,
933 max_urls: text_options.max_urls,
934 unique: from_binary_strings,
935 };
936 let emails = finder::find_emails(text_content, &config)
937 .into_iter()
938 .filter(|d| !from_binary_strings || is_binary_string_email_candidate(&d.email))
939 .map(|d| OutputEmail {
940 email: d.email,
941 start_line: d.start_line,
942 end_line: d.end_line,
943 })
944 .collect::<Vec<_>>();
945 file_info_builder.emails(emails);
946 }
947
948 if text_options.detect_urls {
949 let config = DetectionConfig {
950 max_emails: text_options.max_emails,
951 max_urls: text_options.max_urls,
952 unique: true,
953 };
954 let urls = finder::find_urls(text_content, &config)
955 .into_iter()
956 .filter(|d| !from_binary_strings || is_binary_string_url_candidate(&d.url))
957 .map(|d| OutputURL {
958 url: d.url,
959 start_line: d.start_line,
960 end_line: d.end_line,
961 })
962 .collect::<Vec<_>>();
963 file_info_builder.urls(urls);
964 }
965}
966
967fn is_binary_string_email_candidate(email: &str) -> bool {
968 let Some((local, domain)) = email.rsplit_once('@') else {
969 return false;
970 };
971
972 if !has_strong_binary_local_part(local) {
973 return false;
974 }
975
976 has_strong_binary_host_shape(domain)
977}
978
979fn is_binary_string_url_candidate(url: &str) -> bool {
980 let parsed = url::Url::parse(url).ok();
981 let Some(parsed) = parsed else {
982 return false;
983 };
984 let Some(host) = parsed.host_str() else {
985 return false;
986 };
987
988 has_strong_binary_host_shape(host) && has_meaningful_binary_url_context(&parsed)
989}
990
991fn is_binary_string_author_candidate(author: &str) -> bool {
992 let trimmed = author.trim();
993 if trimmed.is_empty()
994 || !has_sufficient_alphabetic_content(trimmed)
995 || has_excessive_at_noise(trimmed)
996 {
997 return false;
998 }
999
1000 if trimmed.contains('@') {
1001 let emails = finder::find_emails(
1002 trimmed,
1003 &DetectionConfig {
1004 max_emails: 4,
1005 max_urls: 0,
1006 unique: true,
1007 },
1008 );
1009 if emails.len() > 1 {
1010 return false;
1011 }
1012
1013 if let Some(extracted) = extract_named_author_from_binary_line(trimmed) {
1014 return !extracted.is_empty();
1015 }
1016
1017 let Some(email) = emails.first().map(|d| d.email.as_str()) else {
1018 return false;
1019 };
1020 if !is_binary_string_email_candidate(email) {
1021 return false;
1022 }
1023
1024 let (name, _) = split_name_email(trimmed);
1025 return name.as_deref().is_some_and(has_binary_name_like_shape);
1026 }
1027
1028 has_binary_name_like_shape(trimmed)
1029}
1030
1031fn has_meaningful_binary_url_context(parsed: &url::Url) -> bool {
1032 if parsed.path() != "/"
1033 && parsed
1034 .path()
1035 .split('/')
1036 .any(|segment| segment.chars().any(|c| c.is_ascii_alphabetic()) && segment.len() >= 2)
1037 {
1038 return true;
1039 }
1040
1041 if parsed.query().is_some() || parsed.fragment().is_some() {
1042 return true;
1043 }
1044
1045 let Some(host) = parsed.host_str() else {
1046 return false;
1047 };
1048
1049 let labels: Vec<&str> = host.split('.').collect();
1050 if labels.len() > 2 {
1051 return labels[..labels.len() - 1].iter().any(|label| {
1052 label.len() >= 3 && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 3
1053 });
1054 }
1055
1056 if matches!(labels.first(), Some(&"www")) {
1057 return true;
1058 }
1059
1060 if labels.len() == 2 {
1061 let domain = labels[0];
1062 let tld = labels[1];
1063 if domain.len() >= 8 && matches!(tld, "org" | "edu" | "gov" | "mil" | "io" | "dev") {
1064 return true;
1065 }
1066 }
1067
1068 labels
1069 .iter()
1070 .take(labels.len().saturating_sub(1))
1071 .any(|label| {
1072 label.contains('-') && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 4
1073 })
1074}
1075
/// Returns `true` when `local` contains a run of at least three consecutive ASCII letters.
fn has_strong_binary_local_part(local: &str) -> bool {
    let mut letter_run = 0usize;
    for c in local.chars() {
        if c.is_ascii_alphabetic() {
            letter_run += 1;
            if letter_run >= 3 {
                return true;
            }
        } else {
            letter_run = 0;
        }
    }
    false
}
1081
/// Heuristic check that `host` looks like a real host name.
///
/// A leading `www` or `ftp` label is ignored. Of the remaining labels there must be
/// at least two, and one of those before the final label must have at least three
/// ASCII letters.
fn has_strong_binary_host_shape(host: &str) -> bool {
    let labels: Vec<&str> = host.split('.').collect();

    let relevant = match labels.as_slice() {
        ["www" | "ftp", rest @ ..] => rest,
        all => all,
    };

    match relevant {
        [candidates @ .., _final_label] if !candidates.is_empty() => {
            candidates.iter().any(|label| {
                label.len() >= 3 && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 3
            })
        }
        _ => false,
    }
}
1102
1103fn extract_license_information(
1104 file_info_builder: &mut FileInfoBuilder,
1105 scan_errors: &mut Vec<String>,
1106 path: &Path,
1107 text_content: String,
1108 license_engine: Option<Arc<LicenseDetectionEngine>>,
1109 license_options: LicenseScanOptions,
1110 from_binary_strings: bool,
1111) -> Result<(), Error> {
1112 let Some(engine) = license_engine else {
1113 return Ok(());
1114 };
1115
1116 let detection_result = if license_options.min_score == 0 {
1117 engine.detect_with_kind_and_source(
1118 &text_content,
1119 license_options.unknown_licenses,
1120 from_binary_strings,
1121 &path.to_string_lossy(),
1122 )
1123 } else {
1124 engine.detect_with_kind_and_source_with_score(
1125 &text_content,
1126 license_options.unknown_licenses,
1127 from_binary_strings,
1128 &path.to_string_lossy(),
1129 license_options.min_score as f32,
1130 )
1131 };
1132
1133 match detection_result {
1134 Ok(detections) => {
1135 let query =
1136 Query::from_extracted_text(&text_content, engine.index(), from_binary_strings).ok();
1137 let mut model_detections = Vec::new();
1138 let mut model_clues = Vec::new();
1139
1140 for detection in &detections {
1141 let (public_detection, clue_matches) = convert_detection_to_model(
1142 detection,
1143 license_options,
1144 &text_content,
1145 query.as_ref(),
1146 );
1147
1148 if let Some(public_detection) = public_detection {
1149 model_detections.push(public_detection);
1150 }
1151
1152 model_clues.extend(clue_matches);
1153 }
1154
1155 if !model_detections.is_empty() {
1156 let expressions: Vec<String> = model_detections
1157 .iter()
1158 .filter(|d| !d.license_expression_spdx.is_empty())
1159 .map(|d| d.license_expression_spdx.clone())
1160 .collect();
1161
1162 if !expressions.is_empty() {
1163 let combined = crate::utils::spdx::combine_license_expressions(expressions);
1164 if let Some(expr) = combined {
1165 file_info_builder.license_expression(Some(expr));
1166 }
1167 }
1168 }
1169
1170 file_info_builder.license_detections(model_detections);
1171 file_info_builder.license_clues(model_clues);
1172 file_info_builder.percentage_of_license_text(
1173 query
1174 .as_ref()
1175 .map(|query| compute_percentage_of_license_text(query, &detections)),
1176 );
1177 }
1178 Err(e) => {
1179 scan_errors.push(format!("License detection failed: {}", e));
1180 }
1181 }
1182
1183 Ok(())
1184}
1185
1186fn convert_detection_to_model(
1187 detection: &crate::license_detection::LicenseDetection,
1188 license_options: LicenseScanOptions,
1189 text_content: &str,
1190 query: Option<&Query<'_>>,
1191) -> (Option<LicenseDetection>, Vec<Match>) {
1192 let matches: Vec<Match> = detection
1193 .matches
1194 .iter()
1195 .map(|m| convert_match_to_model(m, license_options, text_content, query))
1196 .collect();
1197
1198 if let Some(license_expression) = detection.license_expression.clone() {
1199 (
1200 Some(LicenseDetection {
1201 license_expression,
1202 license_expression_spdx: detection
1203 .license_expression_spdx
1204 .clone()
1205 .unwrap_or_default(),
1206 matches,
1207 detection_log: if license_options.include_diagnostics {
1208 detection.detection_log.clone()
1209 } else {
1210 Vec::new()
1211 },
1212 identifier: detection.identifier.clone(),
1213 }),
1214 Vec::new(),
1215 )
1216 } else {
1217 (None, matches)
1218 }
1219}
1220
1221fn convert_match_to_model(
1222 m: &crate::license_detection::models::LicenseMatch,
1223 license_options: LicenseScanOptions,
1224 text_content: &str,
1225 query: Option<&Query<'_>>,
1226) -> Match {
1227 let output_metric = |value: f32| ((value as f64) * 100.0).round() / 100.0;
1228 let rule_url = if m.rule_url.is_empty() {
1229 None
1230 } else {
1231 Some(m.rule_url.clone())
1232 };
1233 let matched_text = if license_options.include_text {
1234 m.matched_text.clone().or_else(|| {
1235 Some(crate::license_detection::query::matched_text_from_text(
1236 text_content,
1237 m.start_line,
1238 m.end_line,
1239 ))
1240 })
1241 } else {
1242 None
1243 };
1244 let matched_text_diagnostics = if license_options.include_text_diagnostics {
1245 query.map(|query| matched_text_diagnostics_from_match(query, m))
1246 } else {
1247 None
1248 };
1249 Match {
1250 license_expression: m.license_expression.clone(),
1251 license_expression_spdx: m.license_expression_spdx.clone().unwrap_or_default(),
1252 from_file: m.from_file.clone(),
1253 start_line: m.start_line,
1254 end_line: m.end_line,
1255 matcher: Some(m.matcher.to_string()),
1256 score: output_metric(m.score),
1257 matched_length: Some(m.matched_length),
1258 match_coverage: Some(output_metric(m.coverage())),
1259 rule_relevance: Some(m.rule_relevance as usize),
1260 rule_identifier: Some(m.rule_identifier.clone()),
1261 rule_url,
1262 matched_text,
1263 referenced_filenames: m.referenced_filenames.clone(),
1264 matched_text_diagnostics,
1265 }
1266}
1267
1268fn compute_percentage_of_license_text(
1269 query: &Query<'_>,
1270 detections: &[crate::license_detection::LicenseDetection],
1271) -> f64 {
1272 let matched_positions: std::collections::HashSet<usize> = detections
1273 .iter()
1274 .flat_map(|detection| detection.matches.iter())
1275 .flat_map(|m| m.query_span().iter())
1276 .collect();
1277
1278 let query_tokens_length = query.tokens.len() + query.unknowns_by_pos.values().sum::<usize>();
1279 if query_tokens_length == 0 {
1280 return 0.0;
1281 }
1282
1283 let percentage = (matched_positions.len() as f64 / query_tokens_length as f64) * 100.0;
1284 (percentage * 100.0).round() / 100.0
1285}
1286
1287fn matched_text_diagnostics_from_match(
1288 query: &Query<'_>,
1289 license_match: &InternalLicenseMatch,
1290) -> String {
1291 let matched_positions: PositionSet = license_match.query_span().iter().collect();
1292 let Some(start_pos) = matched_positions.iter().min() else {
1293 return crate::license_detection::query::matched_text_from_text(
1294 &query.text,
1295 license_match.start_line,
1296 license_match.end_line,
1297 );
1298 };
1299 let Some(end_pos) = matched_positions.iter().max() else {
1300 return crate::license_detection::query::matched_text_from_text(
1301 &query.text,
1302 license_match.start_line,
1303 license_match.end_line,
1304 );
1305 };
1306
1307 crate::license_detection::query::matched_text_diagnostics_from_text(
1308 &query.text,
1309 query,
1310 &matched_positions,
1311 start_pos,
1312 end_pos,
1313 license_match.start_line,
1314 license_match.end_line,
1315 )
1316}
1317
/// Returns `true` when text detection should be skipped for this file.
///
/// The decision is delegated entirely to `is_pem_certificate_file`.
fn should_skip_text_detection(path: &Path, buffer: &[u8]) -> bool {
    is_pem_certificate_file(path, buffer)
}
1321
/// Reports whether `path` looks like a Go source file that is not production
/// code.
///
/// A file qualifies when it has the `.go` extension and either
/// * its name ends with `_test.go`, or
/// * one of its first 10 lines is a `//go:build` or `// +build` line that
///   contains the standalone token `test`.
///
/// Files without the `.go` extension and `_test.go` files are classified
/// without touching the file system.
///
/// # Errors
///
/// Returns the underlying I/O error when the file cannot be opened or when one
/// of the inspected leading lines cannot be read as UTF-8. Content beyond the
/// inspected lines is never read, so trailing non-UTF-8 bytes do not cause an
/// error.
fn is_go_non_production_source(path: &Path) -> std::io::Result<bool> {
    use std::io::{BufRead, BufReader};

    /// Number of leading lines inspected for a build constraint.
    const HEADER_LINES: usize = 10;

    if path.extension().and_then(|ext| ext.to_str()) != Some("go") {
        return Ok(false);
    }

    if path
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| name.ends_with("_test.go"))
    {
        return Ok(true);
    }

    // Stream only the header instead of loading (and UTF-8 validating) the
    // whole file just to look at its first few lines.
    let reader = BufReader::new(File::open(path)?);
    for line in reader.lines().take(HEADER_LINES) {
        let line = line?;
        let trimmed = line.trim();
        let is_constraint = trimmed.starts_with("//go:build") || trimmed.starts_with("// +build");
        if is_constraint && trimmed.split_whitespace().any(|token| token == "test") {
            return Ok(true);
        }
    }

    Ok(false)
}
1342
1343fn is_pem_certificate_file(_path: &Path, buffer: &[u8]) -> bool {
1344 let prefix_len = buffer.len().min(8192);
1345 let prefix = String::from_utf8_lossy(&buffer[..prefix_len]);
1346 let trimmed_lines: Vec<&str> = prefix
1347 .lines()
1348 .map(str::trim)
1349 .filter(|line| !line.is_empty())
1350 .take(64)
1351 .collect();
1352
1353 let Some(first_line) = trimmed_lines.first().copied() else {
1354 return false;
1355 };
1356
1357 PEM_CERTIFICATE_HEADERS
1358 .iter()
1359 .any(|(begin, end)| first_line == *begin && trimmed_lines.iter().any(|line| line == end))
1360}
1361
1362fn process_directory(
1363 path: &Path,
1364 _metadata: &fs::Metadata,
1365 collect_info: bool,
1366 license_enabled: bool,
1367) -> FileInfo {
1368 let name = path
1369 .file_name()
1370 .unwrap_or_default()
1371 .to_string_lossy()
1372 .to_string();
1373 let base_name = name.clone(); FileInfo {
1376 name,
1377 base_name,
1378 extension: "".to_string(),
1379 path: path.to_string_lossy().to_string(),
1380 file_type: FileType::Directory,
1381 mime_type: None,
1382 file_type_label: None,
1383 size: 0,
1384 date: None,
1385 sha1: None,
1386 md5: None,
1387 sha256: None,
1388 sha1_git: None,
1389 programming_language: None,
1390 package_data: Vec::new(),
1391 license_expression: None,
1392 license_detections: Vec::new(),
1393 license_clues: Vec::new(),
1394 percentage_of_license_text: license_enabled.then_some(0.0),
1395 copyrights: Vec::new(),
1396 holders: Vec::new(),
1397 authors: Vec::new(),
1398 emails: Vec::new(),
1399 urls: Vec::new(),
1400 for_packages: Vec::new(),
1401 scan_errors: Vec::new(),
1402 license_policy: None,
1403 is_binary: collect_info.then_some(false),
1404 is_text: collect_info.then_some(false),
1405 is_archive: collect_info.then_some(false),
1406 is_media: collect_info.then_some(false),
1407 is_source: collect_info.then_some(false),
1408 is_script: collect_info.then_some(false),
1409 files_count: collect_info.then_some(0),
1410 dirs_count: collect_info.then_some(0),
1411 size_count: collect_info.then_some(0),
1412 source_count: None,
1413 is_legal: false,
1414 is_manifest: false,
1415 is_readme: false,
1416 is_top_level: false,
1417 is_key_file: false,
1418 is_community: false,
1419 is_generated: None,
1420 facets: vec![],
1421 tallies: None,
1422 }
1423}
1424
1425#[cfg(test)]
1426mod tests {
1427 use super::{
1428 compute_percentage_of_license_text, convert_detection_to_model,
1429 extract_email_url_information, extract_named_author_from_binary_line,
1430 is_binary_string_author_candidate, is_binary_string_copyright_candidate,
1431 is_binary_string_email_candidate, is_binary_string_url_candidate,
1432 is_go_non_production_source, process_file,
1433 };
1434 use crate::license_detection::LicenseDetection as InternalLicenseDetection;
1435 use crate::license_detection::index::LicenseIndex;
1436 use crate::license_detection::index::dictionary::TokenDictionary;
1437 use crate::license_detection::models::position_span::PositionSpan;
1438 use crate::license_detection::models::{LicenseMatch, MatchCoordinates, MatcherKind, RuleKind};
1439 use crate::license_detection::query::Query;
1440 use crate::models::{FileInfoBuilder, FileType};
1441 use crate::progress::{ProgressMode, ScanProgress};
1442 use crate::scanner::scan_options_fingerprint;
1443 use crate::scanner::{LicenseScanOptions, TextDetectionOptions};
1444 use std::fs;
1445 use std::time::{Duration, Instant};
1446 use tempfile::tempdir;
1447
1448 use super::maybe_record_processing_timeout;
1449
1450 fn make_internal_match(rule_url: &str) -> LicenseMatch {
1451 LicenseMatch {
1452 rid: 0,
1453 license_expression: "mit".to_string(),
1454 license_expression_spdx: Some("MIT".to_string()),
1455 from_file: None,
1456 start_line: 1,
1457 end_line: 1,
1458 start_token: 0,
1459 end_token: 1,
1460 matcher: MatcherKind::Hash,
1461 score: 1.0,
1462 matched_length: 3,
1463 rule_length: 3,
1464 match_coverage: 100.0,
1465 rule_relevance: 100,
1466 rule_identifier: "mit.LICENSE".to_string(),
1467 rule_url: rule_url.to_string(),
1468 matched_text: Some("MIT".to_string()),
1469 referenced_filenames: None,
1470 rule_kind: RuleKind::Text,
1471 is_from_license: true,
1472 rule_start_token: 0,
1473 coordinates: MatchCoordinates::query_region(PositionSpan::empty()),
1474 candidate_resemblance: 0.0,
1475 candidate_containment: 0.0,
1476 }
1477 }
1478
1479 fn make_detection(rule_url: &str) -> InternalLicenseDetection {
1480 InternalLicenseDetection {
1481 license_expression: Some("mit".to_string()),
1482 license_expression_spdx: Some("MIT".to_string()),
1483 matches: vec![make_internal_match(rule_url)],
1484 detection_log: vec![],
1485 identifier: Some("mit-test".to_string()),
1486 file_regions: Vec::new(),
1487 }
1488 }
1489
1490 fn create_test_index(entries: &[(&str, u16)], len_legalese: usize) -> LicenseIndex {
1491 let dictionary = TokenDictionary::new_with_legalese(entries);
1492 let mut index = LicenseIndex::new(dictionary);
1493 index.len_legalese = len_legalese;
1494 index
1495 }
1496
1497 #[test]
1498 fn test_convert_detection_to_model_preserves_rule_url() {
1499 let detection = make_detection(
1500 "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE",
1501 );
1502
1503 let (converted, clues) =
1504 convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
1505 let converted = converted.expect("detection should convert");
1506
1507 assert_eq!(
1508 converted.matches[0].rule_url.as_deref(),
1509 Some(
1510 "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE"
1511 )
1512 );
1513 assert!(clues.is_empty());
1514 }
1515
1516 #[test]
1517 fn test_convert_detection_to_model_emits_null_for_empty_rule_url() {
1518 let detection = make_detection("");
1519
1520 let (converted, clues) =
1521 convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
1522 let converted = converted.expect("detection should convert");
1523
1524 assert_eq!(converted.matches[0].rule_url, None);
1525 assert!(clues.is_empty());
1526 }
1527
1528 #[test]
1529 fn test_convert_detection_to_model_rounds_match_coverage() {
1530 let mut detection = make_detection("");
1531 detection.matches[0].score = 81.82;
1532 detection.matches[0].match_coverage = 33.334;
1533
1534 let (converted, clues) =
1535 convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
1536 let converted = converted.expect("detection should convert");
1537
1538 assert_eq!(converted.matches[0].score, 81.82);
1539 assert_eq!(converted.matches[0].match_coverage, Some(33.33));
1540 assert!(clues.is_empty());
1541 }
1542
1543 #[test]
1544 fn test_convert_detection_to_model_routes_expressionless_detection_to_license_clues() {
1545 let mut detection = make_detection(
1546 "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/license-clue_1.RULE",
1547 );
1548 detection.license_expression = None;
1549 detection.license_expression_spdx = None;
1550 detection.identifier = None;
1551 detection.matches[0].license_expression = "unknown-license-reference".to_string();
1552 detection.matches[0].license_expression_spdx =
1553 Some("LicenseRef-scancode-unknown-license-reference".to_string());
1554 detection.matches[0].rule_identifier = "license-clue_1.RULE".to_string();
1555 detection.matches[0].rule_kind = RuleKind::Clue;
1556
1557 let (converted, clues) = convert_detection_to_model(
1558 &detection,
1559 LicenseScanOptions {
1560 include_text: true,
1561 min_score: 0,
1562 ..LicenseScanOptions::default()
1563 },
1564 "clue text",
1565 None,
1566 );
1567
1568 assert!(converted.is_none());
1569 assert_eq!(clues.len(), 1);
1570 assert_eq!(clues[0].license_expression, "unknown-license-reference");
1571 assert_eq!(
1572 clues[0].license_expression_spdx,
1573 "LicenseRef-scancode-unknown-license-reference"
1574 );
1575 assert_eq!(
1576 clues[0].rule_identifier.as_deref(),
1577 Some("license-clue_1.RULE")
1578 );
1579 assert_eq!(clues[0].matched_text.as_deref(), Some("MIT"));
1580 assert_eq!(clues[0].matched_text_diagnostics, None);
1581 }
1582
1583 #[test]
1584 fn test_process_file_suppresses_non_actionable_pdf_extraction_failure() {
1585 let dir = tempdir().expect("tempdir");
1586 let path = dir.path().join("broken.pdf");
1587 fs::write(&path, b"%PDF-1.7\nthis is not a valid pdf object graph\n")
1588 .expect("write malformed pdf");
1589 let metadata = fs::metadata(&path).expect("metadata");
1590 let progress = ScanProgress::new(ProgressMode::Quiet);
1591
1592 let file_info = process_file(
1593 &path,
1594 &metadata,
1595 &progress,
1596 None,
1597 LicenseScanOptions::default(),
1598 &TextDetectionOptions::default(),
1599 );
1600
1601 assert!(file_info.scan_errors.is_empty());
1602 }
1603
1604 #[test]
1605 fn test_processing_timeout_is_not_duplicated_after_stage_specific_timeout() {
1606 let started = Instant::now() - Duration::from_secs(2);
1607 let mut scan_errors = vec!["Timeout before license scan (> 1.00s)".to_string()];
1608
1609 maybe_record_processing_timeout(&mut scan_errors, started, 1.0);
1610
1611 assert_eq!(scan_errors, vec!["Timeout before license scan (> 1.00s)"]);
1612 }
1613
1614 #[test]
1615 fn test_processing_timeout_is_recorded_when_no_timeout_error_exists() {
1616 let started = Instant::now() - Duration::from_secs(2);
1617 let mut scan_errors = Vec::new();
1618
1619 maybe_record_processing_timeout(&mut scan_errors, started, 1.0);
1620
1621 assert_eq!(
1622 scan_errors,
1623 vec!["Processing interrupted due to timeout after 1.00 seconds"]
1624 );
1625 }
1626
1627 #[test]
1628 fn test_convert_detection_to_model_includes_diagnostics_when_enabled() {
1629 let text = concat!(
1630 "Reproduction and distribution of this file, with or without modification, are\n",
1631 "permitted in any medium without royalties provided the copyright notice\n",
1632 "and this notice are preserved. This file is offered as-is, without any warranties.\n",
1633 );
1634 let index = create_test_index(
1635 &[
1636 ("reproduction", 0),
1637 ("distribution", 1),
1638 ("file", 2),
1639 ("without", 3),
1640 ("modification", 4),
1641 ("permitted", 5),
1642 ("medium", 6),
1643 ("royalties", 7),
1644 ("provided", 8),
1645 ("copyright", 9),
1646 ("notice", 10),
1647 ("preserved", 11),
1648 ("offered", 12),
1649 ("warranties", 13),
1650 ],
1651 14,
1652 );
1653 let query = Query::from_extracted_text(text, &index, false).expect("query should build");
1654 let mut detection = make_detection(
1655 "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/fsf-ap.LICENSE",
1656 );
1657 detection.detection_log = vec!["imperfect-match-coverage".to_string()];
1658 detection.matches[0].license_expression = "fsf-ap".to_string();
1659 detection.matches[0].license_expression_spdx = Some("FSFAP".to_string());
1660 detection.matches[0].rule_identifier = "fsf-ap.LICENSE".to_string();
1661 detection.matches[0].matched_text = None;
1662 detection.matches[0].start_line = 1;
1663 detection.matches[0].end_line = 3;
1664 detection.matches[0].start_token = 0;
1665 detection.matches[0].end_token = query.tokens.len();
1666 detection.matches[0].coordinates =
1667 MatchCoordinates::query_region(PositionSpan::from_positions(
1668 query
1669 .tokens
1670 .iter()
1671 .enumerate()
1672 .filter_map(|(idx, _)| (idx != 9).then_some(idx))
1673 .collect::<Vec<_>>(),
1674 ));
1675 detection.identifier = Some("fsf_ap-test".to_string());
1676
1677 let (converted, clues) = convert_detection_to_model(
1678 &detection,
1679 LicenseScanOptions {
1680 include_text: true,
1681 include_text_diagnostics: true,
1682 include_diagnostics: true,
1683 unknown_licenses: false,
1684 min_score: 0,
1685 },
1686 text,
1687 Some(&query),
1688 );
1689 let converted = converted.expect("detection should convert");
1690
1691 assert!(clues.is_empty());
1692 assert_eq!(converted.detection_log, vec!["imperfect-match-coverage"]);
1693 assert_eq!(
1694 converted.matches[0].matched_text.as_deref(),
1695 Some(text.trim_end())
1696 );
1697 let diagnostics = converted.matches[0]
1698 .matched_text_diagnostics
1699 .as_deref()
1700 .expect("diagnostics should be present");
1701 assert!(diagnostics.contains('['));
1702 assert!(diagnostics.contains(']'));
1703 assert_ne!(diagnostics, text.trim_end());
1704 }
1705
1706 #[test]
1707 fn test_extract_email_url_information_skips_binary_string_text() {
1708 let mut builder = FileInfoBuilder::default();
1709 let options = TextDetectionOptions {
1710 collect_info: false,
1711 detect_packages: false,
1712 detect_application_packages: false,
1713 detect_system_packages: false,
1714 detect_packages_in_compiled: false,
1715 detect_copyrights: false,
1716 detect_generated: false,
1717 detect_emails: true,
1718 detect_urls: true,
1719 max_emails: 50,
1720 max_urls: 50,
1721 timeout_seconds: 120.0,
1722 };
1723
1724 extract_email_url_information(
1725 &mut builder,
1726 "contact 6h@fo.lwft and visit http://gmail.com/",
1727 &options,
1728 true,
1729 );
1730
1731 let file = builder
1732 .name("binary.bin".to_string())
1733 .base_name("binary".to_string())
1734 .extension(".bin".to_string())
1735 .path("binary.bin".to_string())
1736 .file_type(FileType::File)
1737 .size(1)
1738 .build()
1739 .expect("builder should produce file info");
1740
1741 assert!(file.emails.is_empty(), "emails: {:?}", file.emails);
1742 assert!(file.urls.is_empty(), "urls: {:?}", file.urls);
1743 }
1744
1745 #[test]
1746 fn test_extract_email_url_information_keeps_good_binary_contacts() {
1747 let mut builder = FileInfoBuilder::default();
1748 let options = TextDetectionOptions {
1749 collect_info: false,
1750 detect_packages: false,
1751 detect_application_packages: false,
1752 detect_system_packages: false,
1753 detect_packages_in_compiled: false,
1754 detect_copyrights: false,
1755 detect_generated: false,
1756 detect_emails: true,
1757 detect_urls: true,
1758 max_emails: 50,
1759 max_urls: 50,
1760 timeout_seconds: 120.0,
1761 };
1762
1763 extract_email_url_information(
1764 &mut builder,
1765 "report bugs to bug-coreutils@gnu.org and see https://www.gnu.org/software/coreutils/",
1766 &options,
1767 true,
1768 );
1769
1770 let file = builder
1771 .name("binary.bin".to_string())
1772 .base_name("binary".to_string())
1773 .extension(".bin".to_string())
1774 .path("binary.bin".to_string())
1775 .file_type(FileType::File)
1776 .size(1)
1777 .build()
1778 .expect("builder should produce file info");
1779
1780 assert_eq!(file.emails.len(), 1, "emails: {:?}", file.emails);
1781 assert_eq!(file.emails[0].email, "bug-coreutils@gnu.org");
1782 assert_eq!(file.urls.len(), 1, "urls: {:?}", file.urls);
1783 assert_eq!(file.urls[0].url, "https://www.gnu.org/software/coreutils/");
1784 }
1785
1786 #[test]
1787 fn test_extract_email_url_information_deduplicates_binary_emails_before_cap() {
1788 let mut builder = FileInfoBuilder::default();
1789 let options = TextDetectionOptions {
1790 collect_info: false,
1791 detect_packages: false,
1792 detect_application_packages: false,
1793 detect_system_packages: false,
1794 detect_packages_in_compiled: false,
1795 detect_copyrights: false,
1796 detect_generated: false,
1797 detect_emails: true,
1798 detect_urls: false,
1799 max_emails: 2,
1800 max_urls: 50,
1801 timeout_seconds: 120.0,
1802 };
1803
1804 extract_email_url_information(
1805 &mut builder,
1806 "first jakub@redhat.com second jakub@redhat.com third contyk@redhat.com",
1807 &options,
1808 true,
1809 );
1810
1811 let file = builder
1812 .name("binary.bin".to_string())
1813 .base_name("binary".to_string())
1814 .extension(".bin".to_string())
1815 .path("binary.bin".to_string())
1816 .file_type(FileType::File)
1817 .size(1)
1818 .build()
1819 .expect("builder should produce file info");
1820
1821 assert_eq!(file.emails.len(), 2, "emails: {:?}", file.emails);
1822 assert_eq!(file.emails[0].email, "jakub@redhat.com");
1823 assert_eq!(file.emails[1].email, "contyk@redhat.com");
1824 }
1825
1826 #[test]
1827 fn test_binary_string_copyright_candidate_rejects_gibberish_holder_text() {
1828 let gibberish = "(c) S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9) M0@9s J'@y DH@9Ih@y";
1829 assert!(!is_binary_string_copyright_candidate(gibberish));
1830 }
1831
1832 #[test]
1833 fn test_binary_string_copyright_candidate_keeps_real_notice() {
1834 let notice = "Copyright nexB and others (c) 2012";
1835 assert!(is_binary_string_copyright_candidate(notice));
1836 }
1837
1838 #[test]
1839 fn test_binary_string_copyright_candidate_rejects_changelog_phrase() {
1840 assert!(!is_binary_string_copyright_candidate(
1841 "Copyright - split out libs"
1842 ));
1843 }
1844
1845 #[test]
1846 fn test_binary_string_email_candidate_rejects_gibberish() {
1847 assert!(!is_binary_string_email_candidate("6h@fo.lwft"));
1848 }
1849
1850 #[test]
1851 fn test_binary_string_email_candidate_keeps_gnu_bug_address() {
1852 assert!(is_binary_string_email_candidate("bug-coreutils@gnu.org"));
1853 }
1854
1855 #[test]
1856 fn test_binary_string_url_candidate_rejects_short_fake_host() {
1857 assert!(!is_binary_string_url_candidate("http://ftp.so/"));
1858 }
1859
1860 #[test]
1861 fn test_binary_string_url_candidate_keeps_gnu_help_url() {
1862 assert!(is_binary_string_url_candidate(
1863 "https://www.gnu.org/software/coreutils/"
1864 ));
1865 }
1866
1867 #[test]
1868 fn test_binary_string_url_candidate_rejects_bare_root_domain() {
1869 assert!(!is_binary_string_url_candidate("http://gmail.com/"));
1870 }
1871
1872 #[test]
1873 fn test_binary_string_url_candidate_keeps_project_subdomain_root() {
1874 assert!(is_binary_string_url_candidate("http://gcc.gnu.org"));
1875 }
1876
1877 #[test]
1878 fn test_binary_string_url_candidate_keeps_long_org_root_domain() {
1879 assert!(is_binary_string_url_candidate("https://publicsuffix.org/"));
1880 }
1881
1882 #[test]
1883 fn test_binary_string_url_candidate_keeps_short_project_path() {
1884 assert!(is_binary_string_url_candidate("http://tukaani.org/xz/"));
1885 }
1886
1887 #[test]
1888 fn test_binary_string_author_candidate_keeps_named_author_with_email() {
1889 assert!(is_binary_string_author_candidate(
1890 "Andreas Schneider <asn@redhat.com>"
1891 ));
1892 }
1893
1894 #[test]
1895 fn test_binary_string_author_candidate_rejects_gibberish() {
1896 assert!(!is_binary_string_author_candidate(
1897 "S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9"
1898 ));
1899 }
1900
1901 #[test]
1902 fn test_binary_string_author_candidate_rejects_changelog_phrase() {
1903 assert!(!is_binary_string_author_candidate(
1904 "Developers can enable them. - revert news user back to"
1905 ));
1906 }
1907
1908 #[test]
1909 fn test_extract_named_author_from_binary_line_recovers_by_prefix() {
1910 assert_eq!(
1911 extract_named_author_from_binary_line("Patch by Andreas Schneider <asn@redhat.com>"),
1912 Some("Andreas Schneider <asn@redhat.com>".to_string())
1913 );
1914 }
1915
1916 #[test]
1917 fn test_extract_named_author_from_binary_line_recovers_parenthesized_email() {
1918 assert_eq!(
1919 extract_named_author_from_binary_line(
1920 "same for both OpenSSL and NSS by Rob Crittenden (rcritten@redhat.com)"
1921 ),
1922 Some("Rob Crittenden (rcritten@redhat.com)".to_string())
1923 );
1924 }
1925
1926 #[test]
1927 fn test_extract_named_author_from_binary_line_rejects_plain_changelog_packager_line() {
1928 assert_eq!(
1929 extract_named_author_from_binary_line(
1930 "Rob Crittenden <rcritten@redhat.com> - 3.11.7-9"
1931 ),
1932 None
1933 );
1934 }
1935
1936 #[test]
1937 fn test_extract_named_author_from_binary_line_keeps_email_only_review_author() {
1938 assert_eq!(
1939 extract_named_author_from_binary_line(
1940 "Changes as per initial review by panemade@gmail.com"
1941 ),
1942 Some("panemade@gmail.com".to_string())
1943 );
1944 }
1945
1946 #[test]
1947 fn test_binary_string_author_candidate_rejects_multiple_emails_on_one_line() {
1948 assert!(!is_binary_string_author_candidate(
1949 "Rob Crittenden (rcritten@redhat.com) jakub@redhat.com"
1950 ));
1951 }
1952
1953 #[test]
1954 fn test_compute_percentage_of_license_text_counts_unknown_tokens() {
1955 let index = create_test_index(&[("alpha", 0), ("mit", 1)], 2);
1956 let text = "alpha MIT omega";
1957 let query = Query::from_extracted_text(text, &index, false).expect("query should build");
1958 let mut detection = make_detection("");
1959 detection.matches[0].coordinates =
1960 MatchCoordinates::query_region(PositionSpan::from_positions(vec![1]));
1961 detection.matches[0].start_token = 1;
1962 detection.matches[0].end_token = 2;
1963
1964 let percentage = compute_percentage_of_license_text(&query, &[detection]);
1965
1966 assert_eq!(percentage, 33.33);
1967 }
1968
1969 #[test]
1970 fn test_scan_options_fingerprint_changes_with_license_score() {
1971 let text_options = crate::scanner::TextDetectionOptions::default();
1972 let default_fingerprint = scan_options_fingerprint(
1973 &text_options,
1974 LicenseScanOptions {
1975 min_score: 0,
1976 ..LicenseScanOptions::default()
1977 },
1978 None,
1979 );
1980 let filtered_fingerprint = scan_options_fingerprint(
1981 &text_options,
1982 LicenseScanOptions {
1983 min_score: 70,
1984 ..LicenseScanOptions::default()
1985 },
1986 None,
1987 );
1988
1989 assert_ne!(default_fingerprint, filtered_fingerprint);
1990 }
1991
1992 #[test]
1993 fn test_is_go_non_production_source_for_test_filename() {
1994 let temp_dir = tempdir().unwrap();
1995 let path = temp_dir.path().join("scanner_test.go");
1996 fs::write(&path, "package scanner\n").unwrap();
1997
1998 assert!(is_go_non_production_source(&path).unwrap());
1999 }
2000
2001 #[test]
2002 fn test_is_go_non_production_source_for_build_tag() {
2003 let temp_dir = tempdir().unwrap();
2004 let path = temp_dir.path().join("scanner.go");
2005 fs::write(&path, "//go:build test\n\npackage scanner\n").unwrap();
2006
2007 assert!(is_go_non_production_source(&path).unwrap());
2008 }
2009
2010 #[test]
2011 fn test_is_go_non_production_source_for_regular_go_file() {
2012 let temp_dir = tempdir().unwrap();
2013 let path = temp_dir.path().join("scanner.go");
2014 fs::write(&path, "package scanner\n").unwrap();
2015
2016 assert!(!is_go_non_production_source(&path).unwrap());
2017 }
2018}