1use crate::license_detection::LicenseDetectionEngine;
2use crate::parsers::compiled_binary::{
3 is_supported_compiled_binary_format, try_parse_compiled_bytes,
4};
5use crate::parsers::try_parse_file;
6use crate::parsers::windows_executable::try_parse_windows_executable_bytes;
7use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha1_git, calculate_sha256};
8use crate::utils::text::{
9 remove_verbatim_escape_sequences, should_remove_verbatim_escape_sequences,
10};
11use anyhow::Error;
12use rayon::prelude::*;
13use std::collections::HashSet;
14use std::fs::{self, File};
15use std::io::{Read, Write};
16use std::path::Path;
17use std::sync::Arc;
18use std::time::{Duration, Instant};
19
20use crate::copyright::{
21 self, AuthorDetection, CopyrightDetection, CopyrightDetectionOptions, HolderDetection,
22};
23use crate::finder::{self, DetectionConfig};
24use crate::license_detection::PositionSet;
25use crate::license_detection::models::LicenseMatch as InternalLicenseMatch;
26use crate::license_detection::query::Query;
27use crate::models::{
28 Author, Copyright, DatasourceId, FileInfo, FileInfoBuilder, FileType, Holder, LicenseDetection,
29 LineNumber, Match, OutputEmail, OutputURL, Sha256Digest,
30};
31use crate::parsers::utils::split_name_email;
32use crate::progress::ScanProgress;
33use crate::scanner::collect::CollectedPaths;
34use crate::scanner::{LicenseScanOptions, ProcessResult, TextDetectionOptions};
35use crate::utils::file::{
36 ExtractedTextKind, augment_license_detection_text, classify_file_info,
37 extract_text_for_detection_with_diagnostics, get_creation_date,
38};
39use crate::utils::generated::generated_code_hints_from_bytes;
40use tempfile::TempDir;
41
/// PEM armor delimiters, stored as `(begin_marker, end_marker)` pairs, for the
/// `CERTIFICATE` and `TRUSTED CERTIFICATE` block types.
// NOTE(review): the consumer of this table is outside this excerpt.
const PEM_CERTIFICATE_HEADERS: &[(&str, &str)] = &[
    ("-----BEGIN CERTIFICATE-----", "-----END CERTIFICATE-----"),
    (
        "-----BEGIN TRUSTED CERTIFICATE-----",
        "-----END TRUSTED CERTIFICATE-----",
    ),
];
49
50pub fn process_collected(
51 collected: &CollectedPaths,
52 progress: Arc<ScanProgress>,
53 license_engine: Option<Arc<LicenseDetectionEngine>>,
54 license_options: LicenseScanOptions,
55 text_options: &TextDetectionOptions,
56) -> ProcessResult {
57 let mut all_files: Vec<FileInfo> = collected
58 .files
59 .par_iter()
60 .map(|(path, metadata)| {
61 let file_entry = process_file(
62 path,
63 metadata,
64 progress.as_ref(),
65 license_engine.clone(),
66 license_options,
67 text_options,
68 );
69 progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
70 file_entry
71 })
72 .collect();
73
74 for (path, metadata) in &collected.directories {
75 all_files.push(process_directory(
76 path,
77 metadata,
78 text_options.collect_info,
79 license_engine.is_some(),
80 ));
81 }
82
83 ProcessResult {
84 files: all_files,
85 excluded_count: collected.excluded_count,
86 }
87}
88
89pub fn process_collected_sequential(
90 collected: &CollectedPaths,
91 progress: Arc<ScanProgress>,
92 license_engine: Option<Arc<LicenseDetectionEngine>>,
93 license_options: LicenseScanOptions,
94 text_options: &TextDetectionOptions,
95) -> ProcessResult {
96 let mut all_files: Vec<FileInfo> =
97 Vec::with_capacity(collected.files.len() + collected.directories.len());
98
99 for (path, metadata) in &collected.files {
100 let file_entry = process_file(
101 path,
102 metadata,
103 progress.as_ref(),
104 license_engine.clone(),
105 license_options,
106 text_options,
107 );
108 progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
109 all_files.push(file_entry);
110 }
111
112 for (path, metadata) in &collected.directories {
113 all_files.push(process_directory(
114 path,
115 metadata,
116 text_options.collect_info,
117 license_engine.is_some(),
118 ));
119 }
120
121 ProcessResult {
122 files: all_files,
123 excluded_count: collected.excluded_count,
124 }
125}
126
127pub fn process_collected_with_memory_limit(
128 collected: &CollectedPaths,
129 progress: Arc<ScanProgress>,
130 license_engine: Option<Arc<LicenseDetectionEngine>>,
131 license_options: LicenseScanOptions,
132 text_options: &TextDetectionOptions,
133 max_in_memory: i64,
134) -> ProcessResult {
135 if max_in_memory == 0 {
136 return process_collected(
137 collected,
138 progress,
139 license_engine,
140 license_options,
141 text_options,
142 );
143 }
144
145 let memory_limit = if max_in_memory < 0 {
146 0
147 } else {
148 max_in_memory as usize
149 };
150 let chunk_size = if max_in_memory < 0 {
151 256
152 } else {
153 memory_limit.max(1)
154 };
155
156 let mut retained_files = Vec::new();
157 let mut spill_store = None;
158
159 for chunk in collected.files.chunks(chunk_size) {
160 let processed_chunk: Vec<FileInfo> = chunk
161 .par_iter()
162 .map(|(path, metadata)| {
163 let file_entry = process_file(
164 path,
165 metadata,
166 progress.as_ref(),
167 license_engine.clone(),
168 license_options,
169 text_options,
170 );
171 progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
172 file_entry
173 })
174 .collect();
175
176 retain_or_spill_chunk(
177 processed_chunk,
178 &mut retained_files,
179 &mut spill_store,
180 memory_limit,
181 );
182 }
183
184 for (path, metadata) in &collected.directories {
185 let entry = process_directory(
186 path,
187 metadata,
188 text_options.collect_info,
189 license_engine.is_some(),
190 );
191 retain_or_spill_chunk(
192 vec![entry],
193 &mut retained_files,
194 &mut spill_store,
195 memory_limit,
196 );
197 }
198
199 if let Some(spill_store) = spill_store {
200 retained_files.extend(spill_store.load_all());
201 }
202
203 ProcessResult {
204 files: retained_files,
205 excluded_count: collected.excluded_count,
206 }
207}
208
209pub fn process_collected_with_memory_limit_sequential(
210 collected: &CollectedPaths,
211 progress: Arc<ScanProgress>,
212 license_engine: Option<Arc<LicenseDetectionEngine>>,
213 license_options: LicenseScanOptions,
214 text_options: &TextDetectionOptions,
215 max_in_memory: i64,
216) -> ProcessResult {
217 if max_in_memory == 0 {
218 return process_collected_sequential(
219 collected,
220 progress,
221 license_engine,
222 license_options,
223 text_options,
224 );
225 }
226
227 let memory_limit = if max_in_memory < 0 {
228 0
229 } else {
230 max_in_memory as usize
231 };
232 let chunk_size = if max_in_memory < 0 {
233 256
234 } else {
235 memory_limit.max(1)
236 };
237
238 let mut retained_files = Vec::new();
239 let mut spill_store = None;
240
241 for chunk in collected.files.chunks(chunk_size) {
242 let mut processed_chunk: Vec<FileInfo> = Vec::with_capacity(chunk.len());
243 for (path, metadata) in chunk {
244 let file_entry = process_file(
245 path,
246 metadata,
247 progress.as_ref(),
248 license_engine.clone(),
249 license_options,
250 text_options,
251 );
252 progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
253 processed_chunk.push(file_entry);
254 }
255
256 retain_or_spill_chunk(
257 processed_chunk,
258 &mut retained_files,
259 &mut spill_store,
260 memory_limit,
261 );
262 }
263
264 for (path, metadata) in &collected.directories {
265 let entry = process_directory(
266 path,
267 metadata,
268 text_options.collect_info,
269 license_engine.is_some(),
270 );
271 retain_or_spill_chunk(
272 vec![entry],
273 &mut retained_files,
274 &mut spill_store,
275 memory_limit,
276 );
277 }
278
279 if let Some(spill_store) = spill_store {
280 retained_files.extend(spill_store.load_all());
281 }
282
283 ProcessResult {
284 files: retained_files,
285 excluded_count: collected.excluded_count,
286 }
287}
288
289fn retain_or_spill_chunk(
290 chunk: Vec<FileInfo>,
291 retained_files: &mut Vec<FileInfo>,
292 spill_store: &mut Option<FileInfoSpillStore>,
293 memory_limit: usize,
294) {
295 if memory_limit == 0 {
296 spill_store
297 .get_or_insert_with(FileInfoSpillStore::new)
298 .spill(chunk);
299 return;
300 }
301
302 let remaining_capacity = memory_limit.saturating_sub(retained_files.len());
303 if remaining_capacity >= chunk.len() && spill_store.is_none() {
304 retained_files.extend(chunk);
305 return;
306 }
307
308 let mut chunk_iter = chunk.into_iter();
309 retained_files.extend(chunk_iter.by_ref().take(remaining_capacity));
310 let overflow: Vec<FileInfo> = chunk_iter.collect();
311 if !overflow.is_empty() {
312 spill_store
313 .get_or_insert_with(FileInfoSpillStore::new)
314 .spill(overflow);
315 }
316}
317
318struct FileInfoSpillStore {
319 temp_dir: TempDir,
320 batch_index: usize,
321}
322
323impl FileInfoSpillStore {
324 fn new() -> Self {
325 Self {
326 temp_dir: TempDir::new().expect("create spill dir"),
327 batch_index: 0,
328 }
329 }
330
331 fn spill(&mut self, files: Vec<FileInfo>) {
332 let path = self
333 .temp_dir
334 .path()
335 .join(format!("batch-{:06}.json.zst", self.batch_index));
336 self.batch_index += 1;
337
338 let payload = serde_json::to_vec(&files).expect("encode spilled file batch");
339 let file = File::create(path).expect("create spill batch file");
340 let mut encoder = zstd::Encoder::new(file, 3).expect("create spill encoder");
341 encoder
342 .write_all(&payload)
343 .expect("write spilled file batch");
344 encoder.finish().expect("finish spill encoder");
345 }
346
347 fn load_all(self) -> Vec<FileInfo> {
348 let mut paths: Vec<_> = fs::read_dir(self.temp_dir.path())
349 .expect("read spill dir")
350 .filter_map(Result::ok)
351 .map(|entry| entry.path())
352 .collect();
353 paths.sort();
354
355 let mut files = Vec::new();
356 for path in paths {
357 let file = File::open(path).expect("open spill batch");
358 let mut decoder = zstd::Decoder::new(file).expect("create spill decoder");
359 let mut payload = Vec::new();
360 decoder.read_to_end(&mut payload).expect("read spill batch");
361 let mut batch: Vec<FileInfo> =
362 serde_json::from_slice(&payload).expect("decode spilled file batch");
363 files.append(&mut batch);
364 }
365 files
366 }
367}
368
369fn process_file(
370 path: &Path,
371 metadata: &fs::Metadata,
372 progress: &ScanProgress,
373 license_engine: Option<Arc<LicenseDetectionEngine>>,
374 license_options: LicenseScanOptions,
375 text_options: &TextDetectionOptions,
376) -> FileInfo {
377 let mut scan_errors: Vec<String> = vec![];
378 let mut file_info_builder = FileInfoBuilder::default();
379 let license_enabled = license_engine.is_some();
380
381 let started = Instant::now();
382
383 let mut generated_flag = None;
384 let mut is_source_file = false;
385 match extract_information_from_content(
386 &mut file_info_builder,
387 &mut scan_errors,
388 path,
389 progress,
390 license_engine,
391 license_options,
392 text_options,
393 ) {
394 Ok((is_generated, sha256, is_source)) => {
395 generated_flag = is_generated;
396 is_source_file = is_source;
397 let _ = sha256;
398 }
399 Err(e) => scan_errors.push(e.to_string()),
400 };
401
402 maybe_record_processing_timeout(&mut scan_errors, started, text_options.timeout_seconds);
403
404 let mut file_info = file_info_builder
405 .name(path.file_name().unwrap().to_string_lossy().to_string())
406 .base_name(
407 path.file_stem()
408 .unwrap_or_default()
409 .to_string_lossy()
410 .to_string(),
411 )
412 .extension(
413 path.extension()
414 .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
415 )
416 .path(path.to_string_lossy().to_string())
417 .file_type(FileType::File)
418 .size(metadata.len())
419 .date(
420 text_options
421 .collect_info
422 .then(|| get_creation_date(metadata))
423 .flatten(),
424 )
425 .scan_errors(scan_errors)
426 .build()
427 .expect("FileInformationBuild not completely initialized");
428
429 if text_options.collect_info {
430 file_info.is_source = Some(is_source_file);
431 }
432
433 if file_info.programming_language.as_deref() == Some("Go")
434 && is_go_non_production_source(path).unwrap_or(false)
435 {
436 file_info.is_source = Some(false);
437 }
438
439 if text_options.detect_generated {
440 file_info.is_generated = Some(generated_flag.unwrap_or(false));
441 }
442
443 if file_info.percentage_of_license_text.is_none() && license_enabled {
444 file_info.percentage_of_license_text = Some(0.0);
445 }
446
447 file_info
448}
449
450fn extract_information_from_content(
451 file_info_builder: &mut FileInfoBuilder,
452 scan_errors: &mut Vec<String>,
453 path: &Path,
454 progress: &ScanProgress,
455 license_engine: Option<Arc<LicenseDetectionEngine>>,
456 license_options: LicenseScanOptions,
457 text_options: &TextDetectionOptions,
458) -> Result<(Option<bool>, Sha256Digest, bool), Error> {
459 let started = Instant::now();
460 let filesystem_path = absolute_filesystem_path(path);
461 let buffer = fs::read(&filesystem_path)?;
462 let license_enabled = license_engine.is_some();
463
464 if is_timeout_exceeded(started, text_options.timeout_seconds) {
465 return Err(Error::msg(format!(
466 "Timeout while reading file content (> {:.2}s)",
467 text_options.timeout_seconds
468 )));
469 }
470
471 let sha256 = calculate_sha256(&buffer);
472 let is_generated = text_options
473 .detect_generated
474 .then(|| !generated_code_hints_from_bytes(&buffer).is_empty());
475 let classification = classify_file_info(&filesystem_path, &buffer);
476
477 if text_options.collect_info {
478 file_info_builder
479 .sha1(Some(calculate_sha1(&buffer)))
480 .md5(Some(calculate_md5(&buffer)))
481 .sha256(Some(sha256))
482 .programming_language(classification.programming_language.clone())
483 .mime_type(Some(classification.mime_type.clone()))
484 .file_type_label(Some(classification.file_type.clone()))
485 .sha1_git(Some(calculate_sha1_git(&buffer)))
486 .is_binary(Some(classification.is_binary))
487 .is_text(Some(classification.is_text))
488 .is_archive(Some(classification.is_archive))
489 .is_media(Some(classification.is_media))
490 .is_source(Some(classification.is_source))
491 .is_script(Some(classification.is_script))
492 .files_count(Some(0))
493 .dirs_count(Some(0))
494 .size_count(Some(0));
495 }
496
497 if should_skip_text_detection(&filesystem_path, &buffer) {
498 return Ok((is_generated, sha256, classification.is_source));
499 }
500
501 if text_options.detect_packages {
504 let started = Instant::now();
505 let parse_result = try_parse_file(&filesystem_path)
506 .or_else(|| {
507 text_options
508 .detect_application_packages
509 .then(|| try_parse_windows_executable_bytes(&filesystem_path, &buffer))
510 .flatten()
511 })
512 .or_else(|| {
513 text_options
514 .detect_packages_in_compiled
515 .then(|| {
516 (classification.is_binary && is_supported_compiled_binary_format(&buffer))
517 .then(|| try_parse_compiled_bytes(&buffer))
518 .flatten()
519 })
520 .flatten()
521 });
522
523 if let Some(parse_result) = parse_result {
524 let packages = parse_result
525 .packages
526 .into_iter()
527 .filter(|package| {
528 let is_compiled_package = package
529 .datasource_id
530 .as_ref()
531 .is_some_and(is_compiled_datasource);
532 let is_system_package = package
533 .datasource_id
534 .as_ref()
535 .is_some_and(is_system_datasource);
536 if is_compiled_package {
537 text_options.detect_packages_in_compiled
538 } else if is_system_package {
539 text_options.detect_system_packages
540 } else {
541 text_options.detect_application_packages
542 }
543 })
544 .collect();
545 file_info_builder.package_data(packages);
546 scan_errors.extend(parse_result.scan_errors);
547 }
548 progress.record_detail_timing("scan:packages", started.elapsed().as_secs_f64());
549 }
550
551 if is_timeout_exceeded(started, text_options.timeout_seconds) {
552 return Err(Error::msg(format!(
553 "Timeout while extracting package/text metadata (> {:.2}s)",
554 text_options.timeout_seconds
555 )));
556 }
557
558 let (text_content, text_kind, text_scan_error) =
559 extract_text_for_detection_with_diagnostics(&filesystem_path, &buffer);
560 if let Some(text_scan_error) = text_scan_error {
561 scan_errors.push(text_scan_error);
562 }
563 let from_binary_strings = matches!(text_kind, ExtractedTextKind::BinaryStrings);
564
565 if is_timeout_exceeded(started, text_options.timeout_seconds) {
566 return Err(Error::msg(format!(
567 "Timeout while extracting text content (> {:.2}s)",
568 text_options.timeout_seconds
569 )));
570 }
571
572 if text_content.is_empty() {
573 return Ok((is_generated, sha256, classification.is_source));
574 }
575
576 if text_options.detect_copyrights {
577 extract_copyright_information(
578 file_info_builder,
579 path,
580 &text_content,
581 text_options.timeout_seconds,
582 from_binary_strings,
583 );
584 }
585 extract_email_url_information(
586 file_info_builder,
587 &text_content,
588 text_options,
589 from_binary_strings,
590 );
591
592 if is_timeout_exceeded(started, text_options.timeout_seconds) {
593 return Err(Error::msg(format!(
594 "Timeout before license scan (> {:.2}s)",
595 text_options.timeout_seconds
596 )));
597 }
598 let text_content_for_license_detection = if crate::utils::sourcemap::is_sourcemap(path) {
600 if let Some(sourcemap_content) =
601 crate::utils::sourcemap::extract_sourcemap_content(&text_content)
602 {
603 sourcemap_content
604 } else {
605 text_content
606 }
607 } else if should_remove_verbatim_escape_sequences(path, classification.is_source) {
608 remove_verbatim_escape_sequences(&text_content)
609 } else {
610 text_content
611 };
612 let text_content_for_license_detection =
613 augment_license_detection_text(path, &text_content_for_license_detection);
614 let text_content_for_license_detection = text_content_for_license_detection.into_owned();
615
616 if license_enabled {
617 let started = Instant::now();
618 extract_license_information(
619 file_info_builder,
620 scan_errors,
621 &filesystem_path,
622 text_content_for_license_detection.clone(),
623 license_engine,
624 license_options,
625 from_binary_strings,
626 )?;
627 progress.record_detail_timing("scan:licenses", started.elapsed().as_secs_f64());
628 } else {
629 extract_license_information(
630 file_info_builder,
631 scan_errors,
632 &filesystem_path,
633 text_content_for_license_detection,
634 license_engine,
635 license_options,
636 from_binary_strings,
637 )?;
638 }
639
640 Ok((is_generated, sha256, classification.is_source))
641}
642
/// Anchors a relative `path` at the current working directory.
///
/// Absolute paths are returned unchanged. If the working directory cannot be
/// determined, the relative path is returned as given.
fn absolute_filesystem_path(path: &Path) -> std::path::PathBuf {
    if !path.is_absolute() {
        if let Ok(cwd) = std::env::current_dir() {
            return cwd.join(path);
        }
    }
    path.to_path_buf()
}
652
/// Reports whether more than `timeout_seconds` have elapsed since `started`.
///
/// A timeout that is not finite or not strictly positive disables the check,
/// so the function then always returns `false`.
fn is_timeout_exceeded(started: Instant, timeout_seconds: f64) -> bool {
    let timeout_enabled = timeout_seconds.is_finite() && timeout_seconds > 0.0;
    if !timeout_enabled {
        return false;
    }
    started.elapsed().as_secs_f64() > timeout_seconds
}
658
659fn maybe_record_processing_timeout(
660 scan_errors: &mut Vec<String>,
661 started: Instant,
662 timeout_seconds: f64,
663) {
664 if is_timeout_exceeded(started, timeout_seconds)
665 && !scan_errors.iter().any(|error| is_timeout_scan_error(error))
666 {
667 scan_errors.push(format!(
668 "Processing interrupted due to timeout after {:.2} seconds",
669 timeout_seconds
670 ));
671 }
672}
673
/// Returns `true` when `error` contains one of the timeout message fragments
/// produced by this module (case-sensitive substring match).
fn is_timeout_scan_error(error: &str) -> bool {
    const TIMEOUT_FRAGMENTS: [&str; 3] = [
        "Timeout while ",
        "Timeout before ",
        "Processing interrupted due to timeout",
    ];

    TIMEOUT_FRAGMENTS
        .iter()
        .any(|fragment| error.contains(*fragment))
}
679
680fn is_system_datasource(datasource_id: &DatasourceId) -> bool {
681 matches!(
682 datasource_id,
683 DatasourceId::AlpineInstalledDb
684 | DatasourceId::DebianDistrolessInstalledDb
685 | DatasourceId::DebianInstalledFilesList
686 | DatasourceId::DebianInstalledMd5Sums
687 | DatasourceId::DebianInstalledStatusDb
688 | DatasourceId::FreebsdCompactManifest
689 | DatasourceId::RpmInstalledDatabaseBdb
690 | DatasourceId::RpmInstalledDatabaseNdb
691 | DatasourceId::RpmInstalledDatabaseSqlite
692 | DatasourceId::RpmYumdb
693 )
694}
695
696fn is_compiled_datasource(datasource_id: &DatasourceId) -> bool {
697 matches!(
698 datasource_id,
699 DatasourceId::GoBinary | DatasourceId::RustBinary
700 )
701}
702
703fn extract_copyright_information(
704 file_info_builder: &mut FileInfoBuilder,
705 path: &Path,
706 text_content: &str,
707 timeout_seconds: f64,
708 from_binary_strings: bool,
709) {
710 if copyright::is_credits_file(path) {
712 let author_detections = copyright::detect_credits_authors(text_content);
713 if !author_detections.is_empty() {
714 file_info_builder.authors(
715 author_detections
716 .into_iter()
717 .map(|a| Author {
718 author: a.author,
719 start_line: a.start_line,
720 end_line: a.end_line,
721 })
722 .collect(),
723 );
724 return;
725 }
726 }
727
728 let copyright_options = CopyrightDetectionOptions {
729 max_runtime: if timeout_seconds.is_finite() && timeout_seconds > 0.0 {
730 Some(Duration::from_secs_f64(timeout_seconds))
731 } else {
732 None
733 },
734 ..CopyrightDetectionOptions::default()
735 };
736
737 let (copyrights, holders, authors) =
738 copyright::detect_copyrights_with_options(text_content, ©right_options);
739 let (copyrights, holders, authors) = if from_binary_strings {
740 prune_binary_string_detections(text_content, copyrights, holders, authors)
741 } else {
742 (copyrights, holders, authors)
743 };
744
745 file_info_builder.copyrights(
746 copyrights
747 .into_iter()
748 .map(|c| Copyright {
749 copyright: c.copyright,
750 start_line: c.start_line,
751 end_line: c.end_line,
752 })
753 .collect::<Vec<Copyright>>(),
754 );
755 file_info_builder.holders(
756 holders
757 .into_iter()
758 .map(|h| Holder {
759 holder: h.holder,
760 start_line: h.start_line,
761 end_line: h.end_line,
762 })
763 .collect::<Vec<Holder>>(),
764 );
765 file_info_builder.authors(
766 authors
767 .into_iter()
768 .map(|a| Author {
769 author: a.author,
770 start_line: a.start_line,
771 end_line: a.end_line,
772 })
773 .collect::<Vec<Author>>(),
774 );
775}
776
777fn prune_binary_string_detections(
778 text_content: &str,
779 copyrights: Vec<CopyrightDetection>,
780 holders: Vec<HolderDetection>,
781 authors: Vec<AuthorDetection>,
782) -> (
783 Vec<CopyrightDetection>,
784 Vec<HolderDetection>,
785 Vec<AuthorDetection>,
786) {
787 let kept_copyrights: Vec<CopyrightDetection> = copyrights
788 .into_iter()
789 .filter(|c| is_binary_string_copyright_candidate(&c.copyright))
790 .collect();
791
792 let kept_holders: Vec<HolderDetection> = holders
793 .into_iter()
794 .filter(|holder| {
795 kept_copyrights.iter().any(|copyright| {
796 ranges_overlap(
797 holder.start_line,
798 holder.end_line,
799 copyright.start_line,
800 copyright.end_line,
801 )
802 })
803 })
804 .collect();
805
806 let kept_authors = authors
807 .into_iter()
808 .filter(|author| is_binary_string_author_candidate(&author.author))
809 .chain(extract_binary_string_author_supplements(text_content))
810 .filter({
811 let mut seen = HashSet::new();
812 move |author| seen.insert(author.author.clone())
813 })
814 .collect();
815
816 (kept_copyrights, kept_holders, kept_authors)
817}
818
819fn ranges_overlap(
820 a_start: LineNumber,
821 a_end: LineNumber,
822 b_start: LineNumber,
823 b_end: LineNumber,
824) -> bool {
825 a_start <= b_end && b_start <= a_end
826}
827
828fn is_binary_string_copyright_candidate(text: &str) -> bool {
829 if contains_year(text) {
830 return true;
831 }
832
833 let trimmed = text.trim();
834 let lower = trimmed.to_ascii_lowercase();
835 let tail = if let Some(tail) = lower.strip_prefix("copyright") {
836 tail.trim()
837 } else {
838 lower.trim()
839 };
840 let original_tail = if lower.starts_with("copyright") {
841 trimmed["copyright".len()..].trim()
842 } else {
843 trimmed
844 };
845
846 if tail.is_empty() || !has_sufficient_alphabetic_content(tail) || has_excessive_at_noise(tail) {
847 return false;
848 }
849
850 let alpha_tokens: Vec<&str> = tail
851 .split_whitespace()
852 .filter(|token| token.chars().any(|c| c.is_alphabetic()))
853 .collect();
854
855 if alpha_tokens.len() <= 1 {
856 return has_explicit_copyright_marker(text)
857 && alpha_tokens.iter().any(|token| {
858 is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric()))
859 });
860 }
861
862 if !has_explicit_copyright_marker(text) {
863 return false;
864 }
865
866 has_binary_name_like_shape(original_tail)
867}
868
869fn extract_binary_string_author_supplements(text_content: &str) -> Vec<AuthorDetection> {
870 let mut authors = Vec::new();
871
872 for (line_index, line) in text_content.lines().enumerate() {
873 if let Some(author) = extract_named_author_from_binary_line(line) {
874 authors.push(AuthorDetection {
875 author,
876 start_line: LineNumber::from_0_indexed(line_index),
877 end_line: LineNumber::from_0_indexed(line_index),
878 });
879 }
880 }
881
882 authors
883}
884
885fn extract_named_author_from_binary_line(line: &str) -> Option<String> {
886 let line = line.trim();
887 if line.is_empty() {
888 return None;
889 }
890
891 let emails = finder::find_emails(
892 line,
893 &DetectionConfig {
894 max_emails: 4,
895 max_urls: 0,
896 unique: false,
897 },
898 );
899 let email = emails.first()?.email.as_str();
900 if !is_binary_string_email_candidate(email) {
901 return None;
902 }
903
904 let lower_line = line.to_ascii_lowercase();
905 let email_start = lower_line.find(email)?;
906 let raw_prefix = &line[..email_start];
907 let has_author_marker = contains_binary_author_marker(raw_prefix);
908 let prefix = take_suffix_after_last_author_marker(raw_prefix)?;
909 let prefix = prefix
910 .trim_start_matches(['*', '-', ':', ';', ',', '.', ' '])
911 .trim_end_matches(['<', '(', '[', ' ', ':', '-'])
912 .trim();
913
914 let (name, _) = split_name_email(prefix);
915 let name = name.or_else(|| {
916 let trimmed = prefix.trim_matches(|c: char| c == '<' || c == '(' || c == '[' || c == ' ');
917 (!trimmed.is_empty()).then(|| trimmed.to_string())
918 });
919
920 let Some(name) = name.map(|name| name.trim().to_string()) else {
921 if has_author_marker {
922 return Some(email.to_string());
923 }
924 return None;
925 };
926
927 if name.is_empty() && has_author_marker {
928 return Some(email.to_string());
929 }
930
931 if !has_binary_name_like_shape(&name) {
932 return None;
933 }
934
935 if line.contains(&format!("<{email}>")) {
936 Some(format!("{name} <{email}>"))
937 } else if line.contains(&format!("({email})")) {
938 Some(format!("{name} ({email})"))
939 } else {
940 Some(format!("{name} {email}"))
941 }
942}
943
/// Returns the trimmed text following the last occurrence of `marker`.
///
/// The search runs on an ASCII-lowercased copy of `text`, so an all-lowercase
/// `marker` matches regardless of the text's ASCII case. Returns `None` when
/// the marker does not occur.
fn take_suffix_after_last_ascii_marker<'a>(text: &'a str, marker: &str) -> Option<&'a str> {
    // ASCII lowercasing preserves byte offsets, so the match position can be
    // applied to the original text.
    text.to_ascii_lowercase()
        .rfind(marker)
        .map(|marker_start| text[marker_start + marker.len()..].trim())
}
949
950fn take_suffix_after_last_author_marker(text: &str) -> Option<&str> {
951 const MARKERS: &[&str] = &[
952 " patch author: ",
953 " patch author ",
954 " written by ",
955 " contributed by ",
956 " original work done by ",
957 " work done by ",
958 " thanks to ",
959 " review by ",
960 " by ",
961 " from ",
962 ];
963
964 MARKERS
965 .iter()
966 .filter_map(|marker| take_suffix_after_last_ascii_marker(text, marker))
967 .next()
968}
969
970fn contains_binary_author_marker(text: &str) -> bool {
971 take_suffix_after_last_author_marker(text).is_some()
972}
973
974fn has_binary_name_like_shape(text: &str) -> bool {
975 let trimmed = text.trim();
976 if trimmed.is_empty() || trimmed.contains(" - ") || trimmed.chars().any(|c| c.is_ascii_digit())
977 {
978 return false;
979 }
980
981 let tokens: Vec<&str> = trimmed
982 .split(|c: char| !c.is_ascii_alphabetic() && c != '.' && c != '\'')
983 .filter(|segment| segment.chars().any(|c| c.is_ascii_alphabetic()))
984 .collect();
985 if tokens.is_empty() {
986 return false;
987 }
988
989 let uppercase_like = tokens
990 .iter()
991 .filter(|token| {
992 let token = token.trim_matches('.');
993 token
994 .chars()
995 .find(|c| c.is_ascii_alphabetic())
996 .is_some_and(|c| c.is_ascii_uppercase())
997 })
998 .count();
999
1000 uppercase_like >= 2 && uppercase_like * 2 >= tokens.len()
1001 || tokens
1002 .iter()
1003 .any(|token| is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric())))
1004}
1005
/// Returns `true` when at least half of the ASCII alphanumeric characters in
/// `text` are letters. Text without any ASCII alphanumeric character fails.
fn has_sufficient_alphabetic_content(text: &str) -> bool {
    let (letters, alphanumerics) = text.chars().fold((0usize, 0usize), |(letters, alnums), c| {
        (
            letters + usize::from(c.is_ascii_alphabetic()),
            alnums + usize::from(c.is_ascii_alphanumeric()),
        )
    });

    alphanumerics != 0 && letters * 2 >= alphanumerics
}
1015
/// Returns `true` when `text` contains three or more `@` characters.
fn has_excessive_at_noise(text: &str) -> bool {
    const MAX_TOLERATED_AT_SIGNS: usize = 2;
    text.matches('@').count() > MAX_TOLERATED_AT_SIGNS
}
1019
/// Returns `true` when `text` contains `(c)`, the `©` sign, or `copr`,
/// ignoring ASCII case.
fn has_explicit_copyright_marker(text: &str) -> bool {
    const MARKERS: [&str; 3] = ["(c)", "©", "copr"];

    let lowered = text.to_ascii_lowercase();
    MARKERS.iter().any(|marker| lowered.contains(*marker))
}
1024
/// Returns `true` when `text` contains four consecutive ASCII digits that form
/// a year in the range 1900–2099 (`19xx` or `20xx`).
///
/// The century digits are checked as a pair, so digit runs such as `1024` or
/// `2999` are not mistaken for years. The match is not anchored: a year
/// embedded in a longer digit run still counts.
fn contains_year(text: &str) -> bool {
    text.as_bytes().windows(4).any(|window| {
        let plausible_century = matches!((window[0], window[1]), (b'1', b'9') | (b'2', b'0'));
        plausible_century && window[2].is_ascii_digit() && window[3].is_ascii_digit()
    })
}
1033
/// Returns `true` when `token` equals, ignoring ASCII case, one of the known
/// organization words such as `inc`, `ltd` or `foundation`.
fn is_company_like_suffix(token: &str) -> bool {
    const COMPANY_WORDS: [&str; 14] = [
        "inc",
        "corp",
        "corporation",
        "co",
        "company",
        "ltd",
        "llc",
        "gmbh",
        "foundation",
        "project",
        "systems",
        "software",
        "technologies",
        "technology",
    ];

    COMPANY_WORDS
        .iter()
        .any(|word| token.eq_ignore_ascii_case(word))
}
1053
1054fn extract_email_url_information(
1055 file_info_builder: &mut FileInfoBuilder,
1056 text_content: &str,
1057 text_options: &TextDetectionOptions,
1058 from_binary_strings: bool,
1059) {
1060 if !text_options.detect_emails && !text_options.detect_urls {
1061 return;
1062 }
1063
1064 if text_options.detect_emails {
1065 let config = DetectionConfig {
1066 max_emails: text_options.max_emails,
1067 max_urls: text_options.max_urls,
1068 unique: from_binary_strings,
1069 };
1070 let emails = finder::find_emails(text_content, &config)
1071 .into_iter()
1072 .filter(|d| !from_binary_strings || is_binary_string_email_candidate(&d.email))
1073 .map(|d| OutputEmail {
1074 email: d.email,
1075 start_line: d.start_line,
1076 end_line: d.end_line,
1077 })
1078 .collect::<Vec<_>>();
1079 file_info_builder.emails(emails);
1080 }
1081
1082 if text_options.detect_urls {
1083 let config = DetectionConfig {
1084 max_emails: text_options.max_emails,
1085 max_urls: text_options.max_urls,
1086 unique: true,
1087 };
1088 let urls = finder::find_urls(text_content, &config)
1089 .into_iter()
1090 .filter(|d| !from_binary_strings || is_binary_string_url_candidate(&d.url))
1091 .map(|d| OutputURL {
1092 url: d.url,
1093 start_line: d.start_line,
1094 end_line: d.end_line,
1095 })
1096 .collect::<Vec<_>>();
1097 file_info_builder.urls(urls);
1098 }
1099}
1100
1101fn is_binary_string_email_candidate(email: &str) -> bool {
1102 let Some((local, domain)) = email.rsplit_once('@') else {
1103 return false;
1104 };
1105
1106 if !has_strong_binary_local_part(local) {
1107 return false;
1108 }
1109
1110 has_strong_binary_host_shape(domain)
1111}
1112
1113fn is_binary_string_url_candidate(url: &str) -> bool {
1114 let parsed = url::Url::parse(url).ok();
1115 let Some(parsed) = parsed else {
1116 return false;
1117 };
1118 let Some(host) = parsed.host_str() else {
1119 return false;
1120 };
1121
1122 has_strong_binary_host_shape(host) && has_meaningful_binary_url_context(&parsed)
1123}
1124
1125fn is_binary_string_author_candidate(author: &str) -> bool {
1126 let trimmed = author.trim();
1127 if trimmed.is_empty()
1128 || !has_sufficient_alphabetic_content(trimmed)
1129 || has_excessive_at_noise(trimmed)
1130 {
1131 return false;
1132 }
1133
1134 if trimmed.contains('@') {
1135 let emails = finder::find_emails(
1136 trimmed,
1137 &DetectionConfig {
1138 max_emails: 4,
1139 max_urls: 0,
1140 unique: true,
1141 },
1142 );
1143 if emails.len() > 1 {
1144 return false;
1145 }
1146
1147 if let Some(extracted) = extract_named_author_from_binary_line(trimmed) {
1148 return !extracted.is_empty();
1149 }
1150
1151 let Some(email) = emails.first().map(|d| d.email.as_str()) else {
1152 return false;
1153 };
1154 if !is_binary_string_email_candidate(email) {
1155 return false;
1156 }
1157
1158 let (name, _) = split_name_email(trimmed);
1159 return name.as_deref().is_some_and(has_binary_name_like_shape);
1160 }
1161
1162 has_binary_name_like_shape(trimmed)
1163}
1164
1165fn has_meaningful_binary_url_context(parsed: &url::Url) -> bool {
1166 if parsed.path() != "/"
1167 && parsed
1168 .path()
1169 .split('/')
1170 .any(|segment| segment.chars().any(|c| c.is_ascii_alphabetic()) && segment.len() >= 2)
1171 {
1172 return true;
1173 }
1174
1175 if parsed.query().is_some() || parsed.fragment().is_some() {
1176 return true;
1177 }
1178
1179 let Some(host) = parsed.host_str() else {
1180 return false;
1181 };
1182
1183 let labels: Vec<&str> = host.split('.').collect();
1184 if labels.len() > 2 {
1185 return labels[..labels.len() - 1].iter().any(|label| {
1186 label.len() >= 3 && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 3
1187 });
1188 }
1189
1190 if matches!(labels.first(), Some(&"www")) {
1191 return true;
1192 }
1193
1194 if labels.len() == 2 {
1195 let domain = labels[0];
1196 let tld = labels[1];
1197 if domain.len() >= 8 && matches!(tld, "org" | "edu" | "gov" | "mil" | "io" | "dev") {
1198 return true;
1199 }
1200 }
1201
1202 labels
1203 .iter()
1204 .take(labels.len().saturating_sub(1))
1205 .any(|label| {
1206 label.contains('-') && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 4
1207 })
1208}
1209
/// Returns `true` when the email local part contains a run of at least three
/// consecutive ASCII letters.
fn has_strong_binary_local_part(local: &str) -> bool {
    let mut letter_run = 0usize;
    for ch in local.chars() {
        if ch.is_ascii_alphabetic() {
            letter_run += 1;
            if letter_run >= 3 {
                return true;
            }
        } else {
            // Any other character ends the current run.
            letter_run = 0;
        }
    }
    false
}
1215
/// Returns `true` when a host name has a convincing shape for a host recovered
/// from binary strings.
///
/// A leading `www` or `ftp` label is ignored. Of what remains there must be at
/// least two labels, and some label other than the last must be at least three
/// bytes long and contain at least three ASCII letters.
fn has_strong_binary_host_shape(host: &str) -> bool {
    let all_labels: Vec<&str> = host.split('.').collect();

    let meaningful: &[&str] = match all_labels.split_first() {
        Some((&("www" | "ftp"), rest)) => rest,
        _ => all_labels.as_slice(),
    };

    // With fewer than two labels `candidates` is empty (or there is no last
    // label at all), so the answer is `false` in both cases.
    let Some((_last, candidates)) = meaningful.split_last() else {
        return false;
    };

    candidates.iter().any(|label| {
        label.len() >= 3 && label.bytes().filter(u8::is_ascii_alphabetic).count() >= 3
    })
}
1236
1237fn extract_license_information(
1238 file_info_builder: &mut FileInfoBuilder,
1239 scan_errors: &mut Vec<String>,
1240 path: &Path,
1241 text_content: String,
1242 license_engine: Option<Arc<LicenseDetectionEngine>>,
1243 license_options: LicenseScanOptions,
1244 from_binary_strings: bool,
1245) -> Result<(), Error> {
1246 let Some(engine) = license_engine else {
1247 return Ok(());
1248 };
1249
1250 let detection_result = if license_options.min_score == 0 {
1251 engine.detect_with_kind_and_source(
1252 &text_content,
1253 license_options.unknown_licenses,
1254 from_binary_strings,
1255 &path.to_string_lossy(),
1256 )
1257 } else {
1258 engine.detect_with_kind_and_source_with_score(
1259 &text_content,
1260 license_options.unknown_licenses,
1261 from_binary_strings,
1262 &path.to_string_lossy(),
1263 license_options.min_score as f32,
1264 )
1265 };
1266
1267 match detection_result {
1268 Ok(detections) => {
1269 let query =
1270 Query::from_extracted_text(&text_content, engine.index(), from_binary_strings).ok();
1271 let mut model_detections = Vec::new();
1272 let mut model_clues = Vec::new();
1273
1274 for detection in &detections {
1275 let (public_detection, clue_matches) = convert_detection_to_model(
1276 detection,
1277 license_options,
1278 &text_content,
1279 query.as_ref(),
1280 );
1281
1282 if let Some(public_detection) = public_detection {
1283 model_detections.push(public_detection);
1284 }
1285
1286 model_clues.extend(clue_matches);
1287 }
1288
1289 if !model_detections.is_empty() {
1290 let expressions: Vec<String> = model_detections
1291 .iter()
1292 .filter(|d| !d.license_expression_spdx.is_empty())
1293 .map(|d| d.license_expression_spdx.clone())
1294 .collect();
1295
1296 if !expressions.is_empty() {
1297 let combined = crate::utils::spdx::combine_license_expressions(expressions);
1298 if let Some(expr) = combined {
1299 file_info_builder.license_expression(Some(expr));
1300 }
1301 }
1302 }
1303
1304 file_info_builder.license_detections(model_detections);
1305 file_info_builder.license_clues(model_clues);
1306 file_info_builder.percentage_of_license_text(
1307 query
1308 .as_ref()
1309 .map(|query| compute_percentage_of_license_text(query, &detections)),
1310 );
1311 }
1312 Err(e) => {
1313 scan_errors.push(format!("License detection failed: {}", e));
1314 }
1315 }
1316
1317 Ok(())
1318}
1319
1320fn convert_detection_to_model(
1321 detection: &crate::license_detection::LicenseDetection,
1322 license_options: LicenseScanOptions,
1323 text_content: &str,
1324 query: Option<&Query<'_>>,
1325) -> (Option<LicenseDetection>, Vec<Match>) {
1326 let matches: Vec<Match> = detection
1327 .matches
1328 .iter()
1329 .map(|m| convert_match_to_model(m, license_options, text_content, query))
1330 .collect();
1331
1332 if let Some(license_expression) = detection.license_expression.clone() {
1333 (
1334 Some(LicenseDetection {
1335 license_expression,
1336 license_expression_spdx: detection
1337 .license_expression_spdx
1338 .clone()
1339 .unwrap_or_default(),
1340 matches,
1341 detection_log: if license_options.include_diagnostics {
1342 detection.detection_log.clone()
1343 } else {
1344 Vec::new()
1345 },
1346 identifier: detection.identifier.clone(),
1347 }),
1348 Vec::new(),
1349 )
1350 } else {
1351 (None, matches)
1352 }
1353}
1354
1355fn convert_match_to_model(
1356 m: &crate::license_detection::models::LicenseMatch,
1357 license_options: LicenseScanOptions,
1358 text_content: &str,
1359 query: Option<&Query<'_>>,
1360) -> Match {
1361 let rule_url = if m.rule_url.is_empty() {
1362 None
1363 } else {
1364 Some(m.rule_url.clone())
1365 };
1366 let matched_text = if license_options.include_text {
1367 m.matched_text.clone().or_else(|| {
1368 Some(crate::license_detection::query::matched_text_from_text(
1369 text_content,
1370 m.start_line.get(),
1371 m.end_line.get(),
1372 ))
1373 })
1374 } else {
1375 None
1376 };
1377 let matched_text_diagnostics = if license_options.include_text_diagnostics {
1378 query.map(|query| matched_text_diagnostics_from_match(query, m))
1379 } else {
1380 None
1381 };
1382 Match {
1383 license_expression: m.license_expression.clone(),
1384 license_expression_spdx: m.license_expression_spdx.clone().unwrap_or_default(),
1385 from_file: m.from_file.clone(),
1386 start_line: m.start_line,
1387 end_line: m.end_line,
1388 matcher: Some(m.matcher.to_string()),
1389 score: m.score,
1390 matched_length: Some(m.matched_length),
1391 match_coverage: Some(((m.coverage() as f64) * 100.0).round() / 100.0),
1392 rule_relevance: Some(m.rule_relevance),
1393 rule_identifier: Some(m.rule_identifier.clone()),
1394 rule_url,
1395 matched_text,
1396 referenced_filenames: m.referenced_filenames.clone(),
1397 matched_text_diagnostics,
1398 }
1399}
1400
1401fn compute_percentage_of_license_text(
1402 query: &Query<'_>,
1403 detections: &[crate::license_detection::LicenseDetection],
1404) -> f64 {
1405 let matched_positions: std::collections::HashSet<usize> = detections
1406 .iter()
1407 .flat_map(|detection| detection.matches.iter())
1408 .flat_map(|m| m.query_span().iter())
1409 .collect();
1410
1411 let query_tokens_length = query.tokens.len() + query.unknowns_by_pos.values().sum::<usize>();
1412 if query_tokens_length == 0 {
1413 return 0.0;
1414 }
1415
1416 let percentage = (matched_positions.len() as f64 / query_tokens_length as f64) * 100.0;
1417 (percentage * 100.0).round() / 100.0
1418}
1419
1420fn matched_text_diagnostics_from_match(
1421 query: &Query<'_>,
1422 license_match: &InternalLicenseMatch,
1423) -> String {
1424 let matched_positions: PositionSet = license_match.query_span().iter().collect();
1425 let Some(start_pos) = matched_positions.iter().min() else {
1426 return crate::license_detection::query::matched_text_from_text(
1427 &query.text,
1428 license_match.start_line.get(),
1429 license_match.end_line.get(),
1430 );
1431 };
1432 let Some(end_pos) = matched_positions.iter().max() else {
1433 return crate::license_detection::query::matched_text_from_text(
1434 &query.text,
1435 license_match.start_line.get(),
1436 license_match.end_line.get(),
1437 );
1438 };
1439
1440 crate::license_detection::query::matched_text_diagnostics_from_text(
1441 &query.text,
1442 query,
1443 &matched_positions,
1444 start_pos,
1445 end_pos,
1446 license_match.start_line.get(),
1447 license_match.end_line.get(),
1448 )
1449}
1450
/// Reports whether text-based detection should be skipped for a file.
///
/// This forwards directly to `is_pem_certificate_file`, so the answer is
/// whatever that helper returns for the same `path` and `buffer`.
fn should_skip_text_detection(path: &Path, buffer: &[u8]) -> bool {
    is_pem_certificate_file(path, buffer)
}
1454
/// Reports whether a Go source file is test-only (non-production) code.
///
/// Returns `Ok(false)` for any path whose extension is not `go`, and
/// `Ok(true)` for file names ending in `_test.go` without touching the file.
/// Otherwise the first ten lines of the file are inspected: the file counts
/// as non-production when one of them starts with `//go:build` or `// +build`
/// and contains the whitespace-separated token `test`.
///
/// Only the inspected header lines are read, and they are decoded lossily, so
/// large files are not loaded in full and bytes that are not valid UTF-8 do
/// not make the check fail.
///
/// # Errors
///
/// Returns the underlying I/O error when the file cannot be opened or read.
fn is_go_non_production_source(path: &Path) -> std::io::Result<bool> {
    // Number of leading lines inspected for a build constraint.
    const HEADER_LINES: usize = 10;

    if path.extension().and_then(|ext| ext.to_str()) != Some("go") {
        return Ok(false);
    }

    if path
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| name.ends_with("_test.go"))
    {
        return Ok(true);
    }

    let mut reader = std::io::BufReader::new(fs::File::open(path)?);
    let mut raw_line = Vec::new();
    for _ in 0..HEADER_LINES {
        raw_line.clear();
        // Fully qualified call so the `BufRead` trait need not be imported.
        if std::io::BufRead::read_until(&mut reader, b'\n', &mut raw_line)? == 0 {
            break; // end of file
        }

        let line = String::from_utf8_lossy(&raw_line);
        let trimmed = line.trim();
        let is_build_constraint =
            trimmed.starts_with("//go:build") || trimmed.starts_with("// +build");
        if is_build_constraint && trimmed.split_whitespace().any(|token| token == "test") {
            return Ok(true);
        }
    }

    Ok(false)
}
1475
1476fn is_pem_certificate_file(_path: &Path, buffer: &[u8]) -> bool {
1477 let prefix_len = buffer.len().min(8192);
1478 let prefix = String::from_utf8_lossy(&buffer[..prefix_len]);
1479 let trimmed_lines: Vec<&str> = prefix
1480 .lines()
1481 .map(str::trim)
1482 .filter(|line| !line.is_empty())
1483 .take(64)
1484 .collect();
1485
1486 let Some(first_line) = trimmed_lines.first().copied() else {
1487 return false;
1488 };
1489
1490 PEM_CERTIFICATE_HEADERS
1491 .iter()
1492 .any(|(begin, end)| first_line == *begin && trimmed_lines.iter().any(|line| line == end))
1493}
1494
1495fn process_directory(
1496 path: &Path,
1497 _metadata: &fs::Metadata,
1498 collect_info: bool,
1499 license_enabled: bool,
1500) -> FileInfo {
1501 let name = path
1502 .file_name()
1503 .unwrap_or_default()
1504 .to_string_lossy()
1505 .to_string();
1506 let base_name = name.clone(); FileInfo {
1509 name,
1510 base_name,
1511 extension: "".to_string(),
1512 path: path.to_string_lossy().to_string(),
1513 file_type: FileType::Directory,
1514 mime_type: None,
1515 file_type_label: None,
1516 size: 0,
1517 date: None,
1518 sha1: None,
1519 md5: None,
1520 sha256: None,
1521 sha1_git: None,
1522 programming_language: None,
1523 package_data: Vec::new(),
1524 license_expression: None,
1525 license_detections: Vec::new(),
1526 license_clues: Vec::new(),
1527 percentage_of_license_text: license_enabled.then_some(0.0),
1528 copyrights: Vec::new(),
1529 holders: Vec::new(),
1530 authors: Vec::new(),
1531 emails: Vec::new(),
1532 urls: Vec::new(),
1533 for_packages: Vec::new(),
1534 scan_errors: Vec::new(),
1535 license_policy: None,
1536 is_binary: collect_info.then_some(false),
1537 is_text: collect_info.then_some(false),
1538 is_archive: collect_info.then_some(false),
1539 is_media: collect_info.then_some(false),
1540 is_source: collect_info.then_some(false),
1541 is_script: collect_info.then_some(false),
1542 files_count: collect_info.then_some(0),
1543 dirs_count: collect_info.then_some(0),
1544 size_count: collect_info.then_some(0),
1545 source_count: None,
1546 is_legal: false,
1547 is_manifest: false,
1548 is_readme: false,
1549 is_top_level: false,
1550 is_key_file: false,
1551 is_community: false,
1552 is_generated: None,
1553 facets: vec![],
1554 tallies: None,
1555 }
1556}
1557
#[cfg(test)]
mod tests {
    use super::{
        compute_percentage_of_license_text, convert_detection_to_model,
        extract_email_url_information, extract_named_author_from_binary_line,
        is_binary_string_author_candidate, is_binary_string_copyright_candidate,
        is_binary_string_email_candidate, is_binary_string_url_candidate,
        is_go_non_production_source, maybe_record_processing_timeout, process_file,
    };
    use crate::license_detection::LicenseDetection as InternalLicenseDetection;
    use crate::license_detection::index::LicenseIndex;
    use crate::license_detection::index::dictionary::TokenDictionary;
    use crate::license_detection::models::position_span::PositionSpan;
    use crate::license_detection::models::{LicenseMatch, MatchCoordinates, MatcherKind, RuleKind};
    use crate::license_detection::query::Query;
    use crate::models::{FileInfo, FileInfoBuilder, FileType, LineNumber, MatchScore};
    use crate::progress::{ProgressMode, ScanProgress};
    use crate::scanner::scan_options_fingerprint;
    use crate::scanner::{LicenseScanOptions, TextDetectionOptions};
    use std::fs;
    use std::time::{Duration, Instant};
    use tempfile::tempdir;

    /// Rule URL of the MIT license text used by the conversion tests.
    const MIT_LICENSE_RULE_URL: &str = "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE";

    /// Builds a minimal internal MIT match carrying the given rule URL.
    fn make_internal_match(rule_url: &str) -> LicenseMatch {
        LicenseMatch {
            rid: 0,
            license_expression: "mit".to_string(),
            license_expression_spdx: Some("MIT".to_string()),
            from_file: None,
            start_line: LineNumber::ONE,
            end_line: LineNumber::ONE,
            start_token: 0,
            end_token: 1,
            matcher: MatcherKind::Hash,
            score: MatchScore::from_percentage(1.0),
            matched_length: 3,
            rule_length: 3,
            match_coverage: 100.0,
            rule_relevance: 100,
            rule_identifier: "mit.LICENSE".to_string(),
            rule_url: rule_url.to_string(),
            matched_text: Some("MIT".to_string()),
            referenced_filenames: None,
            rule_kind: RuleKind::Text,
            is_from_license: true,
            rule_start_token: 0,
            coordinates: MatchCoordinates::query_region(PositionSpan::empty()),
            candidate_resemblance: 0.0,
            candidate_containment: 0.0,
        }
    }

    /// Builds an internal MIT detection holding one match with the given rule URL.
    fn make_detection(rule_url: &str) -> InternalLicenseDetection {
        InternalLicenseDetection {
            license_expression: Some("mit".to_string()),
            license_expression_spdx: Some("MIT".to_string()),
            matches: vec![make_internal_match(rule_url)],
            detection_log: vec![],
            identifier: Some("mit-test".to_string()),
            file_regions: Vec::new(),
        }
    }

    /// Builds a license index over the given dictionary entries.
    fn create_test_index(entries: &[(&str, u16)], len_legalese: usize) -> LicenseIndex {
        let dictionary = TokenDictionary::new_with_legalese(entries);
        let mut index = LicenseIndex::new(dictionary);
        index.len_legalese = len_legalese;
        index
    }

    /// Options with only email and URL detection switched on, capped at 50 each.
    fn email_url_detection_options() -> TextDetectionOptions {
        TextDetectionOptions {
            collect_info: false,
            detect_packages: false,
            detect_application_packages: false,
            detect_system_packages: false,
            detect_packages_in_compiled: false,
            detect_copyrights: false,
            detect_generated: false,
            detect_emails: true,
            detect_urls: true,
            max_emails: 50,
            max_urls: 50,
            timeout_seconds: 120.0,
        }
    }

    /// Runs email/URL extraction over `text` as binary-string content and
    /// finishes the builder into a `FileInfo` for a dummy `binary.bin` file.
    fn scan_binary_strings(text: &str, options: &TextDetectionOptions) -> FileInfo {
        let mut builder = FileInfoBuilder::default();

        extract_email_url_information(&mut builder, text, options, true);

        builder
            .name("binary.bin".to_string())
            .base_name("binary".to_string())
            .extension(".bin".to_string())
            .path("binary.bin".to_string())
            .file_type(FileType::File)
            .size(1)
            .build()
            .expect("builder should produce file info")
    }

    /// A non-empty rule URL survives conversion unchanged.
    #[test]
    fn test_convert_detection_to_model_preserves_rule_url() {
        let detection = make_detection(MIT_LICENSE_RULE_URL);

        let (converted, clues) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let converted = converted.expect("detection should convert");

        assert_eq!(
            converted.matches[0].rule_url.as_deref(),
            Some(MIT_LICENSE_RULE_URL)
        );
        assert!(clues.is_empty());
    }

    /// An empty rule URL is converted to `None`.
    #[test]
    fn test_convert_detection_to_model_emits_null_for_empty_rule_url() {
        let detection = make_detection("");

        let (converted, clues) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let converted = converted.expect("detection should convert");

        assert_eq!(converted.matches[0].rule_url, None);
        assert!(clues.is_empty());
    }

    /// Match coverage is rounded to two decimals while the score is untouched.
    #[test]
    fn test_convert_detection_to_model_rounds_match_coverage() {
        let mut detection = make_detection("");
        detection.matches[0].score = MatchScore::from_percentage(81.82);
        detection.matches[0].match_coverage = 33.334;

        let (converted, clues) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let converted = converted.expect("detection should convert");

        assert_eq!(
            converted.matches[0].score,
            MatchScore::from_percentage(81.82)
        );
        assert_eq!(converted.matches[0].match_coverage, Some(33.33));
        assert!(clues.is_empty());
    }

    /// A detection without a license expression yields clues, not a detection.
    #[test]
    fn test_convert_detection_to_model_routes_expressionless_detection_to_license_clues() {
        let mut detection = make_detection(
            "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/rules/license-clue_1.RULE",
        );
        detection.license_expression = None;
        detection.license_expression_spdx = None;
        detection.identifier = None;
        {
            let clue_match = &mut detection.matches[0];
            clue_match.license_expression = "unknown-license-reference".to_string();
            clue_match.license_expression_spdx =
                Some("LicenseRef-scancode-unknown-license-reference".to_string());
            clue_match.rule_identifier = "license-clue_1.RULE".to_string();
            clue_match.rule_kind = RuleKind::Clue;
        }

        let scan_options = LicenseScanOptions {
            include_text: true,
            min_score: 0,
            ..LicenseScanOptions::default()
        };
        let (converted, clues) =
            convert_detection_to_model(&detection, scan_options, "clue text", None);

        assert!(converted.is_none());
        assert_eq!(clues.len(), 1);
        assert_eq!(clues[0].license_expression, "unknown-license-reference");
        assert_eq!(
            clues[0].license_expression_spdx,
            "LicenseRef-scancode-unknown-license-reference"
        );
        assert_eq!(
            clues[0].rule_identifier.as_deref(),
            Some("license-clue_1.RULE")
        );
        assert_eq!(clues[0].matched_text.as_deref(), Some("MIT"));
        assert_eq!(clues[0].matched_text_diagnostics, None);
    }

    /// A malformed PDF must not leave a scan error on the file.
    #[test]
    fn test_process_file_suppresses_non_actionable_pdf_extraction_failure() {
        let dir = tempdir().expect("tempdir");
        let pdf_path = dir.path().join("broken.pdf");
        fs::write(
            &pdf_path,
            b"%PDF-1.7\nthis is not a valid pdf object graph\n",
        )
        .expect("write malformed pdf");
        let metadata = fs::metadata(&pdf_path).expect("metadata");
        let progress = ScanProgress::new(ProgressMode::Quiet);

        let file_info = process_file(
            &pdf_path,
            &metadata,
            &progress,
            None,
            LicenseScanOptions::default(),
            &TextDetectionOptions::default(),
        );

        assert!(file_info.scan_errors.is_empty());
    }

    /// An existing stage-specific timeout error is not joined by a second one.
    #[test]
    fn test_processing_timeout_is_not_duplicated_after_stage_specific_timeout() {
        let started = Instant::now() - Duration::from_secs(2);
        let mut scan_errors = vec!["Timeout before license scan (> 1.00s)".to_string()];

        maybe_record_processing_timeout(&mut scan_errors, started, 1.0);

        assert_eq!(scan_errors, vec!["Timeout before license scan (> 1.00s)"]);
    }

    /// Without a prior timeout error the generic timeout message is recorded.
    #[test]
    fn test_processing_timeout_is_recorded_when_no_timeout_error_exists() {
        let started = Instant::now() - Duration::from_secs(2);
        let mut scan_errors = Vec::new();

        maybe_record_processing_timeout(&mut scan_errors, started, 1.0);

        assert_eq!(
            scan_errors,
            vec!["Processing interrupted due to timeout after 1.00 seconds"]
        );
    }

    /// With all diagnostics options on, the detection log, the matched text
    /// and the bracketed text diagnostics are all emitted.
    #[test]
    fn test_convert_detection_to_model_includes_diagnostics_when_enabled() {
        let text = concat!(
            "Reproduction and distribution of this file, with or without modification, are\n",
            "permitted in any medium without royalties provided the copyright notice\n",
            "and this notice are preserved. This file is offered as-is, without any warranties.\n",
        );
        let index = create_test_index(
            &[
                ("reproduction", 0),
                ("distribution", 1),
                ("file", 2),
                ("without", 3),
                ("modification", 4),
                ("permitted", 5),
                ("medium", 6),
                ("royalties", 7),
                ("provided", 8),
                ("copyright", 9),
                ("notice", 10),
                ("preserved", 11),
                ("offered", 12),
                ("warranties", 13),
            ],
            14,
        );
        let query = Query::from_extracted_text(text, &index, false).expect("query should build");

        // Every query position except index 9 is part of the match.
        let matched_positions = (0..query.tokens.len())
            .filter(|&idx| idx != 9)
            .collect::<Vec<_>>();

        let mut detection = make_detection(
            "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/fsf-ap.LICENSE",
        );
        detection.detection_log = vec!["imperfect-match-coverage".to_string()];
        detection.identifier = Some("fsf_ap-test".to_string());
        {
            let fsf_match = &mut detection.matches[0];
            fsf_match.license_expression = "fsf-ap".to_string();
            fsf_match.license_expression_spdx = Some("FSFAP".to_string());
            fsf_match.rule_identifier = "fsf-ap.LICENSE".to_string();
            fsf_match.matched_text = None;
            fsf_match.start_line = LineNumber::ONE;
            fsf_match.end_line = LineNumber::new(3).unwrap();
            fsf_match.start_token = 0;
            fsf_match.end_token = query.tokens.len();
            fsf_match.coordinates =
                MatchCoordinates::query_region(PositionSpan::from_positions(matched_positions));
        }

        let scan_options = LicenseScanOptions {
            include_text: true,
            include_text_diagnostics: true,
            include_diagnostics: true,
            unknown_licenses: false,
            min_score: 0,
        };
        let (converted, clues) =
            convert_detection_to_model(&detection, scan_options, text, Some(&query));
        let converted = converted.expect("detection should convert");

        assert!(clues.is_empty());
        assert_eq!(converted.detection_log, vec!["imperfect-match-coverage"]);
        assert_eq!(
            converted.matches[0].matched_text.as_deref(),
            Some(text.trim_end())
        );
        let diagnostics = converted.matches[0]
            .matched_text_diagnostics
            .as_deref()
            .expect("diagnostics should be present");
        assert!(diagnostics.contains('['));
        assert!(diagnostics.contains(']'));
        assert_ne!(diagnostics, text.trim_end());
    }

    /// Implausible emails and bare-domain URLs from binary strings are dropped.
    #[test]
    fn test_extract_email_url_information_skips_binary_string_text() {
        let file = scan_binary_strings(
            "contact 6h@fo.lwft and visit http://gmail.com/",
            &email_url_detection_options(),
        );

        assert!(file.emails.is_empty(), "emails: {:?}", file.emails);
        assert!(file.urls.is_empty(), "urls: {:?}", file.urls);
    }

    /// Plausible contact details from binary strings are kept.
    #[test]
    fn test_extract_email_url_information_keeps_good_binary_contacts() {
        let file = scan_binary_strings(
            "report bugs to bug-coreutils@gnu.org and see https://www.gnu.org/software/coreutils/",
            &email_url_detection_options(),
        );

        assert_eq!(file.emails.len(), 1, "emails: {:?}", file.emails);
        assert_eq!(file.emails[0].email, "bug-coreutils@gnu.org");
        assert_eq!(file.urls.len(), 1, "urls: {:?}", file.urls);
        assert_eq!(file.urls[0].url, "https://www.gnu.org/software/coreutils/");
    }

    /// Duplicate emails do not use up the `max_emails` budget.
    #[test]
    fn test_extract_email_url_information_deduplicates_binary_emails_before_cap() {
        let options = TextDetectionOptions {
            detect_urls: false,
            max_emails: 2,
            ..email_url_detection_options()
        };

        let file = scan_binary_strings(
            "first jakub@redhat.com second jakub@redhat.com third contyk@redhat.com",
            &options,
        );

        assert_eq!(file.emails.len(), 2, "emails: {:?}", file.emails);
        assert_eq!(file.emails[0].email, "jakub@redhat.com");
        assert_eq!(file.emails[1].email, "contyk@redhat.com");
    }

    /// Gibberish holder text is not accepted as a copyright.
    #[test]
    fn test_binary_string_copyright_candidate_rejects_gibberish_holder_text() {
        let gibberish = "(c) S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9) M0@9s J'@y DH@9Ih@y";
        assert!(!is_binary_string_copyright_candidate(gibberish));
    }

    /// A genuine copyright notice is accepted.
    #[test]
    fn test_binary_string_copyright_candidate_keeps_real_notice() {
        let notice = "Copyright nexB and others (c) 2012";
        assert!(is_binary_string_copyright_candidate(notice));
    }

    /// A changelog phrase that merely starts with "Copyright" is rejected.
    #[test]
    fn test_binary_string_copyright_candidate_rejects_changelog_phrase() {
        let changelog_phrase = "Copyright - split out libs";
        assert!(!is_binary_string_copyright_candidate(changelog_phrase));
    }

    /// A gibberish email address is rejected.
    #[test]
    fn test_binary_string_email_candidate_rejects_gibberish() {
        assert!(!is_binary_string_email_candidate("6h@fo.lwft"));
    }

    /// The GNU bug-report address is accepted.
    #[test]
    fn test_binary_string_email_candidate_keeps_gnu_bug_address() {
        assert!(is_binary_string_email_candidate("bug-coreutils@gnu.org"));
    }

    /// A short fake host is rejected.
    #[test]
    fn test_binary_string_url_candidate_rejects_short_fake_host() {
        assert!(!is_binary_string_url_candidate("http://ftp.so/"));
    }

    /// The GNU coreutils help URL is accepted.
    #[test]
    fn test_binary_string_url_candidate_keeps_gnu_help_url() {
        let help_url = "https://www.gnu.org/software/coreutils/";
        assert!(is_binary_string_url_candidate(help_url));
    }

    /// A bare root domain with no further context is rejected.
    #[test]
    fn test_binary_string_url_candidate_rejects_bare_root_domain() {
        assert!(!is_binary_string_url_candidate("http://gmail.com/"));
    }

    /// A project subdomain root is accepted.
    #[test]
    fn test_binary_string_url_candidate_keeps_project_subdomain_root() {
        assert!(is_binary_string_url_candidate("http://gcc.gnu.org"));
    }

    /// A long `.org` root domain is accepted.
    #[test]
    fn test_binary_string_url_candidate_keeps_long_org_root_domain() {
        assert!(is_binary_string_url_candidate("https://publicsuffix.org/"));
    }

    /// A short project path is enough context to accept the URL.
    #[test]
    fn test_binary_string_url_candidate_keeps_short_project_path() {
        assert!(is_binary_string_url_candidate("http://tukaani.org/xz/"));
    }

    /// A named author followed by an email is accepted.
    #[test]
    fn test_binary_string_author_candidate_keeps_named_author_with_email() {
        let author = "Andreas Schneider <asn@redhat.com>";
        assert!(is_binary_string_author_candidate(author));
    }

    /// Gibberish author text is rejected.
    #[test]
    fn test_binary_string_author_candidate_rejects_gibberish() {
        let gibberish = "S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9";
        assert!(!is_binary_string_author_candidate(gibberish));
    }

    /// A changelog sentence is not an author.
    #[test]
    fn test_binary_string_author_candidate_rejects_changelog_phrase() {
        let changelog_phrase = "Developers can enable them. - revert news user back to";
        assert!(!is_binary_string_author_candidate(changelog_phrase));
    }

    /// The author following a "by" prefix is recovered.
    #[test]
    fn test_extract_named_author_from_binary_line_recovers_by_prefix() {
        let recovered =
            extract_named_author_from_binary_line("Patch by Andreas Schneider <asn@redhat.com>");
        assert_eq!(
            recovered,
            Some("Andreas Schneider <asn@redhat.com>".to_string())
        );
    }

    /// An author with a parenthesized email is recovered.
    #[test]
    fn test_extract_named_author_from_binary_line_recovers_parenthesized_email() {
        let recovered = extract_named_author_from_binary_line(
            "same for both OpenSSL and NSS by Rob Crittenden (rcritten@redhat.com)",
        );
        assert_eq!(
            recovered,
            Some("Rob Crittenden (rcritten@redhat.com)".to_string())
        );
    }

    /// A plain changelog packager line yields no author.
    #[test]
    fn test_extract_named_author_from_binary_line_rejects_plain_changelog_packager_line() {
        let recovered = extract_named_author_from_binary_line(
            "Rob Crittenden <rcritten@redhat.com> - 3.11.7-9",
        );
        assert_eq!(recovered, None);
    }

    /// An email-only review author is kept.
    #[test]
    fn test_extract_named_author_from_binary_line_keeps_email_only_review_author() {
        let recovered = extract_named_author_from_binary_line(
            "Changes as per initial review by panemade@gmail.com",
        );
        assert_eq!(recovered, Some("panemade@gmail.com".to_string()));
    }

    /// A line carrying more than one email is rejected as an author.
    #[test]
    fn test_binary_string_author_candidate_rejects_multiple_emails_on_one_line() {
        let two_emails = "Rob Crittenden (rcritten@redhat.com) jakub@redhat.com";
        assert!(!is_binary_string_author_candidate(two_emails));
    }

    /// Unknown tokens count towards the denominator of the percentage.
    #[test]
    fn test_compute_percentage_of_license_text_counts_unknown_tokens() {
        let index = create_test_index(&[("alpha", 0), ("mit", 1)], 2);
        let text = "alpha MIT omega";
        let query = Query::from_extracted_text(text, &index, false).expect("query should build");
        let mut detection = make_detection("");
        {
            let only_match = &mut detection.matches[0];
            only_match.coordinates =
                MatchCoordinates::query_region(PositionSpan::from_positions(vec![1]));
            only_match.start_token = 1;
            only_match.end_token = 2;
        }

        let percentage = compute_percentage_of_license_text(&query, &[detection]);

        assert_eq!(percentage, 33.33);
    }

    /// Changing the minimum license score changes the options fingerprint.
    #[test]
    fn test_scan_options_fingerprint_changes_with_license_score() {
        let text_options = TextDetectionOptions::default();
        let fingerprint_for = |min_score| {
            scan_options_fingerprint(
                &text_options,
                LicenseScanOptions {
                    min_score,
                    ..LicenseScanOptions::default()
                },
                None,
            )
        };

        let default_fingerprint = fingerprint_for(0);
        let filtered_fingerprint = fingerprint_for(70);

        assert_ne!(default_fingerprint, filtered_fingerprint);
    }

    /// A `_test.go` file name marks the file as non-production.
    #[test]
    fn test_is_go_non_production_source_for_test_filename() {
        let temp_dir = tempdir().unwrap();
        let go_path = temp_dir.path().join("scanner_test.go");
        fs::write(&go_path, "package scanner\n").unwrap();

        assert!(is_go_non_production_source(&go_path).unwrap());
    }

    /// A `//go:build test` constraint marks the file as non-production.
    #[test]
    fn test_is_go_non_production_source_for_build_tag() {
        let temp_dir = tempdir().unwrap();
        let go_path = temp_dir.path().join("scanner.go");
        fs::write(&go_path, "//go:build test\n\npackage scanner\n").unwrap();

        assert!(is_go_non_production_source(&go_path).unwrap());
    }

    /// An ordinary Go file is production code.
    #[test]
    fn test_is_go_non_production_source_for_regular_go_file() {
        let temp_dir = tempdir().unwrap();
        let go_path = temp_dir.path().join("scanner.go");
        fs::write(&go_path, "package scanner\n").unwrap();

        assert!(!is_go_non_production_source(&go_path).unwrap());
    }
}