1use crate::license_detection::LicenseDetectionEngine;
2use crate::parsers::compiled_binary::{
3 is_supported_compiled_binary_format, try_parse_compiled_bytes,
4};
5use crate::parsers::try_parse_file;
6use crate::parsers::windows_executable::try_parse_windows_executable_bytes;
7use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha1_git, calculate_sha256};
8use crate::utils::text::{
9 remove_verbatim_escape_sequences, should_remove_verbatim_escape_sequences,
10};
11use anyhow::Error;
12use rayon::prelude::*;
13use std::collections::HashSet;
14use std::fs::{self, File};
15use std::io::{Read, Write};
16use std::path::Path;
17use std::sync::Arc;
18use std::time::{Duration, Instant};
19
20use crate::copyright::{
21 self, AuthorDetection, CopyrightDetection, CopyrightDetectionOptions, HolderDetection,
22};
23use crate::finder::{self, DetectionConfig};
24use crate::license_detection::PositionSet;
25use crate::license_detection::models::LicenseMatch as InternalLicenseMatch;
26use crate::license_detection::query::Query;
27use crate::models::{
28 Author, Copyright, DatasourceId, FileInfo, FileInfoBuilder, FileType, Holder, LicenseDetection,
29 LineNumber, Match, OutputEmail, OutputURL, Sha256Digest,
30};
31use crate::parsers::utils::split_name_email;
32use crate::progress::ScanProgress;
33use crate::scanner::collect::CollectedPaths;
34use crate::scanner::{LicenseScanOptions, ProcessResult, TextDetectionOptions};
35use crate::utils::file::{
36 ExtractedTextKind, augment_license_detection_text, classify_file_info,
37 extract_text_for_detection_with_diagnostics, get_creation_date,
38};
39use crate::utils::generated::generated_code_hints_from_bytes;
40use tempfile::TempDir;
41
/// Marker pairs for PEM-encoded certificate blocks.
///
/// Each entry is `(begin_marker, end_marker)`: the literal line that opens a block and the
/// literal line that closes it.
const PEM_CERTIFICATE_HEADERS: &[(&str, &str)] = &[
    // Plain X.509 certificate block.
    ("-----BEGIN CERTIFICATE-----", "-----END CERTIFICATE-----"),
    // "TRUSTED CERTIFICATE" variant of the same block.
    (
        "-----BEGIN TRUSTED CERTIFICATE-----",
        "-----END TRUSTED CERTIFICATE-----",
    ),
];
49
50pub fn process_collected(
51 collected: &CollectedPaths,
52 progress: Arc<ScanProgress>,
53 license_engine: Option<Arc<LicenseDetectionEngine>>,
54 license_options: LicenseScanOptions,
55 text_options: &TextDetectionOptions,
56) -> ProcessResult {
57 let mut all_files: Vec<FileInfo> = collected
58 .files
59 .par_iter()
60 .map(|(path, metadata)| {
61 let file_entry = process_file(
62 path,
63 metadata,
64 progress.as_ref(),
65 license_engine.clone(),
66 license_options,
67 text_options,
68 );
69 progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
70 file_entry
71 })
72 .collect();
73
74 for (path, metadata) in &collected.directories {
75 all_files.push(process_directory(
76 path,
77 metadata,
78 text_options.collect_info,
79 license_engine.is_some(),
80 ));
81 }
82
83 ProcessResult {
84 files: all_files,
85 excluded_count: collected.excluded_count,
86 }
87}
88
89pub fn process_collected_sequential(
90 collected: &CollectedPaths,
91 progress: Arc<ScanProgress>,
92 license_engine: Option<Arc<LicenseDetectionEngine>>,
93 license_options: LicenseScanOptions,
94 text_options: &TextDetectionOptions,
95) -> ProcessResult {
96 let mut all_files: Vec<FileInfo> =
97 Vec::with_capacity(collected.files.len() + collected.directories.len());
98
99 for (path, metadata) in &collected.files {
100 let file_entry = process_file(
101 path,
102 metadata,
103 progress.as_ref(),
104 license_engine.clone(),
105 license_options,
106 text_options,
107 );
108 progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
109 all_files.push(file_entry);
110 }
111
112 for (path, metadata) in &collected.directories {
113 all_files.push(process_directory(
114 path,
115 metadata,
116 text_options.collect_info,
117 license_engine.is_some(),
118 ));
119 }
120
121 ProcessResult {
122 files: all_files,
123 excluded_count: collected.excluded_count,
124 }
125}
126
127pub fn process_collected_with_memory_limit(
128 collected: &CollectedPaths,
129 progress: Arc<ScanProgress>,
130 license_engine: Option<Arc<LicenseDetectionEngine>>,
131 license_options: LicenseScanOptions,
132 text_options: &TextDetectionOptions,
133 max_in_memory: i64,
134) -> ProcessResult {
135 if max_in_memory == 0 {
136 return process_collected(
137 collected,
138 progress,
139 license_engine,
140 license_options,
141 text_options,
142 );
143 }
144
145 let memory_limit = if max_in_memory < 0 {
146 0
147 } else {
148 max_in_memory as usize
149 };
150 let chunk_size = if max_in_memory < 0 {
151 256
152 } else {
153 memory_limit.max(1)
154 };
155
156 let mut retained_files = Vec::new();
157 let mut spill_store = None;
158
159 for chunk in collected.files.chunks(chunk_size) {
160 let processed_chunk: Vec<FileInfo> = chunk
161 .par_iter()
162 .map(|(path, metadata)| {
163 let file_entry = process_file(
164 path,
165 metadata,
166 progress.as_ref(),
167 license_engine.clone(),
168 license_options,
169 text_options,
170 );
171 progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
172 file_entry
173 })
174 .collect();
175
176 retain_or_spill_chunk(
177 processed_chunk,
178 &mut retained_files,
179 &mut spill_store,
180 memory_limit,
181 );
182 }
183
184 for (path, metadata) in &collected.directories {
185 let entry = process_directory(
186 path,
187 metadata,
188 text_options.collect_info,
189 license_engine.is_some(),
190 );
191 retain_or_spill_chunk(
192 vec![entry],
193 &mut retained_files,
194 &mut spill_store,
195 memory_limit,
196 );
197 }
198
199 if let Some(spill_store) = spill_store {
200 retained_files.extend(spill_store.load_all());
201 }
202
203 ProcessResult {
204 files: retained_files,
205 excluded_count: collected.excluded_count,
206 }
207}
208
209pub fn process_collected_with_memory_limit_sequential(
210 collected: &CollectedPaths,
211 progress: Arc<ScanProgress>,
212 license_engine: Option<Arc<LicenseDetectionEngine>>,
213 license_options: LicenseScanOptions,
214 text_options: &TextDetectionOptions,
215 max_in_memory: i64,
216) -> ProcessResult {
217 if max_in_memory == 0 {
218 return process_collected_sequential(
219 collected,
220 progress,
221 license_engine,
222 license_options,
223 text_options,
224 );
225 }
226
227 let memory_limit = if max_in_memory < 0 {
228 0
229 } else {
230 max_in_memory as usize
231 };
232 let chunk_size = if max_in_memory < 0 {
233 256
234 } else {
235 memory_limit.max(1)
236 };
237
238 let mut retained_files = Vec::new();
239 let mut spill_store = None;
240
241 for chunk in collected.files.chunks(chunk_size) {
242 let mut processed_chunk: Vec<FileInfo> = Vec::with_capacity(chunk.len());
243 for (path, metadata) in chunk {
244 let file_entry = process_file(
245 path,
246 metadata,
247 progress.as_ref(),
248 license_engine.clone(),
249 license_options,
250 text_options,
251 );
252 progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
253 processed_chunk.push(file_entry);
254 }
255
256 retain_or_spill_chunk(
257 processed_chunk,
258 &mut retained_files,
259 &mut spill_store,
260 memory_limit,
261 );
262 }
263
264 for (path, metadata) in &collected.directories {
265 let entry = process_directory(
266 path,
267 metadata,
268 text_options.collect_info,
269 license_engine.is_some(),
270 );
271 retain_or_spill_chunk(
272 vec![entry],
273 &mut retained_files,
274 &mut spill_store,
275 memory_limit,
276 );
277 }
278
279 if let Some(spill_store) = spill_store {
280 retained_files.extend(spill_store.load_all());
281 }
282
283 ProcessResult {
284 files: retained_files,
285 excluded_count: collected.excluded_count,
286 }
287}
288
289fn retain_or_spill_chunk(
290 chunk: Vec<FileInfo>,
291 retained_files: &mut Vec<FileInfo>,
292 spill_store: &mut Option<FileInfoSpillStore>,
293 memory_limit: usize,
294) {
295 if memory_limit == 0 {
296 spill_store
297 .get_or_insert_with(FileInfoSpillStore::new)
298 .spill(chunk);
299 return;
300 }
301
302 let remaining_capacity = memory_limit.saturating_sub(retained_files.len());
303 if remaining_capacity >= chunk.len() && spill_store.is_none() {
304 retained_files.extend(chunk);
305 return;
306 }
307
308 let mut chunk_iter = chunk.into_iter();
309 retained_files.extend(chunk_iter.by_ref().take(remaining_capacity));
310 let overflow: Vec<FileInfo> = chunk_iter.collect();
311 if !overflow.is_empty() {
312 spill_store
313 .get_or_insert_with(FileInfoSpillStore::new)
314 .spill(overflow);
315 }
316}
317
318struct FileInfoSpillStore {
319 temp_dir: TempDir,
320 batch_index: usize,
321}
322
323impl FileInfoSpillStore {
324 fn new() -> Self {
325 Self {
326 temp_dir: TempDir::new().expect("create spill dir"),
327 batch_index: 0,
328 }
329 }
330
331 fn spill(&mut self, files: Vec<FileInfo>) {
332 let path = self
333 .temp_dir
334 .path()
335 .join(format!("batch-{:06}.json.zst", self.batch_index));
336 self.batch_index += 1;
337
338 let payload = serde_json::to_vec(&files).expect("encode spilled file batch");
339 let file = File::create(path).expect("create spill batch file");
340 let mut encoder = zstd::Encoder::new(file, 3).expect("create spill encoder");
341 encoder
342 .write_all(&payload)
343 .expect("write spilled file batch");
344 encoder.finish().expect("finish spill encoder");
345 }
346
347 fn load_all(self) -> Vec<FileInfo> {
348 let mut paths: Vec<_> = fs::read_dir(self.temp_dir.path())
349 .expect("read spill dir")
350 .filter_map(Result::ok)
351 .map(|entry| entry.path())
352 .collect();
353 paths.sort();
354
355 let mut files = Vec::new();
356 for path in paths {
357 let file = File::open(path).expect("open spill batch");
358 let mut decoder = zstd::Decoder::new(file).expect("create spill decoder");
359 let mut payload = Vec::new();
360 decoder.read_to_end(&mut payload).expect("read spill batch");
361 let mut batch: Vec<FileInfo> =
362 serde_json::from_slice(&payload).expect("decode spilled file batch");
363 files.append(&mut batch);
364 }
365 files
366 }
367}
368
369fn process_file(
370 path: &Path,
371 metadata: &fs::Metadata,
372 progress: &ScanProgress,
373 license_engine: Option<Arc<LicenseDetectionEngine>>,
374 license_options: LicenseScanOptions,
375 text_options: &TextDetectionOptions,
376) -> FileInfo {
377 let mut scan_errors: Vec<String> = vec![];
378 let mut file_info_builder = FileInfoBuilder::default();
379 let license_enabled = license_engine.is_some();
380
381 let started = Instant::now();
382
383 let mut generated_flag = None;
384 let mut is_source_file = false;
385 match extract_information_from_content(
386 &mut file_info_builder,
387 &mut scan_errors,
388 path,
389 progress,
390 license_engine,
391 license_options,
392 text_options,
393 ) {
394 Ok((is_generated, sha256, is_source)) => {
395 generated_flag = is_generated;
396 is_source_file = is_source;
397 let _ = sha256;
398 }
399 Err(e) => scan_errors.push(e.to_string()),
400 };
401
402 maybe_record_processing_timeout(&mut scan_errors, started, text_options.timeout_seconds);
403
404 let mut file_info = file_info_builder
405 .name(path.file_name().unwrap().to_string_lossy().to_string())
406 .base_name(
407 path.file_stem()
408 .unwrap_or_default()
409 .to_string_lossy()
410 .to_string(),
411 )
412 .extension(
413 path.extension()
414 .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
415 )
416 .path(path.to_string_lossy().to_string())
417 .file_type(FileType::File)
418 .size(metadata.len())
419 .date(
420 text_options
421 .collect_info
422 .then(|| get_creation_date(metadata))
423 .flatten(),
424 )
425 .scan_errors(scan_errors)
426 .build()
427 .expect("FileInformationBuild not completely initialized");
428
429 if text_options.collect_info {
430 file_info.is_source = Some(is_source_file);
431 }
432
433 if file_info.programming_language.as_deref() == Some("Go")
434 && is_go_non_production_source(path).unwrap_or(false)
435 {
436 file_info.is_source = Some(false);
437 }
438
439 if text_options.detect_generated {
440 file_info.is_generated = Some(generated_flag.unwrap_or(false));
441 }
442
443 if file_info.percentage_of_license_text.is_none() && license_enabled {
444 file_info.percentage_of_license_text = Some(0.0);
445 }
446
447 file_info
448}
449
450fn extract_information_from_content(
451 file_info_builder: &mut FileInfoBuilder,
452 scan_errors: &mut Vec<String>,
453 path: &Path,
454 progress: &ScanProgress,
455 license_engine: Option<Arc<LicenseDetectionEngine>>,
456 license_options: LicenseScanOptions,
457 text_options: &TextDetectionOptions,
458) -> Result<(Option<bool>, Sha256Digest, bool), Error> {
459 let started = Instant::now();
460 let buffer = fs::read(path)?;
461 let license_enabled = license_engine.is_some();
462
463 if is_timeout_exceeded(started, text_options.timeout_seconds) {
464 return Err(Error::msg(format!(
465 "Timeout while reading file content (> {:.2}s)",
466 text_options.timeout_seconds
467 )));
468 }
469
470 let sha256 = calculate_sha256(&buffer);
471 let is_generated = text_options
472 .detect_generated
473 .then(|| !generated_code_hints_from_bytes(&buffer).is_empty());
474 let classification = classify_file_info(path, &buffer);
475
476 if text_options.collect_info {
477 file_info_builder
478 .sha1(Some(calculate_sha1(&buffer)))
479 .md5(Some(calculate_md5(&buffer)))
480 .sha256(Some(sha256))
481 .programming_language(classification.programming_language.clone())
482 .mime_type(Some(classification.mime_type.clone()))
483 .file_type_label(Some(classification.file_type.clone()))
484 .sha1_git(Some(calculate_sha1_git(&buffer)))
485 .is_binary(Some(classification.is_binary))
486 .is_text(Some(classification.is_text))
487 .is_archive(Some(classification.is_archive))
488 .is_media(Some(classification.is_media))
489 .is_source(Some(classification.is_source))
490 .is_script(Some(classification.is_script))
491 .files_count(Some(0))
492 .dirs_count(Some(0))
493 .size_count(Some(0));
494 }
495
496 if should_skip_text_detection(path, &buffer) {
497 return Ok((is_generated, sha256, classification.is_source));
498 }
499
500 if text_options.detect_packages {
503 let started = Instant::now();
504 let parse_result = try_parse_file(path)
505 .or_else(|| {
506 text_options
507 .detect_application_packages
508 .then(|| try_parse_windows_executable_bytes(path, &buffer))
509 .flatten()
510 })
511 .or_else(|| {
512 text_options
513 .detect_packages_in_compiled
514 .then(|| {
515 (classification.is_binary && is_supported_compiled_binary_format(&buffer))
516 .then(|| try_parse_compiled_bytes(&buffer))
517 .flatten()
518 })
519 .flatten()
520 });
521
522 if let Some(parse_result) = parse_result {
523 let packages = parse_result
524 .packages
525 .into_iter()
526 .filter(|package| {
527 let is_compiled_package = package
528 .datasource_id
529 .as_ref()
530 .is_some_and(is_compiled_datasource);
531 let is_system_package = package
532 .datasource_id
533 .as_ref()
534 .is_some_and(is_system_datasource);
535 if is_compiled_package {
536 text_options.detect_packages_in_compiled
537 } else if is_system_package {
538 text_options.detect_system_packages
539 } else {
540 text_options.detect_application_packages
541 }
542 })
543 .collect();
544 file_info_builder.package_data(packages);
545 scan_errors.extend(parse_result.scan_errors);
546 }
547 progress.record_detail_timing("scan:packages", started.elapsed().as_secs_f64());
548 }
549
550 if is_timeout_exceeded(started, text_options.timeout_seconds) {
551 return Err(Error::msg(format!(
552 "Timeout while extracting package/text metadata (> {:.2}s)",
553 text_options.timeout_seconds
554 )));
555 }
556
557 let (text_content, text_kind, text_scan_error) =
558 extract_text_for_detection_with_diagnostics(path, &buffer);
559 if let Some(text_scan_error) = text_scan_error {
560 scan_errors.push(text_scan_error);
561 }
562 let from_binary_strings = matches!(text_kind, ExtractedTextKind::BinaryStrings);
563
564 if is_timeout_exceeded(started, text_options.timeout_seconds) {
565 return Err(Error::msg(format!(
566 "Timeout while extracting text content (> {:.2}s)",
567 text_options.timeout_seconds
568 )));
569 }
570
571 if text_content.is_empty() {
572 return Ok((is_generated, sha256, classification.is_source));
573 }
574
575 if text_options.detect_copyrights {
576 extract_copyright_information(
577 file_info_builder,
578 path,
579 &text_content,
580 text_options.timeout_seconds,
581 from_binary_strings,
582 );
583 }
584 extract_email_url_information(
585 file_info_builder,
586 &text_content,
587 text_options,
588 from_binary_strings,
589 );
590
591 if is_timeout_exceeded(started, text_options.timeout_seconds) {
592 return Err(Error::msg(format!(
593 "Timeout before license scan (> {:.2}s)",
594 text_options.timeout_seconds
595 )));
596 }
597 let text_content_for_license_detection = if crate::utils::sourcemap::is_sourcemap(path) {
599 if let Some(sourcemap_content) =
600 crate::utils::sourcemap::extract_sourcemap_content(&text_content)
601 {
602 sourcemap_content
603 } else {
604 text_content
605 }
606 } else if should_remove_verbatim_escape_sequences(path, classification.is_source) {
607 remove_verbatim_escape_sequences(&text_content)
608 } else {
609 text_content
610 };
611 let text_content_for_license_detection =
612 augment_license_detection_text(path, &text_content_for_license_detection);
613 let text_content_for_license_detection = text_content_for_license_detection.into_owned();
614
615 if license_enabled {
616 let started = Instant::now();
617 extract_license_information(
618 file_info_builder,
619 scan_errors,
620 path,
621 text_content_for_license_detection.clone(),
622 license_engine,
623 license_options,
624 from_binary_strings,
625 )?;
626 progress.record_detail_timing("scan:licenses", started.elapsed().as_secs_f64());
627 } else {
628 extract_license_information(
629 file_info_builder,
630 scan_errors,
631 path,
632 text_content_for_license_detection,
633 license_engine,
634 license_options,
635 from_binary_strings,
636 )?;
637 }
638
639 Ok((is_generated, sha256, classification.is_source))
640}
641
/// Reports whether more than `timeout_seconds` have elapsed since `started`.
///
/// A timeout that is not finite (NaN, infinite) or not strictly positive disables the check,
/// so the function returns `false` for it.
fn is_timeout_exceeded(started: Instant, timeout_seconds: f64) -> bool {
    if !timeout_seconds.is_finite() || timeout_seconds <= 0.0 {
        return false;
    }
    let elapsed_seconds = started.elapsed().as_secs_f64();
    elapsed_seconds > timeout_seconds
}
647
648fn maybe_record_processing_timeout(
649 scan_errors: &mut Vec<String>,
650 started: Instant,
651 timeout_seconds: f64,
652) {
653 if is_timeout_exceeded(started, timeout_seconds)
654 && !scan_errors.iter().any(|error| is_timeout_scan_error(error))
655 {
656 scan_errors.push(format!(
657 "Processing interrupted due to timeout after {:.2} seconds",
658 timeout_seconds
659 ));
660 }
661}
662
/// Reports whether a scan-error message is one of the timeout messages.
///
/// The check is a substring match against the fixed phrases used for timeout errors.
fn is_timeout_scan_error(error: &str) -> bool {
    const TIMEOUT_PHRASES: [&str; 3] = [
        "Timeout while ",
        "Timeout before ",
        "Processing interrupted due to timeout",
    ];

    TIMEOUT_PHRASES
        .iter()
        .any(|&phrase| error.contains(phrase))
}
668
/// Returns `true` when `datasource_id` is one of the datasource variants listed below.
///
/// The caller uses this to apply the `detect_system_packages` switch to a package.
fn is_system_datasource(datasource_id: &DatasourceId) -> bool {
    matches!(
        datasource_id,
        DatasourceId::AlpineInstalledDb
            | DatasourceId::DebianDistrolessInstalledDb
            | DatasourceId::DebianInstalledFilesList
            | DatasourceId::DebianInstalledMd5Sums
            | DatasourceId::DebianInstalledStatusDb
            | DatasourceId::FreebsdCompactManifest
            | DatasourceId::RpmInstalledDatabaseBdb
            | DatasourceId::RpmInstalledDatabaseNdb
            | DatasourceId::RpmInstalledDatabaseSqlite
            | DatasourceId::RpmYumdb
    )
}
684
/// Returns `true` when `datasource_id` is `GoBinary` or `RustBinary`.
///
/// The caller uses this to apply the `detect_packages_in_compiled` switch to a package.
fn is_compiled_datasource(datasource_id: &DatasourceId) -> bool {
    matches!(
        datasource_id,
        DatasourceId::GoBinary | DatasourceId::RustBinary
    )
}
691
692fn extract_copyright_information(
693 file_info_builder: &mut FileInfoBuilder,
694 path: &Path,
695 text_content: &str,
696 timeout_seconds: f64,
697 from_binary_strings: bool,
698) {
699 if copyright::is_credits_file(path) {
701 let author_detections = copyright::detect_credits_authors(text_content);
702 if !author_detections.is_empty() {
703 file_info_builder.authors(
704 author_detections
705 .into_iter()
706 .map(|a| Author {
707 author: a.author,
708 start_line: a.start_line,
709 end_line: a.end_line,
710 })
711 .collect(),
712 );
713 return;
714 }
715 }
716
717 let copyright_options = CopyrightDetectionOptions {
718 max_runtime: if timeout_seconds.is_finite() && timeout_seconds > 0.0 {
719 Some(Duration::from_secs_f64(timeout_seconds))
720 } else {
721 None
722 },
723 ..CopyrightDetectionOptions::default()
724 };
725
726 let (copyrights, holders, authors) =
727 copyright::detect_copyrights_with_options(text_content, ©right_options);
728 let (copyrights, holders, authors) = if from_binary_strings {
729 prune_binary_string_detections(text_content, copyrights, holders, authors)
730 } else {
731 (copyrights, holders, authors)
732 };
733
734 file_info_builder.copyrights(
735 copyrights
736 .into_iter()
737 .map(|c| Copyright {
738 copyright: c.copyright,
739 start_line: c.start_line,
740 end_line: c.end_line,
741 })
742 .collect::<Vec<Copyright>>(),
743 );
744 file_info_builder.holders(
745 holders
746 .into_iter()
747 .map(|h| Holder {
748 holder: h.holder,
749 start_line: h.start_line,
750 end_line: h.end_line,
751 })
752 .collect::<Vec<Holder>>(),
753 );
754 file_info_builder.authors(
755 authors
756 .into_iter()
757 .map(|a| Author {
758 author: a.author,
759 start_line: a.start_line,
760 end_line: a.end_line,
761 })
762 .collect::<Vec<Author>>(),
763 );
764}
765
766fn prune_binary_string_detections(
767 text_content: &str,
768 copyrights: Vec<CopyrightDetection>,
769 holders: Vec<HolderDetection>,
770 authors: Vec<AuthorDetection>,
771) -> (
772 Vec<CopyrightDetection>,
773 Vec<HolderDetection>,
774 Vec<AuthorDetection>,
775) {
776 let kept_copyrights: Vec<CopyrightDetection> = copyrights
777 .into_iter()
778 .filter(|c| is_binary_string_copyright_candidate(&c.copyright))
779 .collect();
780
781 let kept_holders: Vec<HolderDetection> = holders
782 .into_iter()
783 .filter(|holder| {
784 kept_copyrights.iter().any(|copyright| {
785 ranges_overlap(
786 holder.start_line,
787 holder.end_line,
788 copyright.start_line,
789 copyright.end_line,
790 )
791 })
792 })
793 .collect();
794
795 let kept_authors = authors
796 .into_iter()
797 .filter(|author| is_binary_string_author_candidate(&author.author))
798 .chain(extract_binary_string_author_supplements(text_content))
799 .filter({
800 let mut seen = HashSet::new();
801 move |author| seen.insert(author.author.clone())
802 })
803 .collect();
804
805 (kept_copyrights, kept_holders, kept_authors)
806}
807
808fn ranges_overlap(
809 a_start: LineNumber,
810 a_end: LineNumber,
811 b_start: LineNumber,
812 b_end: LineNumber,
813) -> bool {
814 a_start <= b_end && b_start <= a_end
815}
816
817fn is_binary_string_copyright_candidate(text: &str) -> bool {
818 if contains_year(text) {
819 return true;
820 }
821
822 let trimmed = text.trim();
823 let lower = trimmed.to_ascii_lowercase();
824 let tail = if let Some(tail) = lower.strip_prefix("copyright") {
825 tail.trim()
826 } else {
827 lower.trim()
828 };
829 let original_tail = if lower.starts_with("copyright") {
830 trimmed["copyright".len()..].trim()
831 } else {
832 trimmed
833 };
834
835 if tail.is_empty() || !has_sufficient_alphabetic_content(tail) || has_excessive_at_noise(tail) {
836 return false;
837 }
838
839 let alpha_tokens: Vec<&str> = tail
840 .split_whitespace()
841 .filter(|token| token.chars().any(|c| c.is_alphabetic()))
842 .collect();
843
844 if alpha_tokens.len() <= 1 {
845 return has_explicit_copyright_marker(text)
846 && alpha_tokens.iter().any(|token| {
847 is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric()))
848 });
849 }
850
851 if !has_explicit_copyright_marker(text) {
852 return false;
853 }
854
855 has_binary_name_like_shape(original_tail)
856}
857
858fn extract_binary_string_author_supplements(text_content: &str) -> Vec<AuthorDetection> {
859 let mut authors = Vec::new();
860
861 for (line_index, line) in text_content.lines().enumerate() {
862 if let Some(author) = extract_named_author_from_binary_line(line) {
863 authors.push(AuthorDetection {
864 author,
865 start_line: LineNumber::from_0_indexed(line_index),
866 end_line: LineNumber::from_0_indexed(line_index),
867 });
868 }
869 }
870
871 authors
872}
873
874fn extract_named_author_from_binary_line(line: &str) -> Option<String> {
875 let line = line.trim();
876 if line.is_empty() {
877 return None;
878 }
879
880 let emails = finder::find_emails(
881 line,
882 &DetectionConfig {
883 max_emails: 4,
884 max_urls: 0,
885 unique: false,
886 },
887 );
888 let email = emails.first()?.email.as_str();
889 if !is_binary_string_email_candidate(email) {
890 return None;
891 }
892
893 let lower_line = line.to_ascii_lowercase();
894 let email_start = lower_line.find(email)?;
895 let raw_prefix = &line[..email_start];
896 let has_author_marker = contains_binary_author_marker(raw_prefix);
897 let prefix = take_suffix_after_last_author_marker(raw_prefix)?;
898 let prefix = prefix
899 .trim_start_matches(['*', '-', ':', ';', ',', '.', ' '])
900 .trim_end_matches(['<', '(', '[', ' ', ':', '-'])
901 .trim();
902
903 let (name, _) = split_name_email(prefix);
904 let name = name.or_else(|| {
905 let trimmed = prefix.trim_matches(|c: char| c == '<' || c == '(' || c == '[' || c == ' ');
906 (!trimmed.is_empty()).then(|| trimmed.to_string())
907 });
908
909 let Some(name) = name.map(|name| name.trim().to_string()) else {
910 if has_author_marker {
911 return Some(email.to_string());
912 }
913 return None;
914 };
915
916 if name.is_empty() && has_author_marker {
917 return Some(email.to_string());
918 }
919
920 if !has_binary_name_like_shape(&name) {
921 return None;
922 }
923
924 if line.contains(&format!("<{email}>")) {
925 Some(format!("{name} <{email}>"))
926 } else if line.contains(&format!("({email})")) {
927 Some(format!("{name} ({email})"))
928 } else {
929 Some(format!("{name} {email}"))
930 }
931}
932
/// Returns the trimmed text that follows the last occurrence of `marker` in `text`.
///
/// The search runs on an ASCII-lowercased copy of `text`, so `marker` is expected to be
/// lowercase; matching is therefore ASCII case-insensitive. Returns `None` when the marker
/// does not occur.
fn take_suffix_after_last_ascii_marker<'a>(text: &'a str, marker: &str) -> Option<&'a str> {
    // ASCII lowercasing keeps byte offsets, so the index found is valid for `text` too.
    text.to_ascii_lowercase()
        .rfind(marker)
        .map(|marker_start| text[marker_start + marker.len()..].trim())
}
938
939fn take_suffix_after_last_author_marker(text: &str) -> Option<&str> {
940 const MARKERS: &[&str] = &[
941 " patch author: ",
942 " patch author ",
943 " written by ",
944 " contributed by ",
945 " original work done by ",
946 " work done by ",
947 " thanks to ",
948 " review by ",
949 " by ",
950 " from ",
951 ];
952
953 MARKERS
954 .iter()
955 .filter_map(|marker| take_suffix_after_last_ascii_marker(text, marker))
956 .next()
957}
958
/// Returns `true` when `text` contains at least one of the author marker phrases, that is,
/// when `take_suffix_after_last_author_marker` finds a match.
fn contains_binary_author_marker(text: &str) -> bool {
    take_suffix_after_last_author_marker(text).is_some()
}
962
963fn has_binary_name_like_shape(text: &str) -> bool {
964 let trimmed = text.trim();
965 if trimmed.is_empty() || trimmed.contains(" - ") || trimmed.chars().any(|c| c.is_ascii_digit())
966 {
967 return false;
968 }
969
970 let tokens: Vec<&str> = trimmed
971 .split(|c: char| !c.is_ascii_alphabetic() && c != '.' && c != '\'')
972 .filter(|segment| segment.chars().any(|c| c.is_ascii_alphabetic()))
973 .collect();
974 if tokens.is_empty() {
975 return false;
976 }
977
978 let uppercase_like = tokens
979 .iter()
980 .filter(|token| {
981 let token = token.trim_matches('.');
982 token
983 .chars()
984 .find(|c| c.is_ascii_alphabetic())
985 .is_some_and(|c| c.is_ascii_uppercase())
986 })
987 .count();
988
989 uppercase_like >= 2 && uppercase_like * 2 >= tokens.len()
990 || tokens
991 .iter()
992 .any(|token| is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric())))
993}
994
/// Reports whether at least half of the ASCII alphanumeric characters in `text` are letters.
///
/// Text without any ASCII alphanumeric character is rejected.
fn has_sufficient_alphabetic_content(text: &str) -> bool {
    let (alpha_count, alnum_count) =
        text.chars()
            .fold((0usize, 0usize), |(alpha, alnum), c| {
                (
                    alpha + usize::from(c.is_ascii_alphabetic()),
                    alnum + usize::from(c.is_ascii_alphanumeric()),
                )
            });

    alnum_count != 0 && alpha_count * 2 >= alnum_count
}
1004
/// Reports whether `text` contains three or more `@` characters.
fn has_excessive_at_noise(text: &str) -> bool {
    // Reaching a third match is enough; the rest of the text need not be inspected.
    text.matches('@').nth(2).is_some()
}
1008
/// Reports whether `text` contains `(c)`, `©` or `copr`, ignoring ASCII case.
fn has_explicit_copyright_marker(text: &str) -> bool {
    const MARKERS: [&str; 3] = ["(c)", "©", "copr"];

    let lowered = text.to_ascii_lowercase();
    MARKERS.iter().any(|&marker| lowered.contains(marker))
}
1013
/// Reports whether `text` contains four consecutive ASCII digits whose first digit is `1` or
/// `2` and whose second digit is `9` or `0` (for example `1999` or `2024`).
fn contains_year(text: &str) -> bool {
    text.as_bytes().windows(4).any(|window| match window {
        [b'1' | b'2', b'9' | b'0', third, fourth] => {
            third.is_ascii_digit() && fourth.is_ascii_digit()
        }
        _ => false,
    })
}
1022
/// Reports whether `token` equals one of the known organisation words (such as "inc", "ltd"
/// or "foundation"), ignoring ASCII case. The whole token must match.
fn is_company_like_suffix(token: &str) -> bool {
    const COMPANY_WORDS: [&str; 14] = [
        "inc",
        "corp",
        "corporation",
        "co",
        "company",
        "ltd",
        "llc",
        "gmbh",
        "foundation",
        "project",
        "systems",
        "software",
        "technologies",
        "technology",
    ];

    COMPANY_WORDS
        .iter()
        .any(|word| token.eq_ignore_ascii_case(word))
}
1042
1043fn extract_email_url_information(
1044 file_info_builder: &mut FileInfoBuilder,
1045 text_content: &str,
1046 text_options: &TextDetectionOptions,
1047 from_binary_strings: bool,
1048) {
1049 if !text_options.detect_emails && !text_options.detect_urls {
1050 return;
1051 }
1052
1053 if text_options.detect_emails {
1054 let config = DetectionConfig {
1055 max_emails: text_options.max_emails,
1056 max_urls: text_options.max_urls,
1057 unique: from_binary_strings,
1058 };
1059 let emails = finder::find_emails(text_content, &config)
1060 .into_iter()
1061 .filter(|d| !from_binary_strings || is_binary_string_email_candidate(&d.email))
1062 .map(|d| OutputEmail {
1063 email: d.email,
1064 start_line: d.start_line,
1065 end_line: d.end_line,
1066 })
1067 .collect::<Vec<_>>();
1068 file_info_builder.emails(emails);
1069 }
1070
1071 if text_options.detect_urls {
1072 let config = DetectionConfig {
1073 max_emails: text_options.max_emails,
1074 max_urls: text_options.max_urls,
1075 unique: true,
1076 };
1077 let urls = finder::find_urls(text_content, &config)
1078 .into_iter()
1079 .filter(|d| !from_binary_strings || is_binary_string_url_candidate(&d.url))
1080 .map(|d| OutputURL {
1081 url: d.url,
1082 start_line: d.start_line,
1083 end_line: d.end_line,
1084 })
1085 .collect::<Vec<_>>();
1086 file_info_builder.urls(urls);
1087 }
1088}
1089
1090fn is_binary_string_email_candidate(email: &str) -> bool {
1091 let Some((local, domain)) = email.rsplit_once('@') else {
1092 return false;
1093 };
1094
1095 if !has_strong_binary_local_part(local) {
1096 return false;
1097 }
1098
1099 has_strong_binary_host_shape(domain)
1100}
1101
1102fn is_binary_string_url_candidate(url: &str) -> bool {
1103 let parsed = url::Url::parse(url).ok();
1104 let Some(parsed) = parsed else {
1105 return false;
1106 };
1107 let Some(host) = parsed.host_str() else {
1108 return false;
1109 };
1110
1111 has_strong_binary_host_shape(host) && has_meaningful_binary_url_context(&parsed)
1112}
1113
1114fn is_binary_string_author_candidate(author: &str) -> bool {
1115 let trimmed = author.trim();
1116 if trimmed.is_empty()
1117 || !has_sufficient_alphabetic_content(trimmed)
1118 || has_excessive_at_noise(trimmed)
1119 {
1120 return false;
1121 }
1122
1123 if trimmed.contains('@') {
1124 let emails = finder::find_emails(
1125 trimmed,
1126 &DetectionConfig {
1127 max_emails: 4,
1128 max_urls: 0,
1129 unique: true,
1130 },
1131 );
1132 if emails.len() > 1 {
1133 return false;
1134 }
1135
1136 if let Some(extracted) = extract_named_author_from_binary_line(trimmed) {
1137 return !extracted.is_empty();
1138 }
1139
1140 let Some(email) = emails.first().map(|d| d.email.as_str()) else {
1141 return false;
1142 };
1143 if !is_binary_string_email_candidate(email) {
1144 return false;
1145 }
1146
1147 let (name, _) = split_name_email(trimmed);
1148 return name.as_deref().is_some_and(has_binary_name_like_shape);
1149 }
1150
1151 has_binary_name_like_shape(trimmed)
1152}
1153
1154fn has_meaningful_binary_url_context(parsed: &url::Url) -> bool {
1155 if parsed.path() != "/"
1156 && parsed
1157 .path()
1158 .split('/')
1159 .any(|segment| segment.chars().any(|c| c.is_ascii_alphabetic()) && segment.len() >= 2)
1160 {
1161 return true;
1162 }
1163
1164 if parsed.query().is_some() || parsed.fragment().is_some() {
1165 return true;
1166 }
1167
1168 let Some(host) = parsed.host_str() else {
1169 return false;
1170 };
1171
1172 let labels: Vec<&str> = host.split('.').collect();
1173 if labels.len() > 2 {
1174 return labels[..labels.len() - 1].iter().any(|label| {
1175 label.len() >= 3 && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 3
1176 });
1177 }
1178
1179 if matches!(labels.first(), Some(&"www")) {
1180 return true;
1181 }
1182
1183 if labels.len() == 2 {
1184 let domain = labels[0];
1185 let tld = labels[1];
1186 if domain.len() >= 8 && matches!(tld, "org" | "edu" | "gov" | "mil" | "io" | "dev") {
1187 return true;
1188 }
1189 }
1190
1191 labels
1192 .iter()
1193 .take(labels.len().saturating_sub(1))
1194 .any(|label| {
1195 label.contains('-') && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 4
1196 })
1197}
1198
/// Returns `true` when `local` contains a run of at least three consecutive
/// ASCII letters.
fn has_strong_binary_local_part(local: &str) -> bool {
    let mut run_length = 0usize;
    for c in local.chars() {
        if c.is_ascii_alphabetic() {
            run_length += 1;
            if run_length >= 3 {
                return true;
            }
        } else {
            // Any other character ends the current run of letters.
            run_length = 0;
        }
    }
    false
}
1204
/// Reports whether a host name looks like a real host rather than binary noise.
///
/// The host is split on `.`; a leading `www` or `ftp` label is ignored. At
/// least two labels must remain, and one of the labels before the last must be
/// at least three bytes long and contain at least three ASCII letters.
fn has_strong_binary_host_shape(host: &str) -> bool {
    let labels: Vec<&str> = host.split('.').collect();

    let significant = match labels.as_slice() {
        // Fewer than two labels can never qualify.
        [] | [_] => return false,
        ["www" | "ftp", rest @ ..] => rest,
        all => all,
    };

    let Some((_last_label, candidates)) = significant.split_last() else {
        return false;
    };

    // `candidates` is empty when only one label is left after the prefix was
    // dropped; `any` then yields `false`.
    candidates.iter().any(|label| {
        let ascii_letters = label.chars().filter(|c| c.is_ascii_alphabetic()).count();
        label.len() >= 3 && ascii_letters >= 3
    })
}
1225
1226fn extract_license_information(
1227 file_info_builder: &mut FileInfoBuilder,
1228 scan_errors: &mut Vec<String>,
1229 path: &Path,
1230 text_content: String,
1231 license_engine: Option<Arc<LicenseDetectionEngine>>,
1232 license_options: LicenseScanOptions,
1233 from_binary_strings: bool,
1234) -> Result<(), Error> {
1235 let Some(engine) = license_engine else {
1236 return Ok(());
1237 };
1238
1239 let detection_result = if license_options.min_score == 0 {
1240 engine.detect_with_kind_and_source(
1241 &text_content,
1242 license_options.unknown_licenses,
1243 from_binary_strings,
1244 &path.to_string_lossy(),
1245 )
1246 } else {
1247 engine.detect_with_kind_and_source_with_score(
1248 &text_content,
1249 license_options.unknown_licenses,
1250 from_binary_strings,
1251 &path.to_string_lossy(),
1252 license_options.min_score as f32,
1253 )
1254 };
1255
1256 match detection_result {
1257 Ok(detections) => {
1258 let query =
1259 Query::from_extracted_text(&text_content, engine.index(), from_binary_strings).ok();
1260 let mut model_detections = Vec::new();
1261 let mut model_clues = Vec::new();
1262
1263 for detection in &detections {
1264 let (public_detection, clue_matches) = convert_detection_to_model(
1265 detection,
1266 license_options,
1267 &text_content,
1268 query.as_ref(),
1269 );
1270
1271 if let Some(public_detection) = public_detection {
1272 model_detections.push(public_detection);
1273 }
1274
1275 model_clues.extend(clue_matches);
1276 }
1277
1278 if !model_detections.is_empty() {
1279 let expressions: Vec<String> = model_detections
1280 .iter()
1281 .filter(|d| !d.license_expression_spdx.is_empty())
1282 .map(|d| d.license_expression_spdx.clone())
1283 .collect();
1284
1285 if !expressions.is_empty() {
1286 let combined = crate::utils::spdx::combine_license_expressions(expressions);
1287 if let Some(expr) = combined {
1288 file_info_builder.license_expression(Some(expr));
1289 }
1290 }
1291 }
1292
1293 file_info_builder.license_detections(model_detections);
1294 file_info_builder.license_clues(model_clues);
1295 file_info_builder.percentage_of_license_text(
1296 query
1297 .as_ref()
1298 .map(|query| compute_percentage_of_license_text(query, &detections)),
1299 );
1300 }
1301 Err(e) => {
1302 scan_errors.push(format!("License detection failed: {}", e));
1303 }
1304 }
1305
1306 Ok(())
1307}
1308
1309fn convert_detection_to_model(
1310 detection: &crate::license_detection::LicenseDetection,
1311 license_options: LicenseScanOptions,
1312 text_content: &str,
1313 query: Option<&Query<'_>>,
1314) -> (Option<LicenseDetection>, Vec<Match>) {
1315 let matches: Vec<Match> = detection
1316 .matches
1317 .iter()
1318 .map(|m| convert_match_to_model(m, license_options, text_content, query))
1319 .collect();
1320
1321 if let Some(license_expression) = detection.license_expression.clone() {
1322 (
1323 Some(LicenseDetection {
1324 license_expression,
1325 license_expression_spdx: detection
1326 .license_expression_spdx
1327 .clone()
1328 .unwrap_or_default(),
1329 matches,
1330 detection_log: if license_options.include_diagnostics {
1331 detection.detection_log.clone()
1332 } else {
1333 Vec::new()
1334 },
1335 identifier: detection.identifier.clone(),
1336 }),
1337 Vec::new(),
1338 )
1339 } else {
1340 (None, matches)
1341 }
1342}
1343
1344fn convert_match_to_model(
1345 m: &crate::license_detection::models::LicenseMatch,
1346 license_options: LicenseScanOptions,
1347 text_content: &str,
1348 query: Option<&Query<'_>>,
1349) -> Match {
1350 let rule_url = if m.rule_url.is_empty() {
1351 None
1352 } else {
1353 Some(m.rule_url.clone())
1354 };
1355 let matched_text = if license_options.include_text {
1356 m.matched_text.clone().or_else(|| {
1357 Some(crate::license_detection::query::matched_text_from_text(
1358 text_content,
1359 m.start_line.get(),
1360 m.end_line.get(),
1361 ))
1362 })
1363 } else {
1364 None
1365 };
1366 let matched_text_diagnostics = if license_options.include_text_diagnostics {
1367 query.map(|query| matched_text_diagnostics_from_match(query, m))
1368 } else {
1369 None
1370 };
1371 Match {
1372 license_expression: m.license_expression.clone(),
1373 license_expression_spdx: m.license_expression_spdx.clone().unwrap_or_default(),
1374 from_file: m.from_file.clone(),
1375 start_line: m.start_line,
1376 end_line: m.end_line,
1377 matcher: Some(m.matcher.to_string()),
1378 score: m.score,
1379 matched_length: Some(m.matched_length),
1380 match_coverage: Some(((m.coverage() as f64) * 100.0).round() / 100.0),
1381 rule_relevance: Some(m.rule_relevance),
1382 rule_identifier: Some(m.rule_identifier.clone()),
1383 rule_url,
1384 matched_text,
1385 referenced_filenames: m.referenced_filenames.clone(),
1386 matched_text_diagnostics,
1387 }
1388}
1389
1390fn compute_percentage_of_license_text(
1391 query: &Query<'_>,
1392 detections: &[crate::license_detection::LicenseDetection],
1393) -> f64 {
1394 let matched_positions: std::collections::HashSet<usize> = detections
1395 .iter()
1396 .flat_map(|detection| detection.matches.iter())
1397 .flat_map(|m| m.query_span().iter())
1398 .collect();
1399
1400 let query_tokens_length = query.tokens.len() + query.unknowns_by_pos.values().sum::<usize>();
1401 if query_tokens_length == 0 {
1402 return 0.0;
1403 }
1404
1405 let percentage = (matched_positions.len() as f64 / query_tokens_length as f64) * 100.0;
1406 (percentage * 100.0).round() / 100.0
1407}
1408
1409fn matched_text_diagnostics_from_match(
1410 query: &Query<'_>,
1411 license_match: &InternalLicenseMatch,
1412) -> String {
1413 let matched_positions: PositionSet = license_match.query_span().iter().collect();
1414 let Some(start_pos) = matched_positions.iter().min() else {
1415 return crate::license_detection::query::matched_text_from_text(
1416 &query.text,
1417 license_match.start_line.get(),
1418 license_match.end_line.get(),
1419 );
1420 };
1421 let Some(end_pos) = matched_positions.iter().max() else {
1422 return crate::license_detection::query::matched_text_from_text(
1423 &query.text,
1424 license_match.start_line.get(),
1425 license_match.end_line.get(),
1426 );
1427 };
1428
1429 crate::license_detection::query::matched_text_diagnostics_from_text(
1430 &query.text,
1431 query,
1432 &matched_positions,
1433 start_pos,
1434 end_pos,
1435 license_match.start_line.get(),
1436 license_match.end_line.get(),
1437 )
1438}
1439
1440fn should_skip_text_detection(path: &Path, buffer: &[u8]) -> bool {
1441 is_pem_certificate_file(path, buffer)
1442}
1443
/// Reports whether `path` is a Go source file that is not production code.
///
/// A file qualifies when its extension is `go` and either
/// * its file name ends with `_test.go` (decided without opening the file), or
/// * one of its first ten lines is a build-constraint comment (`//go:build` or
///   `// +build`) that has `test` as a whitespace-separated token.
///
/// Only the leading lines are read from disk, so large files are not loaded
/// into memory and content past the inspected header cannot cause a failure.
///
/// # Errors
///
/// Returns the underlying I/O error when the file cannot be opened or read, or
/// when one of the inspected lines is not valid UTF-8.
fn is_go_non_production_source(path: &Path) -> std::io::Result<bool> {
    /// Number of leading lines inspected for a build constraint.
    const HEADER_LINE_LIMIT: usize = 10;

    if path.extension().and_then(|ext| ext.to_str()) != Some("go") {
        return Ok(false);
    }

    if path
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| name.ends_with("_test.go"))
    {
        return Ok(true);
    }

    // Stream the header through a buffered reader instead of reading the whole file.
    let reader = std::io::BufReader::new(File::open(path)?);
    for line in std::io::BufRead::lines(reader).take(HEADER_LINE_LIMIT) {
        let line = line?;
        let trimmed = line.trim();
        let is_build_constraint =
            trimmed.starts_with("//go:build") || trimmed.starts_with("// +build");
        if is_build_constraint && trimmed.split_whitespace().any(|token| token == "test") {
            return Ok(true);
        }
    }

    Ok(false)
}
1464
1465fn is_pem_certificate_file(_path: &Path, buffer: &[u8]) -> bool {
1466 let prefix_len = buffer.len().min(8192);
1467 let prefix = String::from_utf8_lossy(&buffer[..prefix_len]);
1468 let trimmed_lines: Vec<&str> = prefix
1469 .lines()
1470 .map(str::trim)
1471 .filter(|line| !line.is_empty())
1472 .take(64)
1473 .collect();
1474
1475 let Some(first_line) = trimmed_lines.first().copied() else {
1476 return false;
1477 };
1478
1479 PEM_CERTIFICATE_HEADERS
1480 .iter()
1481 .any(|(begin, end)| first_line == *begin && trimmed_lines.iter().any(|line| line == end))
1482}
1483
1484fn process_directory(
1485 path: &Path,
1486 _metadata: &fs::Metadata,
1487 collect_info: bool,
1488 license_enabled: bool,
1489) -> FileInfo {
1490 let name = path
1491 .file_name()
1492 .unwrap_or_default()
1493 .to_string_lossy()
1494 .to_string();
1495 let base_name = name.clone(); FileInfo {
1498 name,
1499 base_name,
1500 extension: "".to_string(),
1501 path: path.to_string_lossy().to_string(),
1502 file_type: FileType::Directory,
1503 mime_type: None,
1504 file_type_label: None,
1505 size: 0,
1506 date: None,
1507 sha1: None,
1508 md5: None,
1509 sha256: None,
1510 sha1_git: None,
1511 programming_language: None,
1512 package_data: Vec::new(),
1513 license_expression: None,
1514 license_detections: Vec::new(),
1515 license_clues: Vec::new(),
1516 percentage_of_license_text: license_enabled.then_some(0.0),
1517 copyrights: Vec::new(),
1518 holders: Vec::new(),
1519 authors: Vec::new(),
1520 emails: Vec::new(),
1521 urls: Vec::new(),
1522 for_packages: Vec::new(),
1523 scan_errors: Vec::new(),
1524 license_policy: None,
1525 is_binary: collect_info.then_some(false),
1526 is_text: collect_info.then_some(false),
1527 is_archive: collect_info.then_some(false),
1528 is_media: collect_info.then_some(false),
1529 is_source: collect_info.then_some(false),
1530 is_script: collect_info.then_some(false),
1531 files_count: collect_info.then_some(0),
1532 dirs_count: collect_info.then_some(0),
1533 size_count: collect_info.then_some(0),
1534 source_count: None,
1535 is_legal: false,
1536 is_manifest: false,
1537 is_readme: false,
1538 is_top_level: false,
1539 is_key_file: false,
1540 is_community: false,
1541 is_generated: None,
1542 facets: vec![],
1543 tallies: None,
1544 }
1545}
1546
// Unit tests for the license-model conversion helpers, the binary-string
// candidate filters, timeout bookkeeping, and Go non-production detection.
#[cfg(test)]
mod tests {
    use super::{
        compute_percentage_of_license_text, convert_detection_to_model,
        extract_email_url_information, extract_named_author_from_binary_line,
        is_binary_string_author_candidate, is_binary_string_copyright_candidate,
        is_binary_string_email_candidate, is_binary_string_url_candidate,
        is_go_non_production_source, process_file,
    };
    use crate::license_detection::LicenseDetection as InternalLicenseDetection;
    use crate::license_detection::index::LicenseIndex;
    use crate::license_detection::index::dictionary::TokenDictionary;
    use crate::license_detection::models::position_span::PositionSpan;
    use crate::license_detection::models::{LicenseMatch, MatchCoordinates, MatcherKind, RuleKind};
    use crate::license_detection::query::Query;
    use crate::models::{FileInfoBuilder, FileType, MatchScore};
    use crate::progress::{ProgressMode, ScanProgress};
    use crate::scanner::scan_options_fingerprint;
    use crate::scanner::{LicenseScanOptions, TextDetectionOptions};
    use std::fs;
    use std::time::{Duration, Instant};
    use tempfile::tempdir;

    use super::maybe_record_processing_timeout;

    use crate::models::LineNumber;

    /// Builds a one-line internal MIT match whose only varying field is `rule_url`.
    fn make_internal_match(rule_url: &str) -> LicenseMatch {
        LicenseMatch {
            rid: 0,
            license_expression: "mit".to_string(),
            license_expression_spdx: Some("MIT".to_string()),
            from_file: None,
            start_line: LineNumber::ONE,
            end_line: LineNumber::ONE,
            start_token: 0,
            end_token: 1,
            matcher: MatcherKind::Hash,
            score: MatchScore::from_percentage(1.0),
            matched_length: 3,
            rule_length: 3,
            match_coverage: 100.0,
            rule_relevance: 100,
            rule_identifier: "mit.LICENSE".to_string(),
            rule_url: rule_url.to_string(),
            matched_text: Some("MIT".to_string()),
            referenced_filenames: None,
            rule_kind: RuleKind::Text,
            is_from_license: true,
            rule_start_token: 0,
            coordinates: MatchCoordinates::query_region(PositionSpan::empty()),
            candidate_resemblance: 0.0,
            candidate_containment: 0.0,
        }
    }

    /// Builds an internal MIT detection wrapping a single match from `make_internal_match`.
    fn make_detection(rule_url: &str) -> InternalLicenseDetection {
        InternalLicenseDetection {
            license_expression: Some("mit".to_string()),
            license_expression_spdx: Some("MIT".to_string()),
            matches: vec![make_internal_match(rule_url)],
            detection_log: vec![],
            identifier: Some("mit-test".to_string()),
            file_regions: Vec::new(),
        }
    }

    /// Builds a license index from the given dictionary entries with `len_legalese` overridden.
    fn create_test_index(entries: &[(&str, u16)], len_legalese: usize) -> LicenseIndex {
        let dictionary = TokenDictionary::new_with_legalese(entries);
        let mut index = LicenseIndex::new(dictionary);
        index.len_legalese = len_legalese;
        index
    }

    // A non-empty rule URL is carried through to the converted match.
    #[test]
    fn test_convert_detection_to_model_preserves_rule_url() {
        let detection = make_detection(
            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE",
        );

        let (converted, clues) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let converted = converted.expect("detection should convert");

        assert_eq!(
            converted.matches[0].rule_url.as_deref(),
            Some(
                "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE"
            )
        );
        assert!(clues.is_empty());
    }

    // An empty rule URL is converted to `None` rather than an empty string.
    #[test]
    fn test_convert_detection_to_model_emits_null_for_empty_rule_url() {
        let detection = make_detection("");

        let (converted, clues) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let converted = converted.expect("detection should convert");

        assert_eq!(converted.matches[0].rule_url, None);
        assert!(clues.is_empty());
    }

    // The score is passed through unchanged while the coverage is reported
    // with two decimal places.
    #[test]
    fn test_convert_detection_to_model_rounds_match_coverage() {
        let mut detection = make_detection("");
        detection.matches[0].score = MatchScore::from_percentage(81.82);
        detection.matches[0].match_coverage = 33.334;

        let (converted, clues) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let converted = converted.expect("detection should convert");

        assert_eq!(
            converted.matches[0].score,
            MatchScore::from_percentage(81.82)
        );
        assert_eq!(converted.matches[0].match_coverage, Some(33.33));
        assert!(clues.is_empty());
    }

    // A detection without a license expression yields no detection; its
    // matches are returned as clues instead.
    #[test]
    fn test_convert_detection_to_model_routes_expressionless_detection_to_license_clues() {
        let mut detection = make_detection(
            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/license-clue_1.RULE",
        );
        detection.license_expression = None;
        detection.license_expression_spdx = None;
        detection.identifier = None;
        detection.matches[0].license_expression = "unknown-license-reference".to_string();
        detection.matches[0].license_expression_spdx =
            Some("LicenseRef-scancode-unknown-license-reference".to_string());
        detection.matches[0].rule_identifier = "license-clue_1.RULE".to_string();
        detection.matches[0].rule_kind = RuleKind::Clue;

        let (converted, clues) = convert_detection_to_model(
            &detection,
            LicenseScanOptions {
                include_text: true,
                min_score: 0,
                ..LicenseScanOptions::default()
            },
            "clue text",
            None,
        );

        assert!(converted.is_none());
        assert_eq!(clues.len(), 1);
        assert_eq!(clues[0].license_expression, "unknown-license-reference");
        assert_eq!(
            clues[0].license_expression_spdx,
            "LicenseRef-scancode-unknown-license-reference"
        );
        assert_eq!(
            clues[0].rule_identifier.as_deref(),
            Some("license-clue_1.RULE")
        );
        // The text stored on the match wins over the supplied "clue text".
        assert_eq!(clues[0].matched_text.as_deref(), Some("MIT"));
        assert_eq!(clues[0].matched_text_diagnostics, None);
    }

    // A malformed PDF must not leave any entry in `scan_errors`.
    #[test]
    fn test_process_file_suppresses_non_actionable_pdf_extraction_failure() {
        let dir = tempdir().expect("tempdir");
        let path = dir.path().join("broken.pdf");
        fs::write(&path, b"%PDF-1.7\nthis is not a valid pdf object graph\n")
            .expect("write malformed pdf");
        let metadata = fs::metadata(&path).expect("metadata");
        let progress = ScanProgress::new(ProgressMode::Quiet);

        let file_info = process_file(
            &path,
            &metadata,
            &progress,
            None,
            LicenseScanOptions::default(),
            &TextDetectionOptions::default(),
        );

        assert!(file_info.scan_errors.is_empty());
    }

    // An existing stage-specific timeout error is not joined by a second, generic one.
    #[test]
    fn test_processing_timeout_is_not_duplicated_after_stage_specific_timeout() {
        let started = Instant::now() - Duration::from_secs(2);
        let mut scan_errors = vec!["Timeout before license scan (> 1.00s)".to_string()];

        maybe_record_processing_timeout(&mut scan_errors, started, 1.0);

        assert_eq!(scan_errors, vec!["Timeout before license scan (> 1.00s)"]);
    }

    // With no prior timeout error, exceeding the limit records the generic message.
    #[test]
    fn test_processing_timeout_is_recorded_when_no_timeout_error_exists() {
        let started = Instant::now() - Duration::from_secs(2);
        let mut scan_errors = Vec::new();

        maybe_record_processing_timeout(&mut scan_errors, started, 1.0);

        assert_eq!(
            scan_errors,
            vec!["Processing interrupted due to timeout after 1.00 seconds"]
        );
    }

    // With all diagnostics options enabled, the detection log is kept, the
    // matched text is re-extracted from the source text, and the diagnostics
    // rendering differs from the plain text and contains brackets.
    #[test]
    fn test_convert_detection_to_model_includes_diagnostics_when_enabled() {
        let text = concat!(
            "Reproduction and distribution of this file, with or without modification, are\n",
            "permitted in any medium without royalties provided the copyright notice\n",
            "and this notice are preserved. This file is offered as-is, without any warranties.\n",
        );
        let index = create_test_index(
            &[
                ("reproduction", 0),
                ("distribution", 1),
                ("file", 2),
                ("without", 3),
                ("modification", 4),
                ("permitted", 5),
                ("medium", 6),
                ("royalties", 7),
                ("provided", 8),
                ("copyright", 9),
                ("notice", 10),
                ("preserved", 11),
                ("offered", 12),
                ("warranties", 13),
            ],
            14,
        );
        let query = Query::from_extracted_text(text, &index, false).expect("query should build");
        let mut detection = make_detection(
            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/fsf-ap.LICENSE",
        );
        detection.detection_log = vec!["imperfect-match-coverage".to_string()];
        detection.matches[0].license_expression = "fsf-ap".to_string();
        detection.matches[0].license_expression_spdx = Some("FSFAP".to_string());
        detection.matches[0].rule_identifier = "fsf-ap.LICENSE".to_string();
        // No stored text, so the conversion has to fall back to the source text.
        detection.matches[0].matched_text = None;
        detection.matches[0].start_line = LineNumber::ONE;
        detection.matches[0].end_line = LineNumber::new(3).unwrap();
        detection.matches[0].start_token = 0;
        detection.matches[0].end_token = query.tokens.len();
        // Every query token position except index 9 is marked as matched.
        detection.matches[0].coordinates =
            MatchCoordinates::query_region(PositionSpan::from_positions(
                query
                    .tokens
                    .iter()
                    .enumerate()
                    .filter_map(|(idx, _)| (idx != 9).then_some(idx))
                    .collect::<Vec<_>>(),
            ));
        detection.identifier = Some("fsf_ap-test".to_string());

        let (converted, clues) = convert_detection_to_model(
            &detection,
            LicenseScanOptions {
                include_text: true,
                include_text_diagnostics: true,
                include_diagnostics: true,
                unknown_licenses: false,
                min_score: 0,
            },
            text,
            Some(&query),
        );
        let converted = converted.expect("detection should convert");

        assert!(clues.is_empty());
        assert_eq!(converted.detection_log, vec!["imperfect-match-coverage"]);
        assert_eq!(
            converted.matches[0].matched_text.as_deref(),
            Some(text.trim_end())
        );
        let diagnostics = converted.matches[0]
            .matched_text_diagnostics
            .as_deref()
            .expect("diagnostics should be present");
        assert!(diagnostics.contains('['));
        assert!(diagnostics.contains(']'));
        assert_ne!(diagnostics, text.trim_end());
    }

    // In binary-string mode, a gibberish address and a bare root-domain URL are both dropped.
    #[test]
    fn test_extract_email_url_information_skips_binary_string_text() {
        let mut builder = FileInfoBuilder::default();
        let options = TextDetectionOptions {
            collect_info: false,
            detect_packages: false,
            detect_application_packages: false,
            detect_system_packages: false,
            detect_packages_in_compiled: false,
            detect_copyrights: false,
            detect_generated: false,
            detect_emails: true,
            detect_urls: true,
            max_emails: 50,
            max_urls: 50,
            timeout_seconds: 120.0,
        };

        extract_email_url_information(
            &mut builder,
            "contact 6h@fo.lwft and visit http://gmail.com/",
            &options,
            true,
        );

        let file = builder
            .name("binary.bin".to_string())
            .base_name("binary".to_string())
            .extension(".bin".to_string())
            .path("binary.bin".to_string())
            .file_type(FileType::File)
            .size(1)
            .build()
            .expect("builder should produce file info");

        assert!(file.emails.is_empty(), "emails: {:?}", file.emails);
        assert!(file.urls.is_empty(), "urls: {:?}", file.urls);
    }

    // In binary-string mode, a plausible address and project URL are both kept.
    #[test]
    fn test_extract_email_url_information_keeps_good_binary_contacts() {
        let mut builder = FileInfoBuilder::default();
        let options = TextDetectionOptions {
            collect_info: false,
            detect_packages: false,
            detect_application_packages: false,
            detect_system_packages: false,
            detect_packages_in_compiled: false,
            detect_copyrights: false,
            detect_generated: false,
            detect_emails: true,
            detect_urls: true,
            max_emails: 50,
            max_urls: 50,
            timeout_seconds: 120.0,
        };

        extract_email_url_information(
            &mut builder,
            "report bugs to bug-coreutils@gnu.org and see https://www.gnu.org/software/coreutils/",
            &options,
            true,
        );

        let file = builder
            .name("binary.bin".to_string())
            .base_name("binary".to_string())
            .extension(".bin".to_string())
            .path("binary.bin".to_string())
            .file_type(FileType::File)
            .size(1)
            .build()
            .expect("builder should produce file info");

        assert_eq!(file.emails.len(), 1, "emails: {:?}", file.emails);
        assert_eq!(file.emails[0].email, "bug-coreutils@gnu.org");
        assert_eq!(file.urls.len(), 1, "urls: {:?}", file.urls);
        assert_eq!(file.urls[0].url, "https://www.gnu.org/software/coreutils/");
    }

    // With `max_emails: 2`, a repeated address must not use up the cap: the
    // two distinct addresses are both reported.
    #[test]
    fn test_extract_email_url_information_deduplicates_binary_emails_before_cap() {
        let mut builder = FileInfoBuilder::default();
        let options = TextDetectionOptions {
            collect_info: false,
            detect_packages: false,
            detect_application_packages: false,
            detect_system_packages: false,
            detect_packages_in_compiled: false,
            detect_copyrights: false,
            detect_generated: false,
            detect_emails: true,
            detect_urls: false,
            max_emails: 2,
            max_urls: 50,
            timeout_seconds: 120.0,
        };

        extract_email_url_information(
            &mut builder,
            "first jakub@redhat.com second jakub@redhat.com third contyk@redhat.com",
            &options,
            true,
        );

        let file = builder
            .name("binary.bin".to_string())
            .base_name("binary".to_string())
            .extension(".bin".to_string())
            .path("binary.bin".to_string())
            .file_type(FileType::File)
            .size(1)
            .build()
            .expect("builder should produce file info");

        assert_eq!(file.emails.len(), 2, "emails: {:?}", file.emails);
        assert_eq!(file.emails[0].email, "jakub@redhat.com");
        assert_eq!(file.emails[1].email, "contyk@redhat.com");
    }

    // Copyright candidate filter: noise is rejected, a real notice is kept.
    #[test]
    fn test_binary_string_copyright_candidate_rejects_gibberish_holder_text() {
        let gibberish = "(c) S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9) M0@9s J'@y DH@9Ih@y";
        assert!(!is_binary_string_copyright_candidate(gibberish));
    }

    #[test]
    fn test_binary_string_copyright_candidate_keeps_real_notice() {
        let notice = "Copyright nexB and others (c) 2012";
        assert!(is_binary_string_copyright_candidate(notice));
    }

    #[test]
    fn test_binary_string_copyright_candidate_rejects_changelog_phrase() {
        assert!(!is_binary_string_copyright_candidate(
            "Copyright - split out libs"
        ));
    }

    // Email candidate filter.
    #[test]
    fn test_binary_string_email_candidate_rejects_gibberish() {
        assert!(!is_binary_string_email_candidate("6h@fo.lwft"));
    }

    #[test]
    fn test_binary_string_email_candidate_keeps_gnu_bug_address() {
        assert!(is_binary_string_email_candidate("bug-coreutils@gnu.org"));
    }

    // URL candidate filter.
    #[test]
    fn test_binary_string_url_candidate_rejects_short_fake_host() {
        assert!(!is_binary_string_url_candidate("http://ftp.so/"));
    }

    #[test]
    fn test_binary_string_url_candidate_keeps_gnu_help_url() {
        assert!(is_binary_string_url_candidate(
            "https://www.gnu.org/software/coreutils/"
        ));
    }

    #[test]
    fn test_binary_string_url_candidate_rejects_bare_root_domain() {
        assert!(!is_binary_string_url_candidate("http://gmail.com/"));
    }

    #[test]
    fn test_binary_string_url_candidate_keeps_project_subdomain_root() {
        assert!(is_binary_string_url_candidate("http://gcc.gnu.org"));
    }

    #[test]
    fn test_binary_string_url_candidate_keeps_long_org_root_domain() {
        assert!(is_binary_string_url_candidate("https://publicsuffix.org/"));
    }

    #[test]
    fn test_binary_string_url_candidate_keeps_short_project_path() {
        assert!(is_binary_string_url_candidate("http://tukaani.org/xz/"));
    }

    // Author candidate filter.
    #[test]
    fn test_binary_string_author_candidate_keeps_named_author_with_email() {
        assert!(is_binary_string_author_candidate(
            "Andreas Schneider <asn@redhat.com>"
        ));
    }

    #[test]
    fn test_binary_string_author_candidate_rejects_gibberish() {
        assert!(!is_binary_string_author_candidate(
            "S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9"
        ));
    }

    #[test]
    fn test_binary_string_author_candidate_rejects_changelog_phrase() {
        assert!(!is_binary_string_author_candidate(
            "Developers can enable them. - revert news user back to"
        ));
    }

    // Named-author extraction from a single binary-string line.
    #[test]
    fn test_extract_named_author_from_binary_line_recovers_by_prefix() {
        assert_eq!(
            extract_named_author_from_binary_line("Patch by Andreas Schneider <asn@redhat.com>"),
            Some("Andreas Schneider <asn@redhat.com>".to_string())
        );
    }

    #[test]
    fn test_extract_named_author_from_binary_line_recovers_parenthesized_email() {
        assert_eq!(
            extract_named_author_from_binary_line(
                "same for both OpenSSL and NSS by Rob Crittenden (rcritten@redhat.com)"
            ),
            Some("Rob Crittenden (rcritten@redhat.com)".to_string())
        );
    }

    #[test]
    fn test_extract_named_author_from_binary_line_rejects_plain_changelog_packager_line() {
        assert_eq!(
            extract_named_author_from_binary_line(
                "Rob Crittenden <rcritten@redhat.com> - 3.11.7-9"
            ),
            None
        );
    }

    #[test]
    fn test_extract_named_author_from_binary_line_keeps_email_only_review_author() {
        assert_eq!(
            extract_named_author_from_binary_line(
                "Changes as per initial review by panemade@gmail.com"
            ),
            Some("panemade@gmail.com".to_string())
        );
    }

    #[test]
    fn test_binary_string_author_candidate_rejects_multiple_emails_on_one_line() {
        assert!(!is_binary_string_author_candidate(
            "Rob Crittenden (rcritten@redhat.com) jakub@redhat.com"
        ));
    }

    // Only position 1 is marked as matched in the three-word text; the
    // expected result is 33.33.
    #[test]
    fn test_compute_percentage_of_license_text_counts_unknown_tokens() {
        let index = create_test_index(&[("alpha", 0), ("mit", 1)], 2);
        let text = "alpha MIT omega";
        let query = Query::from_extracted_text(text, &index, false).expect("query should build");
        let mut detection = make_detection("");
        detection.matches[0].coordinates =
            MatchCoordinates::query_region(PositionSpan::from_positions(vec![1]));
        detection.matches[0].start_token = 1;
        detection.matches[0].end_token = 2;

        let percentage = compute_percentage_of_license_text(&query, &[detection]);

        assert_eq!(percentage, 33.33);
    }

    // Changing only `min_score` must produce a different fingerprint.
    #[test]
    fn test_scan_options_fingerprint_changes_with_license_score() {
        let text_options = crate::scanner::TextDetectionOptions::default();
        let default_fingerprint = scan_options_fingerprint(
            &text_options,
            LicenseScanOptions {
                min_score: 0,
                ..LicenseScanOptions::default()
            },
            None,
        );
        let filtered_fingerprint = scan_options_fingerprint(
            &text_options,
            LicenseScanOptions {
                min_score: 70,
                ..LicenseScanOptions::default()
            },
            None,
        );

        assert_ne!(default_fingerprint, filtered_fingerprint);
    }

    // Go non-production detection: by file name, by build tag, and the negative case.
    #[test]
    fn test_is_go_non_production_source_for_test_filename() {
        let temp_dir = tempdir().unwrap();
        let path = temp_dir.path().join("scanner_test.go");
        fs::write(&path, "package scanner\n").unwrap();

        assert!(is_go_non_production_source(&path).unwrap());
    }

    #[test]
    fn test_is_go_non_production_source_for_build_tag() {
        let temp_dir = tempdir().unwrap();
        let path = temp_dir.path().join("scanner.go");
        fs::write(&path, "//go:build test\n\npackage scanner\n").unwrap();

        assert!(is_go_non_production_source(&path).unwrap());
    }

    #[test]
    fn test_is_go_non_production_source_for_regular_go_file() {
        let temp_dir = tempdir().unwrap();
        let path = temp_dir.path().join("scanner.go");
        fs::write(&path, "package scanner\n").unwrap();

        assert!(!is_go_non_production_source(&path).unwrap());
    }
}