1use crate::license_detection::LicenseDetectionEngine;
2use crate::parsers::{try_parse_compiled_bytes, try_parse_file};
3use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha1_git, calculate_sha256};
4use crate::utils::text::{
5 remove_verbatim_escape_sequences, should_remove_verbatim_escape_sequences,
6};
7use anyhow::Error;
8use rayon::prelude::*;
9use std::collections::HashSet;
10use std::fs::{self, File};
11use std::io::{Read, Write};
12use std::path::Path;
13use std::sync::Arc;
14use std::time::{Duration, Instant};
15
16use crate::copyright::{
17 self, AuthorDetection, CopyrightDetection, CopyrightDetectionOptions, HolderDetection,
18};
19use crate::finder::{self, DetectionConfig};
20use crate::license_detection::PositionSet;
21use crate::license_detection::models::LicenseMatch as InternalLicenseMatch;
22use crate::license_detection::query::Query;
23use crate::models::{
24 Author, Copyright, DatasourceId, FileInfo, FileInfoBuilder, FileType, Holder, LicenseDetection,
25 Match, OutputEmail, OutputURL,
26};
27use crate::parsers::utils::split_name_email;
28use crate::progress::ScanProgress;
29use crate::scanner::collect::CollectedPaths;
30use crate::scanner::{LicenseScanOptions, ProcessResult, TextDetectionOptions};
31use crate::utils::file::{
32 ExtractedTextKind, classify_file_info, extract_text_for_detection, get_creation_date,
33};
34use crate::utils::generated::generated_code_hints_from_bytes;
35use tempfile::TempDir;
36
/// Begin/end marker pairs that delimit PEM-encoded certificate blocks.
///
/// Each entry is a `(header, footer)` tuple.
// NOTE(review): no use of this table is visible in this part of the file;
// presumably it feeds the text-detection skip check — confirm at the use site.
const PEM_CERTIFICATE_HEADERS: &[(&str, &str)] = &[
    ("-----BEGIN CERTIFICATE-----", "-----END CERTIFICATE-----"),
    (
        "-----BEGIN TRUSTED CERTIFICATE-----",
        "-----END TRUSTED CERTIFICATE-----",
    ),
];
44
45pub fn process_collected(
46 collected: &CollectedPaths,
47 progress: Arc<ScanProgress>,
48 license_engine: Option<Arc<LicenseDetectionEngine>>,
49 license_options: LicenseScanOptions,
50 text_options: &TextDetectionOptions,
51) -> ProcessResult {
52 let mut all_files: Vec<FileInfo> = collected
53 .files
54 .par_iter()
55 .map(|(path, metadata)| {
56 let file_entry = process_file(
57 path,
58 metadata,
59 progress.as_ref(),
60 license_engine.clone(),
61 license_options,
62 text_options,
63 );
64 progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
65 file_entry
66 })
67 .collect();
68
69 for (path, metadata) in &collected.directories {
70 all_files.push(process_directory(
71 path,
72 metadata,
73 text_options.collect_info,
74 license_engine.is_some(),
75 ));
76 }
77
78 ProcessResult {
79 files: all_files,
80 excluded_count: collected.excluded_count,
81 }
82}
83
84pub fn process_collected_with_memory_limit(
85 collected: &CollectedPaths,
86 progress: Arc<ScanProgress>,
87 license_engine: Option<Arc<LicenseDetectionEngine>>,
88 license_options: LicenseScanOptions,
89 text_options: &TextDetectionOptions,
90 max_in_memory: i64,
91) -> ProcessResult {
92 if max_in_memory == 0 {
93 return process_collected(
94 collected,
95 progress,
96 license_engine,
97 license_options,
98 text_options,
99 );
100 }
101
102 let memory_limit = if max_in_memory < 0 {
103 0
104 } else {
105 max_in_memory as usize
106 };
107 let chunk_size = if max_in_memory < 0 {
108 256
109 } else {
110 memory_limit.max(1)
111 };
112
113 let mut retained_files = Vec::new();
114 let mut spill_store = None;
115
116 for chunk in collected.files.chunks(chunk_size) {
117 let processed_chunk: Vec<FileInfo> = chunk
118 .par_iter()
119 .map(|(path, metadata)| {
120 let file_entry = process_file(
121 path,
122 metadata,
123 progress.as_ref(),
124 license_engine.clone(),
125 license_options,
126 text_options,
127 );
128 progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
129 file_entry
130 })
131 .collect();
132
133 retain_or_spill_chunk(
134 processed_chunk,
135 &mut retained_files,
136 &mut spill_store,
137 memory_limit,
138 );
139 }
140
141 for (path, metadata) in &collected.directories {
142 let entry = process_directory(
143 path,
144 metadata,
145 text_options.collect_info,
146 license_engine.is_some(),
147 );
148 retain_or_spill_chunk(
149 vec![entry],
150 &mut retained_files,
151 &mut spill_store,
152 memory_limit,
153 );
154 }
155
156 if let Some(spill_store) = spill_store {
157 retained_files.extend(spill_store.load_all());
158 }
159
160 ProcessResult {
161 files: retained_files,
162 excluded_count: collected.excluded_count,
163 }
164}
165
166fn retain_or_spill_chunk(
167 chunk: Vec<FileInfo>,
168 retained_files: &mut Vec<FileInfo>,
169 spill_store: &mut Option<FileInfoSpillStore>,
170 memory_limit: usize,
171) {
172 if memory_limit == 0 {
173 spill_store
174 .get_or_insert_with(FileInfoSpillStore::new)
175 .spill(chunk);
176 return;
177 }
178
179 let remaining_capacity = memory_limit.saturating_sub(retained_files.len());
180 if remaining_capacity >= chunk.len() && spill_store.is_none() {
181 retained_files.extend(chunk);
182 return;
183 }
184
185 let mut chunk_iter = chunk.into_iter();
186 retained_files.extend(chunk_iter.by_ref().take(remaining_capacity));
187 let overflow: Vec<FileInfo> = chunk_iter.collect();
188 if !overflow.is_empty() {
189 spill_store
190 .get_or_insert_with(FileInfoSpillStore::new)
191 .spill(overflow);
192 }
193}
194
195struct FileInfoSpillStore {
196 temp_dir: TempDir,
197 batch_index: usize,
198}
199
200impl FileInfoSpillStore {
201 fn new() -> Self {
202 Self {
203 temp_dir: TempDir::new().expect("create spill dir"),
204 batch_index: 0,
205 }
206 }
207
208 fn spill(&mut self, files: Vec<FileInfo>) {
209 let path = self
210 .temp_dir
211 .path()
212 .join(format!("batch-{:06}.json.zst", self.batch_index));
213 self.batch_index += 1;
214
215 let payload = serde_json::to_vec(&files).expect("encode spilled file batch");
216 let file = File::create(path).expect("create spill batch file");
217 let mut encoder = zstd::Encoder::new(file, 3).expect("create spill encoder");
218 encoder
219 .write_all(&payload)
220 .expect("write spilled file batch");
221 encoder.finish().expect("finish spill encoder");
222 }
223
224 fn load_all(self) -> Vec<FileInfo> {
225 let mut paths: Vec<_> = fs::read_dir(self.temp_dir.path())
226 .expect("read spill dir")
227 .filter_map(Result::ok)
228 .map(|entry| entry.path())
229 .collect();
230 paths.sort();
231
232 let mut files = Vec::new();
233 for path in paths {
234 let file = File::open(path).expect("open spill batch");
235 let mut decoder = zstd::Decoder::new(file).expect("create spill decoder");
236 let mut payload = Vec::new();
237 decoder.read_to_end(&mut payload).expect("read spill batch");
238 let mut batch: Vec<FileInfo> =
239 serde_json::from_slice(&payload).expect("decode spilled file batch");
240 files.append(&mut batch);
241 }
242 files
243 }
244}
245
246fn process_file(
247 path: &Path,
248 metadata: &fs::Metadata,
249 progress: &ScanProgress,
250 license_engine: Option<Arc<LicenseDetectionEngine>>,
251 license_options: LicenseScanOptions,
252 text_options: &TextDetectionOptions,
253) -> FileInfo {
254 let mut scan_errors: Vec<String> = vec![];
255 let mut file_info_builder = FileInfoBuilder::default();
256 let license_enabled = license_engine.is_some();
257
258 let started = Instant::now();
259
260 let mut generated_flag = None;
261 let mut is_source_file = false;
262 match extract_information_from_content(
263 &mut file_info_builder,
264 &mut scan_errors,
265 path,
266 progress,
267 license_engine,
268 license_options,
269 text_options,
270 ) {
271 Ok((is_generated, sha256, is_source)) => {
272 generated_flag = is_generated;
273 is_source_file = is_source;
274 let _ = sha256;
275 }
276 Err(e) => scan_errors.push(e.to_string()),
277 };
278
279 if is_timeout_exceeded(started, text_options.timeout_seconds) {
280 scan_errors.push(format!(
281 "Processing interrupted due to timeout after {:.2} seconds",
282 text_options.timeout_seconds
283 ));
284 }
285
286 let mut file_info = file_info_builder
287 .name(path.file_name().unwrap().to_string_lossy().to_string())
288 .base_name(
289 path.file_stem()
290 .unwrap_or_default()
291 .to_string_lossy()
292 .to_string(),
293 )
294 .extension(
295 path.extension()
296 .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
297 )
298 .path(path.to_string_lossy().to_string())
299 .file_type(FileType::File)
300 .size(metadata.len())
301 .date(
302 text_options
303 .collect_info
304 .then(|| get_creation_date(metadata))
305 .flatten(),
306 )
307 .scan_errors(scan_errors)
308 .build()
309 .expect("FileInformationBuild not completely initialized");
310
311 if text_options.collect_info {
312 file_info.is_source = Some(is_source_file);
313 }
314
315 if file_info.programming_language.as_deref() == Some("Go")
316 && is_go_non_production_source(path).unwrap_or(false)
317 {
318 file_info.is_source = Some(false);
319 }
320
321 if text_options.detect_generated {
322 file_info.is_generated = Some(generated_flag.unwrap_or(false));
323 }
324
325 if file_info.percentage_of_license_text.is_none() && license_enabled {
326 file_info.percentage_of_license_text = Some(0.0);
327 }
328
329 file_info
330}
331
332fn extract_information_from_content(
333 file_info_builder: &mut FileInfoBuilder,
334 scan_errors: &mut Vec<String>,
335 path: &Path,
336 progress: &ScanProgress,
337 license_engine: Option<Arc<LicenseDetectionEngine>>,
338 license_options: LicenseScanOptions,
339 text_options: &TextDetectionOptions,
340) -> Result<(Option<bool>, String, bool), Error> {
341 let started = Instant::now();
342 let buffer = fs::read(path)?;
343 let license_enabled = license_engine.is_some();
344
345 if is_timeout_exceeded(started, text_options.timeout_seconds) {
346 return Err(Error::msg(format!(
347 "Timeout while reading file content (> {:.2}s)",
348 text_options.timeout_seconds
349 )));
350 }
351
352 let sha256 = calculate_sha256(&buffer);
353 let is_generated = text_options
354 .detect_generated
355 .then(|| !generated_code_hints_from_bytes(&buffer).is_empty());
356 let classification = classify_file_info(path, &buffer);
357
358 if text_options.collect_info {
359 file_info_builder
360 .sha1(Some(calculate_sha1(&buffer)))
361 .md5(Some(calculate_md5(&buffer)))
362 .sha256(Some(sha256.clone()))
363 .programming_language(classification.programming_language.clone())
364 .mime_type(Some(classification.mime_type.clone()))
365 .file_type_label(Some(classification.file_type.clone()))
366 .sha1_git(Some(calculate_sha1_git(&buffer)))
367 .is_binary(Some(classification.is_binary))
368 .is_text(Some(classification.is_text))
369 .is_archive(Some(classification.is_archive))
370 .is_media(Some(classification.is_media))
371 .is_source(Some(classification.is_source))
372 .is_script(Some(classification.is_script))
373 .files_count(Some(0))
374 .dirs_count(Some(0))
375 .size_count(Some(0));
376 }
377
378 if should_skip_text_detection(path, &buffer) {
379 return Ok((is_generated, sha256, classification.is_source));
380 }
381
382 if text_options.detect_packages {
385 let started = Instant::now();
386 let parse_result = try_parse_file(path).or_else(|| {
387 text_options
388 .detect_packages_in_compiled
389 .then(|| try_parse_compiled_bytes(&buffer))
390 .flatten()
391 });
392
393 if let Some(parse_result) = parse_result {
394 let packages = parse_result
395 .packages
396 .into_iter()
397 .filter(|package| {
398 let is_compiled_package = package
399 .datasource_id
400 .as_ref()
401 .is_some_and(is_compiled_datasource);
402 let is_system_package = package
403 .datasource_id
404 .as_ref()
405 .is_some_and(is_system_datasource);
406 if is_compiled_package {
407 text_options.detect_packages_in_compiled
408 } else if is_system_package {
409 text_options.detect_system_packages
410 } else {
411 text_options.detect_application_packages
412 }
413 })
414 .collect();
415 file_info_builder.package_data(packages);
416 scan_errors.extend(parse_result.scan_errors);
417 }
418 progress.record_detail_timing("scan:packages", started.elapsed().as_secs_f64());
419 }
420
421 if is_timeout_exceeded(started, text_options.timeout_seconds) {
422 return Err(Error::msg(format!(
423 "Timeout while extracting package/text metadata (> {:.2}s)",
424 text_options.timeout_seconds
425 )));
426 }
427
428 let (text_content, text_kind) = extract_text_for_detection(path, &buffer);
429 let from_binary_strings = matches!(text_kind, ExtractedTextKind::BinaryStrings);
430
431 if is_timeout_exceeded(started, text_options.timeout_seconds) {
432 return Err(Error::msg(format!(
433 "Timeout while extracting text content (> {:.2}s)",
434 text_options.timeout_seconds
435 )));
436 }
437
438 if text_content.is_empty() {
439 return Ok((is_generated, sha256, classification.is_source));
440 }
441
442 if text_options.detect_copyrights {
443 extract_copyright_information(
444 file_info_builder,
445 path,
446 &text_content,
447 text_options.timeout_seconds,
448 from_binary_strings,
449 );
450 }
451 extract_email_url_information(
452 file_info_builder,
453 &text_content,
454 text_options,
455 from_binary_strings,
456 );
457
458 if is_timeout_exceeded(started, text_options.timeout_seconds) {
459 return Err(Error::msg(format!(
460 "Timeout before license scan (> {:.2}s)",
461 text_options.timeout_seconds
462 )));
463 }
464 let text_content_for_license_detection = if crate::utils::sourcemap::is_sourcemap(path) {
466 if let Some(sourcemap_content) =
467 crate::utils::sourcemap::extract_sourcemap_content(&text_content)
468 {
469 sourcemap_content
470 } else {
471 text_content
472 }
473 } else if should_remove_verbatim_escape_sequences(path, classification.is_source) {
474 remove_verbatim_escape_sequences(&text_content)
475 } else {
476 text_content
477 };
478
479 if license_enabled {
480 let started = Instant::now();
481 extract_license_information(
482 file_info_builder,
483 scan_errors,
484 path,
485 text_content_for_license_detection,
486 license_engine,
487 license_options,
488 from_binary_strings,
489 )?;
490 progress.record_detail_timing("scan:licenses", started.elapsed().as_secs_f64());
491 } else {
492 extract_license_information(
493 file_info_builder,
494 scan_errors,
495 path,
496 text_content_for_license_detection,
497 license_engine,
498 license_options,
499 from_binary_strings,
500 )?;
501 }
502
503 Ok((is_generated, sha256, classification.is_source))
504}
505
/// Returns `true` when more than `timeout_seconds` have elapsed since `started`.
///
/// A timeout that is not finite or not strictly positive disables the check,
/// so the function then always returns `false`.
fn is_timeout_exceeded(started: Instant, timeout_seconds: f64) -> bool {
    if !timeout_seconds.is_finite() || timeout_seconds <= 0.0 {
        return false;
    }
    let elapsed_seconds = started.elapsed().as_secs_f64();
    elapsed_seconds > timeout_seconds
}
511
/// Returns `true` when `datasource_id` is one of the installed-package
/// database sources listed below (Alpine, Debian, FreeBSD, RPM variants).
///
/// The package filter in this file uses it to decide whether
/// `detect_system_packages` governs a detected package.
fn is_system_datasource(datasource_id: &DatasourceId) -> bool {
    matches!(
        datasource_id,
        DatasourceId::AlpineInstalledDb
            | DatasourceId::DebianDistrolessInstalledDb
            | DatasourceId::DebianInstalledFilesList
            | DatasourceId::DebianInstalledMd5Sums
            | DatasourceId::DebianInstalledStatusDb
            | DatasourceId::FreebsdCompactManifest
            | DatasourceId::RpmInstalledDatabaseBdb
            | DatasourceId::RpmInstalledDatabaseNdb
            | DatasourceId::RpmInstalledDatabaseSqlite
            | DatasourceId::RpmYumdb
    )
}
527
/// Returns `true` when `datasource_id` identifies package data taken from a
/// compiled Go or Rust binary.
///
/// The package filter in this file uses it to decide whether
/// `detect_packages_in_compiled` governs a detected package.
fn is_compiled_datasource(datasource_id: &DatasourceId) -> bool {
    matches!(
        datasource_id,
        DatasourceId::GoBinary | DatasourceId::RustBinary
    )
}
534
535fn extract_copyright_information(
536 file_info_builder: &mut FileInfoBuilder,
537 path: &Path,
538 text_content: &str,
539 timeout_seconds: f64,
540 from_binary_strings: bool,
541) {
542 if copyright::is_credits_file(path) {
544 let author_detections = copyright::detect_credits_authors(text_content);
545 if !author_detections.is_empty() {
546 file_info_builder.authors(
547 author_detections
548 .into_iter()
549 .map(|a| Author {
550 author: a.author,
551 start_line: a.start_line,
552 end_line: a.end_line,
553 })
554 .collect(),
555 );
556 return;
557 }
558 }
559
560 let copyright_options = CopyrightDetectionOptions {
561 max_runtime: if timeout_seconds.is_finite() && timeout_seconds > 0.0 {
562 Some(Duration::from_secs_f64(timeout_seconds))
563 } else {
564 None
565 },
566 ..CopyrightDetectionOptions::default()
567 };
568
569 let (copyrights, holders, authors) =
570 copyright::detect_copyrights_with_options(text_content, ©right_options);
571 let (copyrights, holders, authors) = if from_binary_strings {
572 prune_binary_string_detections(text_content, copyrights, holders, authors)
573 } else {
574 (copyrights, holders, authors)
575 };
576
577 file_info_builder.copyrights(
578 copyrights
579 .into_iter()
580 .map(|c| Copyright {
581 copyright: c.copyright,
582 start_line: c.start_line,
583 end_line: c.end_line,
584 })
585 .collect::<Vec<Copyright>>(),
586 );
587 file_info_builder.holders(
588 holders
589 .into_iter()
590 .map(|h| Holder {
591 holder: h.holder,
592 start_line: h.start_line,
593 end_line: h.end_line,
594 })
595 .collect::<Vec<Holder>>(),
596 );
597 file_info_builder.authors(
598 authors
599 .into_iter()
600 .map(|a| Author {
601 author: a.author,
602 start_line: a.start_line,
603 end_line: a.end_line,
604 })
605 .collect::<Vec<Author>>(),
606 );
607}
608
609fn prune_binary_string_detections(
610 text_content: &str,
611 copyrights: Vec<CopyrightDetection>,
612 holders: Vec<HolderDetection>,
613 authors: Vec<AuthorDetection>,
614) -> (
615 Vec<CopyrightDetection>,
616 Vec<HolderDetection>,
617 Vec<AuthorDetection>,
618) {
619 let kept_copyrights: Vec<CopyrightDetection> = copyrights
620 .into_iter()
621 .filter(|c| is_binary_string_copyright_candidate(&c.copyright))
622 .collect();
623
624 let kept_holders: Vec<HolderDetection> = holders
625 .into_iter()
626 .filter(|holder| {
627 kept_copyrights.iter().any(|copyright| {
628 ranges_overlap(
629 holder.start_line,
630 holder.end_line,
631 copyright.start_line,
632 copyright.end_line,
633 )
634 })
635 })
636 .collect();
637
638 let kept_authors = authors
639 .into_iter()
640 .filter(|author| is_binary_string_author_candidate(&author.author))
641 .chain(extract_binary_string_author_supplements(text_content))
642 .filter({
643 let mut seen = HashSet::new();
644 move |author| seen.insert(author.author.clone())
645 })
646 .collect();
647
648 (kept_copyrights, kept_holders, kept_authors)
649}
650
/// Returns `true` when the inclusive ranges `[a_start, a_end]` and
/// `[b_start, b_end]` share at least one value.
fn ranges_overlap(a_start: usize, a_end: usize, b_start: usize, b_end: usize) -> bool {
    let a_starts_after_b = a_start > b_end;
    let b_starts_after_a = b_start > a_end;
    !(a_starts_after_b || b_starts_after_a)
}
654
655fn is_binary_string_copyright_candidate(text: &str) -> bool {
656 if contains_year(text) {
657 return true;
658 }
659
660 let trimmed = text.trim();
661 let lower = trimmed.to_ascii_lowercase();
662 let tail = if let Some(tail) = lower.strip_prefix("copyright") {
663 tail.trim()
664 } else {
665 lower.trim()
666 };
667 let original_tail = if lower.starts_with("copyright") {
668 trimmed["copyright".len()..].trim()
669 } else {
670 trimmed
671 };
672
673 if tail.is_empty() || !has_sufficient_alphabetic_content(tail) || has_excessive_at_noise(tail) {
674 return false;
675 }
676
677 let alpha_tokens: Vec<&str> = tail
678 .split_whitespace()
679 .filter(|token| token.chars().any(|c| c.is_alphabetic()))
680 .collect();
681
682 if alpha_tokens.len() <= 1 {
683 return has_explicit_copyright_marker(text)
684 && alpha_tokens.iter().any(|token| {
685 is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric()))
686 });
687 }
688
689 if !has_explicit_copyright_marker(text) {
690 return false;
691 }
692
693 has_binary_name_like_shape(original_tail)
694}
695
696fn extract_binary_string_author_supplements(text_content: &str) -> Vec<AuthorDetection> {
697 let mut authors = Vec::new();
698
699 for (line_index, line) in text_content.lines().enumerate() {
700 if let Some(author) = extract_named_author_from_binary_line(line) {
701 authors.push(AuthorDetection {
702 author,
703 start_line: line_index + 1,
704 end_line: line_index + 1,
705 });
706 }
707 }
708
709 authors
710}
711
712fn extract_named_author_from_binary_line(line: &str) -> Option<String> {
713 let line = line.trim();
714 if line.is_empty() {
715 return None;
716 }
717
718 let emails = finder::find_emails(
719 line,
720 &DetectionConfig {
721 max_emails: 4,
722 max_urls: 0,
723 unique: false,
724 },
725 );
726 let email = emails.first()?.email.as_str();
727 if !is_binary_string_email_candidate(email) {
728 return None;
729 }
730
731 let lower_line = line.to_ascii_lowercase();
732 let email_start = lower_line.find(email)?;
733 let raw_prefix = &line[..email_start];
734 let has_author_marker = contains_binary_author_marker(raw_prefix);
735 let prefix = take_suffix_after_last_author_marker(raw_prefix)?;
736 let prefix = prefix
737 .trim_start_matches(['*', '-', ':', ';', ',', '.', ' '])
738 .trim_end_matches(['<', '(', '[', ' ', ':', '-'])
739 .trim();
740
741 let (name, _) = split_name_email(prefix);
742 let name = name.or_else(|| {
743 let trimmed = prefix.trim_matches(|c: char| c == '<' || c == '(' || c == '[' || c == ' ');
744 (!trimmed.is_empty()).then(|| trimmed.to_string())
745 });
746
747 let Some(name) = name.map(|name| name.trim().to_string()) else {
748 if has_author_marker {
749 return Some(email.to_string());
750 }
751 return None;
752 };
753
754 if name.is_empty() && has_author_marker {
755 return Some(email.to_string());
756 }
757
758 if !has_binary_name_like_shape(&name) {
759 return None;
760 }
761
762 if line.contains(&format!("<{email}>")) {
763 Some(format!("{name} <{email}>"))
764 } else if line.contains(&format!("({email})")) {
765 Some(format!("{name} ({email})"))
766 } else {
767 Some(format!("{name} {email}"))
768 }
769}
770
/// Returns the trimmed text following the last case-insensitive (ASCII)
/// occurrence of `marker` in `text`, or `None` when the marker is absent.
///
/// The search runs on an ASCII-lower-cased copy of `text`, so `marker` only
/// matches if it is itself lowercase.
fn take_suffix_after_last_ascii_marker<'a>(text: &'a str, marker: &str) -> Option<&'a str> {
    let marker_start = text.to_ascii_lowercase().rfind(marker)?;
    // ASCII lowercasing keeps byte offsets, so the cut point is valid in `text`.
    let (_, suffix) = text.split_at(marker_start + marker.len());
    Some(suffix.trim())
}
776
777fn take_suffix_after_last_author_marker(text: &str) -> Option<&str> {
778 const MARKERS: &[&str] = &[
779 " patch author: ",
780 " patch author ",
781 " written by ",
782 " contributed by ",
783 " original work done by ",
784 " work done by ",
785 " thanks to ",
786 " review by ",
787 " by ",
788 " from ",
789 ];
790
791 MARKERS
792 .iter()
793 .filter_map(|marker| take_suffix_after_last_ascii_marker(text, marker))
794 .next()
795}
796
/// Returns `true` when `text` contains any of the author markers recognised
/// by `take_suffix_after_last_author_marker`.
fn contains_binary_author_marker(text: &str) -> bool {
    take_suffix_after_last_author_marker(text).is_some()
}
800
801fn has_binary_name_like_shape(text: &str) -> bool {
802 let trimmed = text.trim();
803 if trimmed.is_empty() || trimmed.contains(" - ") || trimmed.chars().any(|c| c.is_ascii_digit())
804 {
805 return false;
806 }
807
808 let tokens: Vec<&str> = trimmed
809 .split(|c: char| !c.is_ascii_alphabetic() && c != '.' && c != '\'')
810 .filter(|segment| segment.chars().any(|c| c.is_ascii_alphabetic()))
811 .collect();
812 if tokens.is_empty() {
813 return false;
814 }
815
816 let uppercase_like = tokens
817 .iter()
818 .filter(|token| {
819 let token = token.trim_matches('.');
820 token
821 .chars()
822 .find(|c| c.is_ascii_alphabetic())
823 .is_some_and(|c| c.is_ascii_uppercase())
824 })
825 .count();
826
827 uppercase_like >= 2 && uppercase_like * 2 >= tokens.len()
828 || tokens
829 .iter()
830 .any(|token| is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric())))
831}
832
/// Returns `true` when `text` contains at least one ASCII alphanumeric
/// character and at least half of its ASCII alphanumerics are letters.
fn has_sufficient_alphabetic_content(text: &str) -> bool {
    let mut letters = 0usize;
    let mut alphanumerics = 0usize;
    for c in text.chars() {
        if c.is_ascii_alphabetic() {
            letters += 1;
            alphanumerics += 1;
        } else if c.is_ascii_digit() {
            alphanumerics += 1;
        }
    }

    alphanumerics != 0 && letters * 2 >= alphanumerics
}
842
/// Returns `true` when `text` contains three or more `@` characters.
fn has_excessive_at_noise(text: &str) -> bool {
    // Finding a third occurrence is enough; no need to count them all.
    text.matches('@').nth(2).is_some()
}
846
/// Returns `true` when `text` contains `(c)`, `©` or `copr`, ignoring ASCII case.
fn has_explicit_copyright_marker(text: &str) -> bool {
    const MARKERS: [&str; 3] = ["(c)", "©", "copr"];

    let lowered = text.to_ascii_lowercase();
    MARKERS.iter().any(|marker| lowered.contains(*marker))
}
851
/// Returns `true` when `text` contains four consecutive ASCII digits that form
/// a year in the range 1900–2099 (`19xx` or `20xx`).
///
/// The century digits are matched as a pair. Checking them independently
/// ("first is 1 or 2, second is 9 or 0") would also accept `10xx` and `29xx`,
/// so numbers such as `1024` would pass as years.
fn contains_year(text: &str) -> bool {
    text.as_bytes().windows(4).any(|window| {
        matches!(window, [b'1', b'9', _, _] | [b'2', b'0', _, _])
            && window[2..].iter().all(u8::is_ascii_digit)
    })
}
860
/// Returns `true` when `token` equals, ignoring ASCII case, one of the
/// organisation-style words listed below ("inc", "ltd", "foundation", …).
fn is_company_like_suffix(token: &str) -> bool {
    const SUFFIXES: &[&str] = &[
        "inc",
        "corp",
        "corporation",
        "co",
        "company",
        "ltd",
        "llc",
        "gmbh",
        "foundation",
        "project",
        "systems",
        "software",
        "technologies",
        "technology",
    ];

    SUFFIXES
        .iter()
        .any(|suffix| token.eq_ignore_ascii_case(suffix))
}
880
881fn extract_email_url_information(
882 file_info_builder: &mut FileInfoBuilder,
883 text_content: &str,
884 text_options: &TextDetectionOptions,
885 from_binary_strings: bool,
886) {
887 if !text_options.detect_emails && !text_options.detect_urls {
888 return;
889 }
890
891 if text_options.detect_emails {
892 let config = DetectionConfig {
893 max_emails: text_options.max_emails,
894 max_urls: text_options.max_urls,
895 unique: from_binary_strings,
896 };
897 let emails = finder::find_emails(text_content, &config)
898 .into_iter()
899 .filter(|d| !from_binary_strings || is_binary_string_email_candidate(&d.email))
900 .map(|d| OutputEmail {
901 email: d.email,
902 start_line: d.start_line,
903 end_line: d.end_line,
904 })
905 .collect::<Vec<_>>();
906 file_info_builder.emails(emails);
907 }
908
909 if text_options.detect_urls {
910 let config = DetectionConfig {
911 max_emails: text_options.max_emails,
912 max_urls: text_options.max_urls,
913 unique: true,
914 };
915 let urls = finder::find_urls(text_content, &config)
916 .into_iter()
917 .filter(|d| !from_binary_strings || is_binary_string_url_candidate(&d.url))
918 .map(|d| OutputURL {
919 url: d.url,
920 start_line: d.start_line,
921 end_line: d.end_line,
922 })
923 .collect::<Vec<_>>();
924 file_info_builder.urls(urls);
925 }
926}
927
928fn is_binary_string_email_candidate(email: &str) -> bool {
929 let Some((local, domain)) = email.rsplit_once('@') else {
930 return false;
931 };
932
933 if !has_strong_binary_local_part(local) {
934 return false;
935 }
936
937 has_strong_binary_host_shape(domain)
938}
939
940fn is_binary_string_url_candidate(url: &str) -> bool {
941 let parsed = url::Url::parse(url).ok();
942 let Some(parsed) = parsed else {
943 return false;
944 };
945 let Some(host) = parsed.host_str() else {
946 return false;
947 };
948
949 has_strong_binary_host_shape(host) && has_meaningful_binary_url_context(&parsed)
950}
951
952fn is_binary_string_author_candidate(author: &str) -> bool {
953 let trimmed = author.trim();
954 if trimmed.is_empty()
955 || !has_sufficient_alphabetic_content(trimmed)
956 || has_excessive_at_noise(trimmed)
957 {
958 return false;
959 }
960
961 if trimmed.contains('@') {
962 let emails = finder::find_emails(
963 trimmed,
964 &DetectionConfig {
965 max_emails: 4,
966 max_urls: 0,
967 unique: true,
968 },
969 );
970 if emails.len() > 1 {
971 return false;
972 }
973
974 if let Some(extracted) = extract_named_author_from_binary_line(trimmed) {
975 return !extracted.is_empty();
976 }
977
978 let Some(email) = emails.first().map(|d| d.email.as_str()) else {
979 return false;
980 };
981 if !is_binary_string_email_candidate(email) {
982 return false;
983 }
984
985 let (name, _) = split_name_email(trimmed);
986 return name.as_deref().is_some_and(has_binary_name_like_shape);
987 }
988
989 has_binary_name_like_shape(trimmed)
990}
991
992fn has_meaningful_binary_url_context(parsed: &url::Url) -> bool {
993 if parsed.path() != "/"
994 && parsed
995 .path()
996 .split('/')
997 .any(|segment| segment.chars().any(|c| c.is_ascii_alphabetic()) && segment.len() >= 2)
998 {
999 return true;
1000 }
1001
1002 if parsed.query().is_some() || parsed.fragment().is_some() {
1003 return true;
1004 }
1005
1006 let Some(host) = parsed.host_str() else {
1007 return false;
1008 };
1009
1010 let labels: Vec<&str> = host.split('.').collect();
1011 if labels.len() > 2 {
1012 return labels[..labels.len() - 1].iter().any(|label| {
1013 label.len() >= 3 && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 3
1014 });
1015 }
1016
1017 if matches!(labels.first(), Some(&"www")) {
1018 return true;
1019 }
1020
1021 if labels.len() == 2 {
1022 let domain = labels[0];
1023 let tld = labels[1];
1024 if domain.len() >= 8 && matches!(tld, "org" | "edu" | "gov" | "mil" | "io" | "dev") {
1025 return true;
1026 }
1027 }
1028
1029 labels
1030 .iter()
1031 .take(labels.len().saturating_sub(1))
1032 .any(|label| {
1033 label.contains('-') && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 4
1034 })
1035}
1036
/// Returns `true` when `local` contains a run of at least three consecutive
/// ASCII letters.
fn has_strong_binary_local_part(local: &str) -> bool {
    let mut run_length = 0usize;
    for c in local.chars() {
        if c.is_ascii_alphabetic() {
            run_length += 1;
            if run_length >= 3 {
                return true;
            }
        } else {
            run_length = 0;
        }
    }
    false
}
1042
1043fn has_strong_binary_host_shape(host: &str) -> bool {
1044 let labels: Vec<&str> = host.split('.').collect();
1045 if labels.len() < 2 {
1046 return false;
1047 }
1048
1049 let relevant = if matches!(labels.first(), Some(&"www" | &"ftp")) {
1050 &labels[1..]
1051 } else {
1052 &labels[..]
1053 };
1054
1055 if relevant.len() < 2 {
1056 return false;
1057 }
1058
1059 relevant[..relevant.len() - 1].iter().any(|label| {
1060 label.len() >= 3 && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 3
1061 })
1062}
1063
1064fn extract_license_information(
1065 file_info_builder: &mut FileInfoBuilder,
1066 scan_errors: &mut Vec<String>,
1067 path: &Path,
1068 text_content: String,
1069 license_engine: Option<Arc<LicenseDetectionEngine>>,
1070 license_options: LicenseScanOptions,
1071 from_binary_strings: bool,
1072) -> Result<(), Error> {
1073 let Some(engine) = license_engine else {
1074 return Ok(());
1075 };
1076
1077 let detection_result = if license_options.min_score == 0 {
1078 engine.detect_with_kind_and_source(
1079 &text_content,
1080 license_options.unknown_licenses,
1081 from_binary_strings,
1082 &path.to_string_lossy(),
1083 )
1084 } else {
1085 engine.detect_with_kind_and_source_with_score(
1086 &text_content,
1087 license_options.unknown_licenses,
1088 from_binary_strings,
1089 &path.to_string_lossy(),
1090 license_options.min_score as f32,
1091 )
1092 };
1093
1094 match detection_result {
1095 Ok(detections) => {
1096 let query =
1097 Query::from_extracted_text(&text_content, engine.index(), from_binary_strings).ok();
1098 let mut model_detections = Vec::new();
1099 let mut model_clues = Vec::new();
1100
1101 for detection in &detections {
1102 let (public_detection, clue_matches) = convert_detection_to_model(
1103 detection,
1104 license_options,
1105 &text_content,
1106 query.as_ref(),
1107 );
1108
1109 if let Some(public_detection) = public_detection {
1110 model_detections.push(public_detection);
1111 }
1112
1113 model_clues.extend(clue_matches);
1114 }
1115
1116 if !model_detections.is_empty() {
1117 let expressions: Vec<String> = model_detections
1118 .iter()
1119 .filter(|d| !d.license_expression_spdx.is_empty())
1120 .map(|d| d.license_expression_spdx.clone())
1121 .collect();
1122
1123 if !expressions.is_empty() {
1124 let combined = crate::utils::spdx::combine_license_expressions(expressions);
1125 if let Some(expr) = combined {
1126 file_info_builder.license_expression(Some(expr));
1127 }
1128 }
1129 }
1130
1131 file_info_builder.license_detections(model_detections);
1132 file_info_builder.license_clues(model_clues);
1133 file_info_builder.percentage_of_license_text(
1134 query
1135 .as_ref()
1136 .map(|query| compute_percentage_of_license_text(query, &detections)),
1137 );
1138 }
1139 Err(e) => {
1140 scan_errors.push(format!("License detection failed: {}", e));
1141 }
1142 }
1143
1144 Ok(())
1145}
1146
1147fn convert_detection_to_model(
1148 detection: &crate::license_detection::LicenseDetection,
1149 license_options: LicenseScanOptions,
1150 text_content: &str,
1151 query: Option<&Query<'_>>,
1152) -> (Option<LicenseDetection>, Vec<Match>) {
1153 let matches: Vec<Match> = detection
1154 .matches
1155 .iter()
1156 .map(|m| convert_match_to_model(m, license_options, text_content, query))
1157 .collect();
1158
1159 if let Some(license_expression) = detection.license_expression.clone() {
1160 (
1161 Some(LicenseDetection {
1162 license_expression,
1163 license_expression_spdx: detection
1164 .license_expression_spdx
1165 .clone()
1166 .unwrap_or_default(),
1167 matches,
1168 detection_log: if license_options.include_diagnostics {
1169 detection.detection_log.clone()
1170 } else {
1171 Vec::new()
1172 },
1173 identifier: detection.identifier.clone(),
1174 }),
1175 Vec::new(),
1176 )
1177 } else {
1178 (None, matches)
1179 }
1180}
1181
1182fn convert_match_to_model(
1183 m: &crate::license_detection::models::LicenseMatch,
1184 license_options: LicenseScanOptions,
1185 text_content: &str,
1186 query: Option<&Query<'_>>,
1187) -> Match {
1188 let output_metric = |value: f32| ((value as f64) * 100.0).round() / 100.0;
1189 let rule_url = if m.rule_url.is_empty() {
1190 None
1191 } else {
1192 Some(m.rule_url.clone())
1193 };
1194 let matched_text = if license_options.include_text {
1195 m.matched_text.clone().or_else(|| {
1196 Some(crate::license_detection::query::matched_text_from_text(
1197 text_content,
1198 m.start_line,
1199 m.end_line,
1200 ))
1201 })
1202 } else {
1203 None
1204 };
1205 let matched_text_diagnostics = if license_options.include_text_diagnostics {
1206 query.map(|query| matched_text_diagnostics_from_match(query, m))
1207 } else {
1208 None
1209 };
1210 Match {
1211 license_expression: m.license_expression.clone(),
1212 license_expression_spdx: m.license_expression_spdx.clone().unwrap_or_default(),
1213 from_file: m.from_file.clone(),
1214 start_line: m.start_line,
1215 end_line: m.end_line,
1216 matcher: Some(m.matcher.to_string()),
1217 score: output_metric(m.score),
1218 matched_length: Some(m.matched_length),
1219 match_coverage: Some(output_metric(m.coverage())),
1220 rule_relevance: Some(m.rule_relevance as usize),
1221 rule_identifier: Some(m.rule_identifier.clone()),
1222 rule_url,
1223 matched_text,
1224 referenced_filenames: m.referenced_filenames.clone(),
1225 matched_text_diagnostics,
1226 }
1227}
1228
1229fn compute_percentage_of_license_text(
1230 query: &Query<'_>,
1231 detections: &[crate::license_detection::LicenseDetection],
1232) -> f64 {
1233 let matched_positions: std::collections::HashSet<usize> = detections
1234 .iter()
1235 .flat_map(|detection| detection.matches.iter())
1236 .flat_map(|m| m.query_span().iter())
1237 .collect();
1238
1239 let query_tokens_length = query.tokens.len() + query.unknowns_by_pos.values().sum::<usize>();
1240 if query_tokens_length == 0 {
1241 return 0.0;
1242 }
1243
1244 let percentage = (matched_positions.len() as f64 / query_tokens_length as f64) * 100.0;
1245 (percentage * 100.0).round() / 100.0
1246}
1247
1248fn matched_text_diagnostics_from_match(
1249 query: &Query<'_>,
1250 license_match: &InternalLicenseMatch,
1251) -> String {
1252 let matched_positions: PositionSet = license_match.query_span().iter().collect();
1253 let Some(start_pos) = matched_positions.iter().min() else {
1254 return crate::license_detection::query::matched_text_from_text(
1255 &query.text,
1256 license_match.start_line,
1257 license_match.end_line,
1258 );
1259 };
1260 let Some(end_pos) = matched_positions.iter().max() else {
1261 return crate::license_detection::query::matched_text_from_text(
1262 &query.text,
1263 license_match.start_line,
1264 license_match.end_line,
1265 );
1266 };
1267
1268 crate::license_detection::query::matched_text_diagnostics_from_text(
1269 &query.text,
1270 query,
1271 &matched_positions,
1272 start_pos,
1273 end_pos,
1274 license_match.start_line,
1275 license_match.end_line,
1276 )
1277}
1278
1279fn should_skip_text_detection(path: &Path, buffer: &[u8]) -> bool {
1280 is_pem_certificate_file(path, buffer)
1281}
1282
/// Reports whether `path` is a Go source file that is not production code.
///
/// A file qualifies when it has the `.go` extension and either
///
/// * its name ends with `_test.go` (decided from the name alone, the file is
///   not opened), or
/// * one of its first ten lines is a build constraint (`//go:build` or
///   `// +build`) containing the standalone token `test`.
///
/// Paths without a `.go` extension return `Ok(false)` without touching the
/// file system.
///
/// Only the leading lines are read: the file is streamed through a buffered
/// reader and scanning stops at the first matching constraint or after ten
/// lines. Content beyond that window is never loaded or UTF-8 validated, so
/// large files stay cheap and stray bytes later in the file cannot turn the
/// heuristic into an error.
///
/// # Errors
///
/// Returns the underlying I/O error when the file cannot be opened or read,
/// or when one of the inspected lines is not valid UTF-8.
fn is_go_non_production_source(path: &Path) -> std::io::Result<bool> {
    /// Number of leading lines inspected for a build constraint.
    const BUILD_CONSTRAINT_SCAN_LINES: usize = 10;

    if path.extension().and_then(|ext| ext.to_str()) != Some("go") {
        return Ok(false);
    }

    if path
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| name.ends_with("_test.go"))
    {
        return Ok(true);
    }

    let reader = std::io::BufReader::new(fs::File::open(path)?);
    // Fully qualified call so the `BufRead` trait need not be imported.
    for line in std::io::BufRead::lines(reader).take(BUILD_CONSTRAINT_SCAN_LINES) {
        let line = line?;
        let trimmed = line.trim();
        let is_build_constraint =
            trimmed.starts_with("//go:build") || trimmed.starts_with("// +build");
        if is_build_constraint && trimmed.split_whitespace().any(|token| token == "test") {
            return Ok(true);
        }
    }

    Ok(false)
}
1303
1304fn is_pem_certificate_file(_path: &Path, buffer: &[u8]) -> bool {
1305 let prefix_len = buffer.len().min(8192);
1306 let prefix = String::from_utf8_lossy(&buffer[..prefix_len]);
1307 let trimmed_lines: Vec<&str> = prefix
1308 .lines()
1309 .map(str::trim)
1310 .filter(|line| !line.is_empty())
1311 .take(64)
1312 .collect();
1313
1314 let Some(first_line) = trimmed_lines.first().copied() else {
1315 return false;
1316 };
1317
1318 PEM_CERTIFICATE_HEADERS
1319 .iter()
1320 .any(|(begin, end)| first_line == *begin && trimmed_lines.iter().any(|line| line == end))
1321}
1322
1323fn process_directory(
1324 path: &Path,
1325 _metadata: &fs::Metadata,
1326 collect_info: bool,
1327 license_enabled: bool,
1328) -> FileInfo {
1329 let name = path
1330 .file_name()
1331 .unwrap_or_default()
1332 .to_string_lossy()
1333 .to_string();
1334 let base_name = name.clone(); FileInfo {
1337 name,
1338 base_name,
1339 extension: "".to_string(),
1340 path: path.to_string_lossy().to_string(),
1341 file_type: FileType::Directory,
1342 mime_type: None,
1343 file_type_label: None,
1344 size: 0,
1345 date: None,
1346 sha1: None,
1347 md5: None,
1348 sha256: None,
1349 sha1_git: None,
1350 programming_language: None,
1351 package_data: Vec::new(),
1352 license_expression: None,
1353 license_detections: Vec::new(),
1354 license_clues: Vec::new(),
1355 percentage_of_license_text: license_enabled.then_some(0.0),
1356 copyrights: Vec::new(),
1357 holders: Vec::new(),
1358 authors: Vec::new(),
1359 emails: Vec::new(),
1360 urls: Vec::new(),
1361 for_packages: Vec::new(),
1362 scan_errors: Vec::new(),
1363 license_policy: None,
1364 is_binary: collect_info.then_some(false),
1365 is_text: collect_info.then_some(false),
1366 is_archive: collect_info.then_some(false),
1367 is_media: collect_info.then_some(false),
1368 is_source: collect_info.then_some(false),
1369 is_script: collect_info.then_some(false),
1370 files_count: collect_info.then_some(0),
1371 dirs_count: collect_info.then_some(0),
1372 size_count: collect_info.then_some(0),
1373 source_count: None,
1374 is_legal: false,
1375 is_manifest: false,
1376 is_readme: false,
1377 is_top_level: false,
1378 is_key_file: false,
1379 is_community: false,
1380 is_generated: None,
1381 facets: vec![],
1382 tallies: None,
1383 }
1384}
1385
#[cfg(test)]
mod tests {
    //! Unit tests for license-model conversion, binary-string contact
    //! filtering, license-text percentage and Go test-source detection.

    use super::{
        compute_percentage_of_license_text, convert_detection_to_model,
        extract_email_url_information, extract_named_author_from_binary_line,
        is_binary_string_author_candidate, is_binary_string_copyright_candidate,
        is_binary_string_email_candidate, is_binary_string_url_candidate,
        is_go_non_production_source,
    };
    use crate::license_detection::LicenseDetection as InternalLicenseDetection;
    use crate::license_detection::index::LicenseIndex;
    use crate::license_detection::index::dictionary::TokenDictionary;
    use crate::license_detection::models::position_span::PositionSpan;
    use crate::license_detection::models::{LicenseMatch, MatchCoordinates, MatcherKind, RuleKind};
    use crate::license_detection::query::Query;
    use crate::models::{FileInfo, FileInfoBuilder, FileType};
    use crate::scanner::scan_options_fingerprint;
    use crate::scanner::{LicenseScanOptions, TextDetectionOptions};
    use std::fs;
    use tempfile::tempdir;

    /// Builds a single-line MIT hash match with the given rule URL.
    fn make_internal_match(rule_url: &str) -> LicenseMatch {
        LicenseMatch {
            rid: 0,
            license_expression: "mit".to_string(),
            license_expression_spdx: Some("MIT".to_string()),
            from_file: None,
            start_line: 1,
            end_line: 1,
            start_token: 0,
            end_token: 1,
            matcher: MatcherKind::Hash,
            score: 1.0,
            matched_length: 3,
            rule_length: 3,
            match_coverage: 100.0,
            rule_relevance: 100,
            rule_identifier: "mit.LICENSE".to_string(),
            rule_url: rule_url.to_string(),
            matched_text: Some("MIT".to_string()),
            referenced_filenames: None,
            rule_kind: RuleKind::Text,
            is_from_license: true,
            rule_start_token: 0,
            coordinates: MatchCoordinates::query_region(PositionSpan::empty()),
            candidate_resemblance: 0.0,
            candidate_containment: 0.0,
        }
    }

    /// Builds an MIT detection wrapping one match from `make_internal_match`.
    fn make_detection(rule_url: &str) -> InternalLicenseDetection {
        InternalLicenseDetection {
            license_expression: Some("mit".to_string()),
            license_expression_spdx: Some("MIT".to_string()),
            matches: vec![make_internal_match(rule_url)],
            detection_log: vec![],
            identifier: Some("mit-test".to_string()),
            file_regions: Vec::new(),
        }
    }

    /// Builds a license index over `entries` with the given legalese length.
    fn create_test_index(entries: &[(&str, u16)], len_legalese: usize) -> LicenseIndex {
        let dictionary = TokenDictionary::new_with_legalese(entries);
        let mut index = LicenseIndex::new(dictionary);
        index.len_legalese = len_legalese;
        index
    }

    /// Options with only email and URL detection enabled, capped at 50 each.
    /// Tests that need other values override them with struct-update syntax.
    fn email_url_detection_options() -> TextDetectionOptions {
        TextDetectionOptions {
            collect_info: false,
            detect_packages: false,
            detect_application_packages: false,
            detect_system_packages: false,
            detect_packages_in_compiled: false,
            detect_copyrights: false,
            detect_generated: false,
            detect_emails: true,
            detect_urls: true,
            max_emails: 50,
            max_urls: 50,
            timeout_seconds: 120.0,
        }
    }

    /// Runs email/URL extraction over `text` as binary-string content and
    /// returns the resulting file record for a dummy `binary.bin` file.
    fn extract_contacts_from_binary_text(text: &str, options: &TextDetectionOptions) -> FileInfo {
        let mut builder = FileInfoBuilder::default();

        extract_email_url_information(&mut builder, text, options, true);

        builder
            .name("binary.bin".to_string())
            .base_name("binary".to_string())
            .extension(".bin".to_string())
            .path("binary.bin".to_string())
            .file_type(FileType::File)
            .size(1)
            .build()
            .expect("builder should produce file info")
    }

    /// Writes `contents` to `file_name` inside a fresh temporary directory
    /// and classifies it with `is_go_non_production_source`.
    fn classify_go_source(file_name: &str, contents: &str) -> bool {
        let temp_dir = tempdir().unwrap();
        let path = temp_dir.path().join(file_name);
        fs::write(&path, contents).unwrap();

        is_go_non_production_source(&path).unwrap()
    }

    #[test]
    fn test_convert_detection_to_model_preserves_rule_url() {
        const MIT_RULE_URL: &str = "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE";
        let detection = make_detection(MIT_RULE_URL);

        let (converted, clues) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let converted = converted.expect("detection should convert");

        assert_eq!(converted.matches[0].rule_url.as_deref(), Some(MIT_RULE_URL));
        assert!(clues.is_empty());
    }

    #[test]
    fn test_convert_detection_to_model_emits_null_for_empty_rule_url() {
        let detection = make_detection("");

        let (converted, clues) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let converted = converted.expect("detection should convert");

        assert_eq!(converted.matches[0].rule_url, None);
        assert!(clues.is_empty());
    }

    #[test]
    fn test_convert_detection_to_model_rounds_match_coverage() {
        let mut detection = make_detection("");
        detection.matches[0].score = 81.82;
        detection.matches[0].match_coverage = 33.334;

        let (converted, clues) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let converted = converted.expect("detection should convert");

        assert_eq!(converted.matches[0].score, 81.82);
        assert_eq!(converted.matches[0].match_coverage, Some(33.33));
        assert!(clues.is_empty());
    }

    #[test]
    fn test_convert_detection_to_model_routes_expressionless_detection_to_license_clues() {
        let mut detection = make_detection(
            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/license-clue_1.RULE",
        );
        detection.license_expression = None;
        detection.license_expression_spdx = None;
        detection.identifier = None;

        let clue_match = &mut detection.matches[0];
        clue_match.license_expression = "unknown-license-reference".to_string();
        clue_match.license_expression_spdx =
            Some("LicenseRef-scancode-unknown-license-reference".to_string());
        clue_match.rule_identifier = "license-clue_1.RULE".to_string();
        clue_match.rule_kind = RuleKind::Clue;

        let (converted, clues) = convert_detection_to_model(
            &detection,
            LicenseScanOptions {
                include_text: true,
                min_score: 0,
                ..LicenseScanOptions::default()
            },
            "clue text",
            None,
        );

        assert!(converted.is_none());
        assert_eq!(clues.len(), 1);
        assert_eq!(clues[0].license_expression, "unknown-license-reference");
        assert_eq!(
            clues[0].license_expression_spdx,
            "LicenseRef-scancode-unknown-license-reference"
        );
        assert_eq!(
            clues[0].rule_identifier.as_deref(),
            Some("license-clue_1.RULE")
        );
        // The text stored on the match wins over the file content.
        assert_eq!(clues[0].matched_text.as_deref(), Some("MIT"));
        assert_eq!(clues[0].matched_text_diagnostics, None);
    }

    #[test]
    fn test_convert_detection_to_model_includes_diagnostics_when_enabled() {
        let text = concat!(
            "Reproduction and distribution of this file, with or without modification, are\n",
            "permitted in any medium without royalties provided the copyright notice\n",
            "and this notice are preserved. This file is offered as-is, without any warranties.\n",
        );
        let index = create_test_index(
            &[
                ("reproduction", 0),
                ("distribution", 1),
                ("file", 2),
                ("without", 3),
                ("modification", 4),
                ("permitted", 5),
                ("medium", 6),
                ("royalties", 7),
                ("provided", 8),
                ("copyright", 9),
                ("notice", 10),
                ("preserved", 11),
                ("offered", 12),
                ("warranties", 13),
            ],
            14,
        );
        let query = Query::from_extracted_text(text, &index, false).expect("query should build");

        // Every query position except index 9 is marked as matched.
        let matched_positions = query
            .tokens
            .iter()
            .enumerate()
            .filter_map(|(idx, _)| (idx != 9).then_some(idx))
            .collect::<Vec<_>>();

        let mut detection = make_detection(
            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/fsf-ap.LICENSE",
        );
        detection.detection_log = vec!["imperfect-match-coverage".to_string()];
        detection.identifier = Some("fsf_ap-test".to_string());

        let license_match = &mut detection.matches[0];
        license_match.license_expression = "fsf-ap".to_string();
        license_match.license_expression_spdx = Some("FSFAP".to_string());
        license_match.rule_identifier = "fsf-ap.LICENSE".to_string();
        license_match.matched_text = None;
        license_match.start_line = 1;
        license_match.end_line = 3;
        license_match.start_token = 0;
        license_match.end_token = query.tokens.len();
        license_match.coordinates =
            MatchCoordinates::query_region(PositionSpan::from_positions(matched_positions));

        let (converted, clues) = convert_detection_to_model(
            &detection,
            LicenseScanOptions {
                include_text: true,
                include_text_diagnostics: true,
                include_diagnostics: true,
                unknown_licenses: false,
                min_score: 0,
            },
            text,
            Some(&query),
        );
        let converted = converted.expect("detection should convert");

        assert!(clues.is_empty());
        assert_eq!(converted.detection_log, vec!["imperfect-match-coverage"]);
        assert_eq!(
            converted.matches[0].matched_text.as_deref(),
            Some(text.trim_end())
        );
        let diagnostics = converted.matches[0]
            .matched_text_diagnostics
            .as_deref()
            .expect("diagnostics should be present");
        assert!(diagnostics.contains('['));
        assert!(diagnostics.contains(']'));
        assert_ne!(diagnostics, text.trim_end());
    }

    #[test]
    fn test_extract_email_url_information_skips_binary_string_text() {
        let file = extract_contacts_from_binary_text(
            "contact 6h@fo.lwft and visit http://gmail.com/",
            &email_url_detection_options(),
        );

        assert!(file.emails.is_empty(), "emails: {:?}", file.emails);
        assert!(file.urls.is_empty(), "urls: {:?}", file.urls);
    }

    #[test]
    fn test_extract_email_url_information_keeps_good_binary_contacts() {
        let file = extract_contacts_from_binary_text(
            "report bugs to bug-coreutils@gnu.org and see https://www.gnu.org/software/coreutils/",
            &email_url_detection_options(),
        );

        assert_eq!(file.emails.len(), 1, "emails: {:?}", file.emails);
        assert_eq!(file.emails[0].email, "bug-coreutils@gnu.org");
        assert_eq!(file.urls.len(), 1, "urls: {:?}", file.urls);
        assert_eq!(file.urls[0].url, "https://www.gnu.org/software/coreutils/");
    }

    #[test]
    fn test_extract_email_url_information_deduplicates_binary_emails_before_cap() {
        let options = TextDetectionOptions {
            detect_urls: false,
            max_emails: 2,
            ..email_url_detection_options()
        };

        let file = extract_contacts_from_binary_text(
            "first jakub@redhat.com second jakub@redhat.com third contyk@redhat.com",
            &options,
        );

        assert_eq!(file.emails.len(), 2, "emails: {:?}", file.emails);
        assert_eq!(file.emails[0].email, "jakub@redhat.com");
        assert_eq!(file.emails[1].email, "contyk@redhat.com");
    }

    #[test]
    fn test_binary_string_copyright_candidate_rejects_gibberish_holder_text() {
        let gibberish = "(c) S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9) M0@9s J'@y DH@9Ih@y";
        assert!(!is_binary_string_copyright_candidate(gibberish));
    }

    #[test]
    fn test_binary_string_copyright_candidate_keeps_real_notice() {
        let notice = "Copyright nexB and others (c) 2012";
        assert!(is_binary_string_copyright_candidate(notice));
    }

    #[test]
    fn test_binary_string_copyright_candidate_rejects_changelog_phrase() {
        assert!(!is_binary_string_copyright_candidate(
            "Copyright - split out libs"
        ));
    }

    #[test]
    fn test_binary_string_email_candidate_rejects_gibberish() {
        assert!(!is_binary_string_email_candidate("6h@fo.lwft"));
    }

    #[test]
    fn test_binary_string_email_candidate_keeps_gnu_bug_address() {
        assert!(is_binary_string_email_candidate("bug-coreutils@gnu.org"));
    }

    #[test]
    fn test_binary_string_url_candidate_rejects_short_fake_host() {
        assert!(!is_binary_string_url_candidate("http://ftp.so/"));
    }

    #[test]
    fn test_binary_string_url_candidate_keeps_gnu_help_url() {
        assert!(is_binary_string_url_candidate(
            "https://www.gnu.org/software/coreutils/"
        ));
    }

    #[test]
    fn test_binary_string_url_candidate_rejects_bare_root_domain() {
        assert!(!is_binary_string_url_candidate("http://gmail.com/"));
    }

    #[test]
    fn test_binary_string_url_candidate_keeps_project_subdomain_root() {
        assert!(is_binary_string_url_candidate("http://gcc.gnu.org"));
    }

    #[test]
    fn test_binary_string_url_candidate_keeps_long_org_root_domain() {
        assert!(is_binary_string_url_candidate("https://publicsuffix.org/"));
    }

    #[test]
    fn test_binary_string_url_candidate_keeps_short_project_path() {
        assert!(is_binary_string_url_candidate("http://tukaani.org/xz/"));
    }

    #[test]
    fn test_binary_string_author_candidate_keeps_named_author_with_email() {
        assert!(is_binary_string_author_candidate(
            "Andreas Schneider <asn@redhat.com>"
        ));
    }

    #[test]
    fn test_binary_string_author_candidate_rejects_gibberish() {
        assert!(!is_binary_string_author_candidate(
            "S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9"
        ));
    }

    #[test]
    fn test_binary_string_author_candidate_rejects_changelog_phrase() {
        assert!(!is_binary_string_author_candidate(
            "Developers can enable them. - revert news user back to"
        ));
    }

    #[test]
    fn test_extract_named_author_from_binary_line_recovers_by_prefix() {
        assert_eq!(
            extract_named_author_from_binary_line("Patch by Andreas Schneider <asn@redhat.com>"),
            Some("Andreas Schneider <asn@redhat.com>".to_string())
        );
    }

    #[test]
    fn test_extract_named_author_from_binary_line_recovers_parenthesized_email() {
        assert_eq!(
            extract_named_author_from_binary_line(
                "same for both OpenSSL and NSS by Rob Crittenden (rcritten@redhat.com)"
            ),
            Some("Rob Crittenden (rcritten@redhat.com)".to_string())
        );
    }

    #[test]
    fn test_extract_named_author_from_binary_line_rejects_plain_changelog_packager_line() {
        assert_eq!(
            extract_named_author_from_binary_line(
                "Rob Crittenden <rcritten@redhat.com> - 3.11.7-9"
            ),
            None
        );
    }

    #[test]
    fn test_extract_named_author_from_binary_line_keeps_email_only_review_author() {
        assert_eq!(
            extract_named_author_from_binary_line(
                "Changes as per initial review by panemade@gmail.com"
            ),
            Some("panemade@gmail.com".to_string())
        );
    }

    #[test]
    fn test_binary_string_author_candidate_rejects_multiple_emails_on_one_line() {
        assert!(!is_binary_string_author_candidate(
            "Rob Crittenden (rcritten@redhat.com) jakub@redhat.com"
        ));
    }

    #[test]
    fn test_compute_percentage_of_license_text_counts_unknown_tokens() {
        let index = create_test_index(&[("alpha", 0), ("mit", 1)], 2);
        let text = "alpha MIT omega";
        let query = Query::from_extracted_text(text, &index, false).expect("query should build");
        let mut detection = make_detection("");
        detection.matches[0].coordinates =
            MatchCoordinates::query_region(PositionSpan::from_positions(vec![1]));
        detection.matches[0].start_token = 1;
        detection.matches[0].end_token = 2;

        let percentage = compute_percentage_of_license_text(&query, &[detection]);

        assert_eq!(percentage, 33.33);
    }

    #[test]
    fn test_scan_options_fingerprint_changes_with_license_score() {
        let text_options = TextDetectionOptions::default();
        let default_fingerprint = scan_options_fingerprint(
            &text_options,
            LicenseScanOptions {
                min_score: 0,
                ..LicenseScanOptions::default()
            },
            None,
        );
        let filtered_fingerprint = scan_options_fingerprint(
            &text_options,
            LicenseScanOptions {
                min_score: 70,
                ..LicenseScanOptions::default()
            },
            None,
        );

        assert_ne!(default_fingerprint, filtered_fingerprint);
    }

    #[test]
    fn test_is_go_non_production_source_for_test_filename() {
        assert!(classify_go_source("scanner_test.go", "package scanner\n"));
    }

    #[test]
    fn test_is_go_non_production_source_for_build_tag() {
        assert!(classify_go_source(
            "scanner.go",
            "//go:build test\n\npackage scanner\n"
        ));
    }

    #[test]
    fn test_is_go_non_production_source_for_regular_go_file() {
        assert!(!classify_go_source("scanner.go", "package scanner\n"));
    }
}