1use crate::license_detection::LicenseDetectionEngine;
2use crate::parsers::{try_parse_compiled_bytes, try_parse_file};
3use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha1_git, calculate_sha256};
4use crate::utils::text::{
5 remove_verbatim_escape_sequences, should_remove_verbatim_escape_sequences,
6};
7use anyhow::Error;
8use rayon::prelude::*;
9use std::fs::{self, File};
10use std::io::{Read, Write};
11use std::path::Path;
12use std::sync::Arc;
13use std::time::{Duration, Instant};
14
15use crate::copyright::{
16 self, AuthorDetection, CopyrightDetection, CopyrightDetectionOptions, HolderDetection,
17};
18use crate::finder::{self, DetectionConfig};
19use crate::license_detection::PositionSet;
20use crate::license_detection::models::LicenseMatch as InternalLicenseMatch;
21use crate::license_detection::query::Query;
22use crate::models::{
23 Author, Copyright, DatasourceId, FileInfo, FileInfoBuilder, FileType, Holder, LicenseDetection,
24 Match, OutputEmail, OutputURL,
25};
26use crate::progress::ScanProgress;
27use crate::scanner::collect::CollectedPaths;
28use crate::scanner::{LicenseScanOptions, ProcessResult, TextDetectionOptions};
29use crate::utils::file::{
30 ExtractedTextKind, classify_file_info, extract_text_for_detection, get_creation_date,
31};
32use crate::utils::generated::generated_code_hints_from_bytes;
33use tempfile::TempDir;
34
/// Marker lines that delimit a PEM-encoded certificate.
///
/// Each entry is a `(begin_marker, end_marker)` pair.
const PEM_CERTIFICATE_HEADERS: &[(&str, &str)] = &[
    ("-----BEGIN CERTIFICATE-----", "-----END CERTIFICATE-----"),
    (
        "-----BEGIN TRUSTED CERTIFICATE-----",
        "-----END TRUSTED CERTIFICATE-----",
    ),
];
42
43pub fn process_collected(
44 collected: &CollectedPaths,
45 progress: Arc<ScanProgress>,
46 license_engine: Option<Arc<LicenseDetectionEngine>>,
47 license_options: LicenseScanOptions,
48 text_options: &TextDetectionOptions,
49) -> ProcessResult {
50 let mut all_files: Vec<FileInfo> = collected
51 .files
52 .par_iter()
53 .map(|(path, metadata)| {
54 let file_entry = process_file(
55 path,
56 metadata,
57 progress.as_ref(),
58 license_engine.clone(),
59 license_options,
60 text_options,
61 );
62 progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
63 file_entry
64 })
65 .collect();
66
67 for (path, metadata) in &collected.directories {
68 all_files.push(process_directory(
69 path,
70 metadata,
71 text_options.collect_info,
72 license_engine.is_some(),
73 ));
74 }
75
76 ProcessResult {
77 files: all_files,
78 excluded_count: collected.excluded_count,
79 }
80}
81
82pub fn process_collected_with_memory_limit(
83 collected: &CollectedPaths,
84 progress: Arc<ScanProgress>,
85 license_engine: Option<Arc<LicenseDetectionEngine>>,
86 license_options: LicenseScanOptions,
87 text_options: &TextDetectionOptions,
88 max_in_memory: i64,
89) -> ProcessResult {
90 if max_in_memory == 0 {
91 return process_collected(
92 collected,
93 progress,
94 license_engine,
95 license_options,
96 text_options,
97 );
98 }
99
100 let memory_limit = if max_in_memory < 0 {
101 0
102 } else {
103 max_in_memory as usize
104 };
105 let chunk_size = if max_in_memory < 0 {
106 256
107 } else {
108 memory_limit.max(1)
109 };
110
111 let mut retained_files = Vec::new();
112 let mut spill_store = None;
113
114 for chunk in collected.files.chunks(chunk_size) {
115 let processed_chunk: Vec<FileInfo> = chunk
116 .par_iter()
117 .map(|(path, metadata)| {
118 let file_entry = process_file(
119 path,
120 metadata,
121 progress.as_ref(),
122 license_engine.clone(),
123 license_options,
124 text_options,
125 );
126 progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
127 file_entry
128 })
129 .collect();
130
131 retain_or_spill_chunk(
132 processed_chunk,
133 &mut retained_files,
134 &mut spill_store,
135 memory_limit,
136 );
137 }
138
139 for (path, metadata) in &collected.directories {
140 let entry = process_directory(
141 path,
142 metadata,
143 text_options.collect_info,
144 license_engine.is_some(),
145 );
146 retain_or_spill_chunk(
147 vec![entry],
148 &mut retained_files,
149 &mut spill_store,
150 memory_limit,
151 );
152 }
153
154 if let Some(spill_store) = spill_store {
155 retained_files.extend(spill_store.load_all());
156 }
157
158 ProcessResult {
159 files: retained_files,
160 excluded_count: collected.excluded_count,
161 }
162}
163
164fn retain_or_spill_chunk(
165 chunk: Vec<FileInfo>,
166 retained_files: &mut Vec<FileInfo>,
167 spill_store: &mut Option<FileInfoSpillStore>,
168 memory_limit: usize,
169) {
170 if memory_limit == 0 {
171 spill_store
172 .get_or_insert_with(FileInfoSpillStore::new)
173 .spill(chunk);
174 return;
175 }
176
177 let remaining_capacity = memory_limit.saturating_sub(retained_files.len());
178 if remaining_capacity >= chunk.len() && spill_store.is_none() {
179 retained_files.extend(chunk);
180 return;
181 }
182
183 let mut chunk_iter = chunk.into_iter();
184 retained_files.extend(chunk_iter.by_ref().take(remaining_capacity));
185 let overflow: Vec<FileInfo> = chunk_iter.collect();
186 if !overflow.is_empty() {
187 spill_store
188 .get_or_insert_with(FileInfoSpillStore::new)
189 .spill(overflow);
190 }
191}
192
193struct FileInfoSpillStore {
194 temp_dir: TempDir,
195 batch_index: usize,
196}
197
198impl FileInfoSpillStore {
199 fn new() -> Self {
200 Self {
201 temp_dir: TempDir::new().expect("create spill dir"),
202 batch_index: 0,
203 }
204 }
205
206 fn spill(&mut self, files: Vec<FileInfo>) {
207 let path = self
208 .temp_dir
209 .path()
210 .join(format!("batch-{:06}.json.zst", self.batch_index));
211 self.batch_index += 1;
212
213 let payload = serde_json::to_vec(&files).expect("encode spilled file batch");
214 let file = File::create(path).expect("create spill batch file");
215 let mut encoder = zstd::Encoder::new(file, 3).expect("create spill encoder");
216 encoder
217 .write_all(&payload)
218 .expect("write spilled file batch");
219 encoder.finish().expect("finish spill encoder");
220 }
221
222 fn load_all(self) -> Vec<FileInfo> {
223 let mut paths: Vec<_> = fs::read_dir(self.temp_dir.path())
224 .expect("read spill dir")
225 .filter_map(Result::ok)
226 .map(|entry| entry.path())
227 .collect();
228 paths.sort();
229
230 let mut files = Vec::new();
231 for path in paths {
232 let file = File::open(path).expect("open spill batch");
233 let mut decoder = zstd::Decoder::new(file).expect("create spill decoder");
234 let mut payload = Vec::new();
235 decoder.read_to_end(&mut payload).expect("read spill batch");
236 let mut batch: Vec<FileInfo> =
237 serde_json::from_slice(&payload).expect("decode spilled file batch");
238 files.append(&mut batch);
239 }
240 files
241 }
242}
243
244fn process_file(
245 path: &Path,
246 metadata: &fs::Metadata,
247 progress: &ScanProgress,
248 license_engine: Option<Arc<LicenseDetectionEngine>>,
249 license_options: LicenseScanOptions,
250 text_options: &TextDetectionOptions,
251) -> FileInfo {
252 let mut scan_errors: Vec<String> = vec![];
253 let mut file_info_builder = FileInfoBuilder::default();
254 let license_enabled = license_engine.is_some();
255
256 let started = Instant::now();
257
258 let mut generated_flag = None;
259 let mut is_source_file = false;
260 match extract_information_from_content(
261 &mut file_info_builder,
262 &mut scan_errors,
263 path,
264 progress,
265 license_engine,
266 license_options,
267 text_options,
268 ) {
269 Ok((is_generated, sha256, is_source)) => {
270 generated_flag = is_generated;
271 is_source_file = is_source;
272 let _ = sha256;
273 }
274 Err(e) => scan_errors.push(e.to_string()),
275 };
276
277 if is_timeout_exceeded(started, text_options.timeout_seconds) {
278 scan_errors.push(format!(
279 "Processing interrupted due to timeout after {:.2} seconds",
280 text_options.timeout_seconds
281 ));
282 }
283
284 let mut file_info = file_info_builder
285 .name(path.file_name().unwrap().to_string_lossy().to_string())
286 .base_name(
287 path.file_stem()
288 .unwrap_or_default()
289 .to_string_lossy()
290 .to_string(),
291 )
292 .extension(
293 path.extension()
294 .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
295 )
296 .path(path.to_string_lossy().to_string())
297 .file_type(FileType::File)
298 .size(metadata.len())
299 .date(
300 text_options
301 .collect_info
302 .then(|| get_creation_date(metadata))
303 .flatten(),
304 )
305 .scan_errors(scan_errors)
306 .build()
307 .expect("FileInformationBuild not completely initialized");
308
309 if text_options.collect_info {
310 file_info.is_source = Some(is_source_file);
311 }
312
313 if file_info.programming_language.as_deref() == Some("Go")
314 && is_go_non_production_source(path).unwrap_or(false)
315 {
316 file_info.is_source = Some(false);
317 }
318
319 if text_options.detect_generated {
320 file_info.is_generated = Some(generated_flag.unwrap_or(false));
321 }
322
323 if file_info.percentage_of_license_text.is_none() && license_enabled {
324 file_info.percentage_of_license_text = Some(0.0);
325 }
326
327 file_info
328}
329
330fn extract_information_from_content(
331 file_info_builder: &mut FileInfoBuilder,
332 scan_errors: &mut Vec<String>,
333 path: &Path,
334 progress: &ScanProgress,
335 license_engine: Option<Arc<LicenseDetectionEngine>>,
336 license_options: LicenseScanOptions,
337 text_options: &TextDetectionOptions,
338) -> Result<(Option<bool>, String, bool), Error> {
339 let started = Instant::now();
340 let buffer = fs::read(path)?;
341 let license_enabled = license_engine.is_some();
342
343 if is_timeout_exceeded(started, text_options.timeout_seconds) {
344 return Err(Error::msg(format!(
345 "Timeout while reading file content (> {:.2}s)",
346 text_options.timeout_seconds
347 )));
348 }
349
350 let sha256 = calculate_sha256(&buffer);
351 let is_generated = text_options
352 .detect_generated
353 .then(|| !generated_code_hints_from_bytes(&buffer).is_empty());
354 let classification = classify_file_info(path, &buffer);
355
356 if text_options.collect_info {
357 file_info_builder
358 .sha1(Some(calculate_sha1(&buffer)))
359 .md5(Some(calculate_md5(&buffer)))
360 .sha256(Some(sha256.clone()))
361 .programming_language(classification.programming_language.clone())
362 .mime_type(Some(classification.mime_type.clone()))
363 .file_type_label(Some(classification.file_type.clone()))
364 .sha1_git(Some(calculate_sha1_git(&buffer)))
365 .is_binary(Some(classification.is_binary))
366 .is_text(Some(classification.is_text))
367 .is_archive(Some(classification.is_archive))
368 .is_media(Some(classification.is_media))
369 .is_source(Some(classification.is_source))
370 .is_script(Some(classification.is_script))
371 .files_count(Some(0))
372 .dirs_count(Some(0))
373 .size_count(Some(0));
374 }
375
376 if should_skip_text_detection(path, &buffer) {
377 return Ok((is_generated, sha256, classification.is_source));
378 }
379
380 if text_options.detect_packages {
383 let started = Instant::now();
384 let parse_result = try_parse_file(path).or_else(|| {
385 text_options
386 .detect_packages_in_compiled
387 .then(|| try_parse_compiled_bytes(&buffer))
388 .flatten()
389 });
390
391 if let Some(parse_result) = parse_result {
392 let packages = parse_result
393 .packages
394 .into_iter()
395 .filter(|package| {
396 let is_compiled_package = package
397 .datasource_id
398 .as_ref()
399 .is_some_and(is_compiled_datasource);
400 let is_system_package = package
401 .datasource_id
402 .as_ref()
403 .is_some_and(is_system_datasource);
404 if is_compiled_package {
405 text_options.detect_packages_in_compiled
406 } else if is_system_package {
407 text_options.detect_system_packages
408 } else {
409 text_options.detect_application_packages
410 }
411 })
412 .collect();
413 file_info_builder.package_data(packages);
414 scan_errors.extend(parse_result.scan_errors);
415 }
416 progress.record_detail_timing("scan:packages", started.elapsed().as_secs_f64());
417 }
418
419 if is_timeout_exceeded(started, text_options.timeout_seconds) {
420 return Err(Error::msg(format!(
421 "Timeout while extracting package/text metadata (> {:.2}s)",
422 text_options.timeout_seconds
423 )));
424 }
425
426 let (text_content, text_kind) = extract_text_for_detection(path, &buffer);
427 let from_binary_strings = matches!(text_kind, ExtractedTextKind::BinaryStrings);
428
429 if is_timeout_exceeded(started, text_options.timeout_seconds) {
430 return Err(Error::msg(format!(
431 "Timeout while extracting text content (> {:.2}s)",
432 text_options.timeout_seconds
433 )));
434 }
435
436 if text_content.is_empty() {
437 return Ok((is_generated, sha256, classification.is_source));
438 }
439
440 if text_options.detect_copyrights {
441 extract_copyright_information(
442 file_info_builder,
443 path,
444 &text_content,
445 text_options.timeout_seconds,
446 from_binary_strings,
447 );
448 }
449 extract_email_url_information(
450 file_info_builder,
451 &text_content,
452 text_options,
453 from_binary_strings,
454 );
455
456 if is_timeout_exceeded(started, text_options.timeout_seconds) {
457 return Err(Error::msg(format!(
458 "Timeout before license scan (> {:.2}s)",
459 text_options.timeout_seconds
460 )));
461 }
462 let text_content_for_license_detection = if crate::utils::sourcemap::is_sourcemap(path) {
464 if let Some(sourcemap_content) =
465 crate::utils::sourcemap::extract_sourcemap_content(&text_content)
466 {
467 sourcemap_content
468 } else {
469 text_content
470 }
471 } else if should_remove_verbatim_escape_sequences(path, classification.is_source) {
472 remove_verbatim_escape_sequences(&text_content)
473 } else {
474 text_content
475 };
476
477 if license_enabled {
478 let started = Instant::now();
479 extract_license_information(
480 file_info_builder,
481 scan_errors,
482 path,
483 text_content_for_license_detection,
484 license_engine,
485 license_options,
486 from_binary_strings,
487 )?;
488 progress.record_detail_timing("scan:licenses", started.elapsed().as_secs_f64());
489 } else {
490 extract_license_information(
491 file_info_builder,
492 scan_errors,
493 path,
494 text_content_for_license_detection,
495 license_engine,
496 license_options,
497 from_binary_strings,
498 )?;
499 }
500
501 Ok((is_generated, sha256, classification.is_source))
502}
503
/// Reports whether more than `timeout_seconds` have elapsed since `started`.
///
/// A timeout that is zero, negative, NaN or infinite is treated as "no
/// timeout" and never counts as exceeded.
fn is_timeout_exceeded(started: Instant, timeout_seconds: f64) -> bool {
    let timeout_disabled = !timeout_seconds.is_finite() || timeout_seconds <= 0.0;
    if timeout_disabled {
        return false;
    }

    let elapsed_seconds = started.elapsed().as_secs_f64();
    elapsed_seconds > timeout_seconds
}
509
510fn is_system_datasource(datasource_id: &DatasourceId) -> bool {
511 matches!(
512 datasource_id,
513 DatasourceId::AlpineInstalledDb
514 | DatasourceId::DebianDistrolessInstalledDb
515 | DatasourceId::DebianInstalledFilesList
516 | DatasourceId::DebianInstalledMd5Sums
517 | DatasourceId::DebianInstalledStatusDb
518 | DatasourceId::FreebsdCompactManifest
519 | DatasourceId::RpmInstalledDatabaseBdb
520 | DatasourceId::RpmInstalledDatabaseNdb
521 | DatasourceId::RpmInstalledDatabaseSqlite
522 | DatasourceId::RpmYumdb
523 )
524}
525
526fn is_compiled_datasource(datasource_id: &DatasourceId) -> bool {
527 matches!(
528 datasource_id,
529 DatasourceId::GoBinary | DatasourceId::RustBinary
530 )
531}
532
533fn extract_copyright_information(
534 file_info_builder: &mut FileInfoBuilder,
535 path: &Path,
536 text_content: &str,
537 timeout_seconds: f64,
538 from_binary_strings: bool,
539) {
540 if copyright::is_credits_file(path) {
542 let author_detections = copyright::detect_credits_authors(text_content);
543 if !author_detections.is_empty() {
544 file_info_builder.authors(
545 author_detections
546 .into_iter()
547 .map(|a| Author {
548 author: a.author,
549 start_line: a.start_line,
550 end_line: a.end_line,
551 })
552 .collect(),
553 );
554 return;
555 }
556 }
557
558 let copyright_options = CopyrightDetectionOptions {
559 max_runtime: if timeout_seconds.is_finite() && timeout_seconds > 0.0 {
560 Some(Duration::from_secs_f64(timeout_seconds))
561 } else {
562 None
563 },
564 ..CopyrightDetectionOptions::default()
565 };
566
567 let (copyrights, holders, authors) =
568 copyright::detect_copyrights_with_options(text_content, ©right_options);
569 let (copyrights, holders, authors) = if from_binary_strings {
570 prune_binary_string_detections(copyrights, holders, authors)
571 } else {
572 (copyrights, holders, authors)
573 };
574
575 file_info_builder.copyrights(
576 copyrights
577 .into_iter()
578 .map(|c| Copyright {
579 copyright: c.copyright,
580 start_line: c.start_line,
581 end_line: c.end_line,
582 })
583 .collect::<Vec<Copyright>>(),
584 );
585 file_info_builder.holders(
586 holders
587 .into_iter()
588 .map(|h| Holder {
589 holder: h.holder,
590 start_line: h.start_line,
591 end_line: h.end_line,
592 })
593 .collect::<Vec<Holder>>(),
594 );
595 file_info_builder.authors(
596 authors
597 .into_iter()
598 .map(|a| Author {
599 author: a.author,
600 start_line: a.start_line,
601 end_line: a.end_line,
602 })
603 .collect::<Vec<Author>>(),
604 );
605}
606
607fn prune_binary_string_detections(
608 copyrights: Vec<CopyrightDetection>,
609 holders: Vec<HolderDetection>,
610 _authors: Vec<AuthorDetection>,
611) -> (
612 Vec<CopyrightDetection>,
613 Vec<HolderDetection>,
614 Vec<AuthorDetection>,
615) {
616 let kept_copyrights: Vec<CopyrightDetection> = copyrights
617 .into_iter()
618 .filter(|c| is_binary_string_copyright_candidate(&c.copyright))
619 .collect();
620
621 let kept_holders: Vec<HolderDetection> = holders
622 .into_iter()
623 .filter(|holder| {
624 kept_copyrights.iter().any(|copyright| {
625 ranges_overlap(
626 holder.start_line,
627 holder.end_line,
628 copyright.start_line,
629 copyright.end_line,
630 )
631 })
632 })
633 .collect();
634
635 (kept_copyrights, kept_holders, Vec::new())
636}
637
/// Returns `true` when the inclusive ranges `[a_start, a_end]` and
/// `[b_start, b_end]` share at least one position.
fn ranges_overlap(a_start: usize, a_end: usize, b_start: usize, b_end: usize) -> bool {
    let a_starts_after_b = a_start > b_end;
    let b_starts_after_a = b_start > a_end;
    !(a_starts_after_b || b_starts_after_a)
}
641
642fn is_binary_string_copyright_candidate(text: &str) -> bool {
643 if contains_year(text) {
644 return true;
645 }
646
647 let lower = text.to_ascii_lowercase();
648 let tail = if let Some(tail) = lower.strip_prefix("copyright") {
649 tail.trim()
650 } else {
651 lower.trim()
652 };
653
654 if tail.is_empty() || !has_sufficient_alphabetic_content(tail) || has_excessive_at_noise(tail) {
655 return false;
656 }
657
658 let alpha_tokens: Vec<&str> = tail
659 .split_whitespace()
660 .filter(|token| token.chars().any(|c| c.is_alphabetic()))
661 .collect();
662
663 if alpha_tokens.len() <= 1 {
664 return has_explicit_copyright_marker(text)
665 && alpha_tokens.iter().any(|token| {
666 is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric()))
667 });
668 }
669
670 if tail.contains(',') || tail.contains(" and ") || tail.contains('&') {
671 return true;
672 }
673
674 alpha_tokens
675 .iter()
676 .any(|token| is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric())))
677 || alpha_tokens
678 .iter()
679 .filter(|token| token.chars().filter(|c| c.is_alphabetic()).count() >= 3)
680 .count()
681 >= 2
682}
683
/// Returns `true` when `text` contains at least one ASCII alphanumeric
/// character and at least half of its ASCII alphanumerics are letters.
fn has_sufficient_alphabetic_content(text: &str) -> bool {
    let (letters, alphanumerics) =
        text.chars()
            .fold((0usize, 0usize), |(letters, alphanumerics), c| {
                if c.is_ascii_alphabetic() {
                    (letters + 1, alphanumerics + 1)
                } else if c.is_ascii_digit() {
                    (letters, alphanumerics + 1)
                } else {
                    (letters, alphanumerics)
                }
            });

    alphanumerics != 0 && letters * 2 >= alphanumerics
}
693
/// Returns `true` when `text` contains three or more `@` characters.
fn has_excessive_at_noise(text: &str) -> bool {
    const MAX_TOLERATED_AT_SIGNS: usize = 2;
    text.matches('@').count() > MAX_TOLERATED_AT_SIGNS
}
697
/// Returns `true` when `text` contains `(c)`, the `©` sign or `copr`,
/// ignoring ASCII case.
fn has_explicit_copyright_marker(text: &str) -> bool {
    const MARKERS: [&str; 3] = ["(c)", "©", "copr"];

    let lowered = text.to_ascii_lowercase();
    MARKERS.iter().any(|marker| lowered.contains(*marker))
}
702
/// Returns `true` when `text` contains four consecutive ASCII digits of the
/// form `19xx` or `20xx`.
///
/// The first two digits are matched as a pair: checking them independently
/// would also accept `10xx` and `29xx`, which are not plausible copyright
/// years.
fn contains_year(text: &str) -> bool {
    text.as_bytes().windows(4).any(|window| {
        let plausible_century =
            matches!((window[0], window[1]), (b'1', b'9') | (b'2', b'0'));
        plausible_century && window[2].is_ascii_digit() && window[3].is_ascii_digit()
    })
}
711
/// Returns `true` when `token` equals, ignoring ASCII case, one of the known
/// organisation words such as "inc", "ltd" or "foundation".
fn is_company_like_suffix(token: &str) -> bool {
    const COMPANY_WORDS: &[&str] = &[
        "inc",
        "corp",
        "corporation",
        "co",
        "company",
        "ltd",
        "llc",
        "gmbh",
        "foundation",
        "project",
        "systems",
        "software",
        "technologies",
        "technology",
    ];

    COMPANY_WORDS
        .iter()
        .any(|word| token.eq_ignore_ascii_case(word))
}
731
732fn extract_email_url_information(
733 file_info_builder: &mut FileInfoBuilder,
734 text_content: &str,
735 text_options: &TextDetectionOptions,
736 from_binary_strings: bool,
737) {
738 if !text_options.detect_emails && !text_options.detect_urls {
739 return;
740 }
741
742 if text_options.detect_emails {
743 let config = DetectionConfig {
744 max_emails: text_options.max_emails,
745 max_urls: text_options.max_urls,
746 unique: false,
747 };
748 let emails = finder::find_emails(text_content, &config)
749 .into_iter()
750 .filter(|d| !from_binary_strings || is_binary_string_email_candidate(&d.email))
751 .map(|d| OutputEmail {
752 email: d.email,
753 start_line: d.start_line,
754 end_line: d.end_line,
755 })
756 .collect::<Vec<_>>();
757 file_info_builder.emails(emails);
758 }
759
760 if text_options.detect_urls {
761 let config = DetectionConfig {
762 max_emails: text_options.max_emails,
763 max_urls: text_options.max_urls,
764 unique: true,
765 };
766 let urls = finder::find_urls(text_content, &config)
767 .into_iter()
768 .filter(|d| !from_binary_strings || is_binary_string_url_candidate(&d.url))
769 .map(|d| OutputURL {
770 url: d.url,
771 start_line: d.start_line,
772 end_line: d.end_line,
773 })
774 .collect::<Vec<_>>();
775 file_info_builder.urls(urls);
776 }
777}
778
779fn is_binary_string_email_candidate(email: &str) -> bool {
780 let Some((local, domain)) = email.rsplit_once('@') else {
781 return false;
782 };
783
784 if !has_strong_binary_local_part(local) {
785 return false;
786 }
787
788 has_strong_binary_host_shape(domain)
789}
790
791fn is_binary_string_url_candidate(url: &str) -> bool {
792 let parsed = url::Url::parse(url).ok();
793 let Some(parsed) = parsed else {
794 return false;
795 };
796 let Some(host) = parsed.host_str() else {
797 return false;
798 };
799
800 has_strong_binary_host_shape(host) && has_meaningful_binary_url_context(&parsed)
801}
802
803fn has_meaningful_binary_url_context(parsed: &url::Url) -> bool {
804 if parsed.path() != "/"
805 && parsed
806 .path()
807 .split('/')
808 .any(|segment| segment.chars().any(|c| c.is_ascii_alphabetic()) && segment.len() >= 3)
809 {
810 return true;
811 }
812
813 if parsed.query().is_some() || parsed.fragment().is_some() {
814 return true;
815 }
816
817 let Some(host) = parsed.host_str() else {
818 return false;
819 };
820
821 let labels: Vec<&str> = host.split('.').collect();
822 if matches!(labels.first(), Some(&"www")) {
823 return true;
824 }
825
826 labels
827 .iter()
828 .take(labels.len().saturating_sub(1))
829 .any(|label| {
830 label.contains('-') && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 4
831 })
832}
833
/// Returns `true` when `local` contains a run of at least three consecutive
/// ASCII letters.
fn has_strong_binary_local_part(local: &str) -> bool {
    let mut letter_run = 0usize;
    for c in local.chars() {
        if c.is_ascii_alphabetic() {
            letter_run += 1;
            if letter_run >= 3 {
                return true;
            }
        } else {
            letter_run = 0;
        }
    }
    false
}
839
/// Returns `true` when `host` looks like a real domain name.
///
/// After dropping a leading `www` or `ftp` label, at least two labels must
/// remain, and some label other than the last one must be three or more
/// bytes long with at least three ASCII letters.
fn has_strong_binary_host_shape(host: &str) -> bool {
    let labels: Vec<&str> = host.split('.').collect();

    let service_prefix_len =
        usize::from(matches!(labels.first().copied(), Some("www" | "ftp")));
    let relevant = &labels[service_prefix_len..];

    match relevant.split_last() {
        Some((_last_label, leading)) if !leading.is_empty() => leading.iter().any(|label| {
            let letters = label.chars().filter(|c| c.is_ascii_alphabetic()).count();
            label.len() >= 3 && letters >= 3
        }),
        _ => false,
    }
}
860
861fn extract_license_information(
862 file_info_builder: &mut FileInfoBuilder,
863 scan_errors: &mut Vec<String>,
864 path: &Path,
865 text_content: String,
866 license_engine: Option<Arc<LicenseDetectionEngine>>,
867 license_options: LicenseScanOptions,
868 from_binary_strings: bool,
869) -> Result<(), Error> {
870 let Some(engine) = license_engine else {
871 return Ok(());
872 };
873
874 let detection_result = if license_options.min_score == 0 {
875 engine.detect_with_kind_and_source(
876 &text_content,
877 license_options.unknown_licenses,
878 from_binary_strings,
879 &path.to_string_lossy(),
880 )
881 } else {
882 engine.detect_with_kind_and_source_with_score(
883 &text_content,
884 license_options.unknown_licenses,
885 from_binary_strings,
886 &path.to_string_lossy(),
887 license_options.min_score as f32,
888 )
889 };
890
891 match detection_result {
892 Ok(detections) => {
893 let query =
894 Query::from_extracted_text(&text_content, engine.index(), from_binary_strings).ok();
895 let mut model_detections = Vec::new();
896 let mut model_clues = Vec::new();
897
898 for detection in &detections {
899 let (public_detection, clue_matches) = convert_detection_to_model(
900 detection,
901 license_options,
902 &text_content,
903 query.as_ref(),
904 );
905
906 if let Some(public_detection) = public_detection {
907 model_detections.push(public_detection);
908 }
909
910 model_clues.extend(clue_matches);
911 }
912
913 if !model_detections.is_empty() {
914 let expressions: Vec<String> = model_detections
915 .iter()
916 .filter(|d| !d.license_expression_spdx.is_empty())
917 .map(|d| d.license_expression_spdx.clone())
918 .collect();
919
920 if !expressions.is_empty() {
921 let combined = crate::utils::spdx::combine_license_expressions(expressions);
922 if let Some(expr) = combined {
923 file_info_builder.license_expression(Some(expr));
924 }
925 }
926 }
927
928 file_info_builder.license_detections(model_detections);
929 file_info_builder.license_clues(model_clues);
930 file_info_builder.percentage_of_license_text(
931 query
932 .as_ref()
933 .map(|query| compute_percentage_of_license_text(query, &detections)),
934 );
935 }
936 Err(e) => {
937 scan_errors.push(format!("License detection failed: {}", e));
938 }
939 }
940
941 Ok(())
942}
943
944fn convert_detection_to_model(
945 detection: &crate::license_detection::LicenseDetection,
946 license_options: LicenseScanOptions,
947 text_content: &str,
948 query: Option<&Query<'_>>,
949) -> (Option<LicenseDetection>, Vec<Match>) {
950 let matches: Vec<Match> = detection
951 .matches
952 .iter()
953 .map(|m| convert_match_to_model(m, license_options, text_content, query))
954 .collect();
955
956 if let Some(license_expression) = detection.license_expression.clone() {
957 (
958 Some(LicenseDetection {
959 license_expression,
960 license_expression_spdx: detection
961 .license_expression_spdx
962 .clone()
963 .unwrap_or_default(),
964 matches,
965 detection_log: if license_options.include_diagnostics {
966 detection.detection_log.clone()
967 } else {
968 Vec::new()
969 },
970 identifier: detection.identifier.clone(),
971 }),
972 Vec::new(),
973 )
974 } else {
975 (None, matches)
976 }
977}
978
979fn convert_match_to_model(
980 m: &crate::license_detection::models::LicenseMatch,
981 license_options: LicenseScanOptions,
982 text_content: &str,
983 query: Option<&Query<'_>>,
984) -> Match {
985 let output_metric = |value: f32| ((value as f64) * 100.0).round() / 100.0;
986 let rule_url = if m.rule_url.is_empty() {
987 None
988 } else {
989 Some(m.rule_url.clone())
990 };
991 let matched_text = if license_options.include_text {
992 m.matched_text.clone().or_else(|| {
993 Some(crate::license_detection::query::matched_text_from_text(
994 text_content,
995 m.start_line,
996 m.end_line,
997 ))
998 })
999 } else {
1000 None
1001 };
1002 let matched_text_diagnostics = if license_options.include_text_diagnostics {
1003 query.map(|query| matched_text_diagnostics_from_match(query, m))
1004 } else {
1005 None
1006 };
1007 Match {
1008 license_expression: m.license_expression.clone(),
1009 license_expression_spdx: m.license_expression_spdx.clone().unwrap_or_default(),
1010 from_file: m.from_file.clone(),
1011 start_line: m.start_line,
1012 end_line: m.end_line,
1013 matcher: Some(m.matcher.to_string()),
1014 score: output_metric(m.score),
1015 matched_length: Some(m.matched_length),
1016 match_coverage: Some(output_metric(m.coverage())),
1017 rule_relevance: Some(m.rule_relevance as usize),
1018 rule_identifier: Some(m.rule_identifier.clone()),
1019 rule_url,
1020 matched_text,
1021 referenced_filenames: m.referenced_filenames.clone(),
1022 matched_text_diagnostics,
1023 }
1024}
1025
1026fn compute_percentage_of_license_text(
1027 query: &Query<'_>,
1028 detections: &[crate::license_detection::LicenseDetection],
1029) -> f64 {
1030 let matched_positions: std::collections::HashSet<usize> = detections
1031 .iter()
1032 .flat_map(|detection| detection.matches.iter())
1033 .flat_map(|m| m.query_span().iter())
1034 .collect();
1035
1036 let query_tokens_length = query.tokens.len() + query.unknowns_by_pos.values().sum::<usize>();
1037 if query_tokens_length == 0 {
1038 return 0.0;
1039 }
1040
1041 let percentage = (matched_positions.len() as f64 / query_tokens_length as f64) * 100.0;
1042 (percentage * 100.0).round() / 100.0
1043}
1044
1045fn matched_text_diagnostics_from_match(
1046 query: &Query<'_>,
1047 license_match: &InternalLicenseMatch,
1048) -> String {
1049 let matched_positions: PositionSet = license_match.query_span().iter().collect();
1050 let Some(start_pos) = matched_positions.iter().min() else {
1051 return crate::license_detection::query::matched_text_from_text(
1052 &query.text,
1053 license_match.start_line,
1054 license_match.end_line,
1055 );
1056 };
1057 let Some(end_pos) = matched_positions.iter().max() else {
1058 return crate::license_detection::query::matched_text_from_text(
1059 &query.text,
1060 license_match.start_line,
1061 license_match.end_line,
1062 );
1063 };
1064
1065 crate::license_detection::query::matched_text_diagnostics_from_text(
1066 &query.text,
1067 query,
1068 &matched_positions,
1069 start_pos,
1070 end_pos,
1071 license_match.start_line,
1072 license_match.end_line,
1073 )
1074}
1075
/// Returns `true` when text-based detection should be skipped for this file.
///
/// Currently the only such case is a PEM certificate file.
fn should_skip_text_detection(path: &Path, buffer: &[u8]) -> bool {
    is_pem_certificate_file(path, buffer)
}
1079
/// Reports whether a Go source file is test-only code.
///
/// Returns `Ok(false)` for paths without a `.go` extension and `Ok(true)`
/// for `*_test.go` files, in both cases without reading the file. Otherwise
/// the first ten lines are inspected for a `//go:build` or `// +build` line
/// that has `test` as one of its whitespace-separated tokens.
///
/// Only the leading lines are read, so the size and encoding of the rest of
/// the file do not affect the result.
///
/// # Errors
/// Returns an error when the file cannot be opened or when one of the
/// inspected lines cannot be read as UTF-8.
fn is_go_non_production_source(path: &Path) -> std::io::Result<bool> {
    if path.extension().and_then(|ext| ext.to_str()) != Some("go") {
        return Ok(false);
    }

    if path
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| name.ends_with("_test.go"))
    {
        return Ok(true);
    }

    let reader = std::io::BufReader::new(File::open(path)?);
    for line in std::io::BufRead::lines(reader).take(10) {
        let line = line?;
        let trimmed = line.trim();
        let is_build_constraint =
            trimmed.starts_with("//go:build") || trimmed.starts_with("// +build");
        if is_build_constraint && trimmed.split_whitespace().any(|token| token == "test") {
            return Ok(true);
        }
    }

    Ok(false)
}
1100
1101fn is_pem_certificate_file(_path: &Path, buffer: &[u8]) -> bool {
1102 let prefix_len = buffer.len().min(8192);
1103 let prefix = String::from_utf8_lossy(&buffer[..prefix_len]);
1104 let trimmed_lines: Vec<&str> = prefix
1105 .lines()
1106 .map(str::trim)
1107 .filter(|line| !line.is_empty())
1108 .take(64)
1109 .collect();
1110
1111 let Some(first_line) = trimmed_lines.first().copied() else {
1112 return false;
1113 };
1114
1115 PEM_CERTIFICATE_HEADERS
1116 .iter()
1117 .any(|(begin, end)| first_line == *begin && trimmed_lines.iter().any(|line| line == end))
1118}
1119
1120fn process_directory(
1121 path: &Path,
1122 _metadata: &fs::Metadata,
1123 collect_info: bool,
1124 license_enabled: bool,
1125) -> FileInfo {
1126 let name = path
1127 .file_name()
1128 .unwrap_or_default()
1129 .to_string_lossy()
1130 .to_string();
1131 let base_name = name.clone(); FileInfo {
1134 name,
1135 base_name,
1136 extension: "".to_string(),
1137 path: path.to_string_lossy().to_string(),
1138 file_type: FileType::Directory,
1139 mime_type: None,
1140 file_type_label: None,
1141 size: 0,
1142 date: None,
1143 sha1: None,
1144 md5: None,
1145 sha256: None,
1146 sha1_git: None,
1147 programming_language: None,
1148 package_data: Vec::new(),
1149 license_expression: None,
1150 license_detections: Vec::new(),
1151 license_clues: Vec::new(),
1152 percentage_of_license_text: license_enabled.then_some(0.0),
1153 copyrights: Vec::new(),
1154 holders: Vec::new(),
1155 authors: Vec::new(),
1156 emails: Vec::new(),
1157 urls: Vec::new(),
1158 for_packages: Vec::new(),
1159 scan_errors: Vec::new(),
1160 license_policy: None,
1161 is_binary: collect_info.then_some(false),
1162 is_text: collect_info.then_some(false),
1163 is_archive: collect_info.then_some(false),
1164 is_media: collect_info.then_some(false),
1165 is_source: collect_info.then_some(false),
1166 is_script: collect_info.then_some(false),
1167 files_count: collect_info.then_some(0),
1168 dirs_count: collect_info.then_some(0),
1169 size_count: collect_info.then_some(0),
1170 source_count: None,
1171 is_legal: false,
1172 is_manifest: false,
1173 is_readme: false,
1174 is_top_level: false,
1175 is_key_file: false,
1176 is_community: false,
1177 is_generated: None,
1178 facets: vec![],
1179 tallies: None,
1180 }
1181}
1182
#[cfg(test)]
mod tests {
    use super::{
        compute_percentage_of_license_text, convert_detection_to_model,
        extract_email_url_information, is_binary_string_copyright_candidate,
        is_binary_string_email_candidate, is_binary_string_url_candidate,
        is_go_non_production_source,
    };
    use crate::license_detection::LicenseDetection as InternalLicenseDetection;
    use crate::license_detection::index::LicenseIndex;
    use crate::license_detection::index::dictionary::TokenDictionary;
    use crate::license_detection::models::position_span::PositionSpan;
    use crate::license_detection::models::{LicenseMatch, MatchCoordinates, MatcherKind, RuleKind};
    use crate::license_detection::query::Query;
    use crate::models::{FileInfoBuilder, FileType};
    use crate::scanner::scan_options_fingerprint;
    use crate::scanner::{LicenseScanOptions, TextDetectionOptions};
    use std::fs;
    use tempfile::tempdir;

    /// Creates a single-line MIT hash match carrying the given `rule_url`.
    fn make_internal_match(rule_url: &str) -> LicenseMatch {
        LicenseMatch {
            rid: 0,
            license_expression: String::from("mit"),
            license_expression_spdx: Some(String::from("MIT")),
            from_file: None,
            start_line: 1,
            end_line: 1,
            start_token: 0,
            end_token: 1,
            matcher: MatcherKind::Hash,
            score: 1.0,
            matched_length: 3,
            rule_length: 3,
            match_coverage: 100.0,
            rule_relevance: 100,
            rule_identifier: String::from("mit.LICENSE"),
            rule_url: rule_url.to_owned(),
            matched_text: Some(String::from("MIT")),
            referenced_filenames: None,
            rule_kind: RuleKind::Text,
            is_from_license: true,
            rule_start_token: 0,
            coordinates: MatchCoordinates::query_region(PositionSpan::empty()),
            candidate_resemblance: 0.0,
            candidate_containment: 0.0,
        }
    }

    /// Creates an MIT detection wrapping one match from `make_internal_match`.
    fn make_detection(rule_url: &str) -> InternalLicenseDetection {
        InternalLicenseDetection {
            license_expression: Some(String::from("mit")),
            license_expression_spdx: Some(String::from("MIT")),
            matches: vec![make_internal_match(rule_url)],
            detection_log: Vec::new(),
            identifier: Some(String::from("mit-test")),
            file_regions: Vec::new(),
        }
    }

    /// Creates a license index over `entries` with `len_legalese` set explicitly.
    fn create_test_index(entries: &[(&str, u16)], len_legalese: usize) -> LicenseIndex {
        let mut index = LicenseIndex::new(TokenDictionary::new_with_legalese(entries));
        index.len_legalese = len_legalese;
        index
    }

    /// Text-detection options with only email and URL detection switched on.
    fn email_and_url_only_options() -> TextDetectionOptions {
        TextDetectionOptions {
            collect_info: false,
            detect_packages: false,
            detect_application_packages: false,
            detect_system_packages: false,
            detect_packages_in_compiled: false,
            detect_copyrights: false,
            detect_generated: false,
            detect_emails: true,
            detect_urls: true,
            max_emails: 50,
            max_urls: 50,
            timeout_seconds: 120.0,
        }
    }

    /// Runs email/URL extraction over `text` and finishes the builder as a
    /// one-byte `binary.bin` file.
    ///
    /// NOTE(review): the trailing `true` presumably marks `text` as coming
    /// from binary strings - confirm against `extract_email_url_information`.
    fn scan_binary_strings(text: &str) -> crate::models::FileInfo {
        let mut builder = FileInfoBuilder::default();
        extract_email_url_information(&mut builder, text, &email_and_url_only_options(), true);

        builder
            .name("binary.bin".to_string())
            .base_name("binary".to_string())
            .extension(".bin".to_string())
            .path("binary.bin".to_string())
            .file_type(FileType::File)
            .size(1)
            .build()
            .expect("builder should produce file info")
    }

    /// Writes `contents` to `file_name` inside a fresh temporary directory and
    /// returns what `is_go_non_production_source` reports for that file.
    fn go_source_is_non_production(file_name: &str, contents: &str) -> bool {
        let temp_dir = tempdir().unwrap();
        let path = temp_dir.path().join(file_name);
        fs::write(&path, contents).unwrap();

        is_go_non_production_source(&path).unwrap()
    }

    /// A non-empty rule URL is carried over to the converted match.
    #[test]
    fn test_convert_detection_to_model_preserves_rule_url() {
        const MIT_RULE_URL: &str = "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE";
        let detection = make_detection(MIT_RULE_URL);

        let (model, license_clues) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let model = model.expect("detection should convert");

        assert_eq!(model.matches[0].rule_url.as_deref(), Some(MIT_RULE_URL));
        assert!(license_clues.is_empty());
    }

    /// An empty rule URL is converted to `None`.
    #[test]
    fn test_convert_detection_to_model_emits_null_for_empty_rule_url() {
        let detection = make_detection("");

        let (model, license_clues) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let model = model.expect("detection should convert");

        assert_eq!(model.matches[0].rule_url, None);
        assert!(license_clues.is_empty());
    }

    /// Match coverage is rounded to two decimals while the score is kept.
    #[test]
    fn test_convert_detection_to_model_rounds_match_coverage() {
        let mut detection = make_detection("");
        {
            let first_match = &mut detection.matches[0];
            first_match.score = 81.82;
            first_match.match_coverage = 33.334;
        }

        let (model, license_clues) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let model = model.expect("detection should convert");

        assert_eq!(model.matches[0].score, 81.82);
        assert_eq!(model.matches[0].match_coverage, Some(33.33));
        assert!(license_clues.is_empty());
    }

    /// A detection without a license expression yields no detection model and
    /// surfaces its match as a license clue instead.
    #[test]
    fn test_convert_detection_to_model_routes_expressionless_detection_to_license_clues() {
        let mut detection = make_detection(
            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/license-clue_1.RULE",
        );
        detection.license_expression = None;
        detection.license_expression_spdx = None;
        detection.identifier = None;

        let clue_match = &mut detection.matches[0];
        clue_match.license_expression = "unknown-license-reference".to_string();
        clue_match.license_expression_spdx =
            Some("LicenseRef-scancode-unknown-license-reference".to_string());
        clue_match.rule_identifier = "license-clue_1.RULE".to_string();
        clue_match.rule_kind = RuleKind::Clue;

        let options = LicenseScanOptions {
            include_text: true,
            min_score: 0,
            ..LicenseScanOptions::default()
        };
        let (model, license_clues) =
            convert_detection_to_model(&detection, options, "clue text", None);

        assert!(model.is_none());
        assert_eq!(license_clues.len(), 1);

        let clue = &license_clues[0];
        assert_eq!(clue.license_expression, "unknown-license-reference");
        assert_eq!(
            clue.license_expression_spdx,
            "LicenseRef-scancode-unknown-license-reference"
        );
        assert_eq!(clue.rule_identifier.as_deref(), Some("license-clue_1.RULE"));
        assert_eq!(clue.matched_text.as_deref(), Some("MIT"));
        assert_eq!(clue.matched_text_diagnostics, None);
    }

    /// With text and diagnostics enabled, the converted match carries the
    /// matched text plus a bracketed diagnostics rendering that differs from it.
    #[test]
    fn test_convert_detection_to_model_includes_diagnostics_when_enabled() {
        let text = concat!(
            "Reproduction and distribution of this file, with or without modification, are\n",
            "permitted in any medium without royalties provided the copyright notice\n",
            "and this notice are preserved. This file is offered as-is, without any warranties.\n",
        );
        let legalese: [(&str, u16); 14] = [
            ("reproduction", 0),
            ("distribution", 1),
            ("file", 2),
            ("without", 3),
            ("modification", 4),
            ("permitted", 5),
            ("medium", 6),
            ("royalties", 7),
            ("provided", 8),
            ("copyright", 9),
            ("notice", 10),
            ("preserved", 11),
            ("offered", 12),
            ("warranties", 13),
        ];
        let index = create_test_index(&legalese, 14);
        let query = Query::from_extracted_text(text, &index, false).expect("query should build");
        let token_count = query.tokens.len();

        let mut detection = make_detection(
            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/fsf-ap.LICENSE",
        );
        detection.detection_log = vec!["imperfect-match-coverage".to_string()];
        detection.identifier = Some("fsf_ap-test".to_string());

        let fsf_match = &mut detection.matches[0];
        fsf_match.license_expression = "fsf-ap".to_string();
        fsf_match.license_expression_spdx = Some("FSFAP".to_string());
        fsf_match.rule_identifier = "fsf-ap.LICENSE".to_string();
        fsf_match.matched_text = None;
        fsf_match.start_line = 1;
        fsf_match.end_line = 3;
        fsf_match.start_token = 0;
        fsf_match.end_token = token_count;
        // Every query position except index 9 is marked as matched.
        fsf_match.coordinates = MatchCoordinates::query_region(PositionSpan::from_positions(
            (0..token_count)
                .filter(|&position| position != 9)
                .collect::<Vec<_>>(),
        ));

        let options = LicenseScanOptions {
            include_text: true,
            include_text_diagnostics: true,
            include_diagnostics: true,
            unknown_licenses: false,
            min_score: 0,
        };
        let (model, license_clues) =
            convert_detection_to_model(&detection, options, text, Some(&query));
        let model = model.expect("detection should convert");

        assert!(license_clues.is_empty());
        assert_eq!(model.detection_log, vec!["imperfect-match-coverage"]);
        assert_eq!(
            model.matches[0].matched_text.as_deref(),
            Some(text.trim_end())
        );

        let diagnostics = model.matches[0]
            .matched_text_diagnostics
            .as_deref()
            .expect("diagnostics should be present");
        assert!(diagnostics.contains('['));
        assert!(diagnostics.contains(']'));
        assert_ne!(diagnostics, text.trim_end());
    }

    /// Implausible contacts are dropped: neither the email nor the URL in the
    /// sample text is recorded.
    #[test]
    fn test_extract_email_url_information_skips_binary_string_text() {
        let file = scan_binary_strings("contact 6h@fo.lwft and visit http://gmail.com/");

        assert!(file.emails.is_empty(), "emails: {:?}", file.emails);
        assert!(file.urls.is_empty(), "urls: {:?}", file.urls);
    }

    /// Plausible contacts are kept: the sample's email and URL are both recorded.
    #[test]
    fn test_extract_email_url_information_keeps_good_binary_contacts() {
        let file = scan_binary_strings(
            "report bugs to bug-coreutils@gnu.org and see https://www.gnu.org/software/coreutils/",
        );

        assert_eq!(file.emails.len(), 1, "emails: {:?}", file.emails);
        assert_eq!(file.emails[0].email, "bug-coreutils@gnu.org");
        assert_eq!(file.urls.len(), 1, "urls: {:?}", file.urls);
        assert_eq!(file.urls[0].url, "https://www.gnu.org/software/coreutils/");
    }

    /// A `(c)` marker followed by gibberish is not a copyright candidate.
    #[test]
    fn test_binary_string_copyright_candidate_rejects_gibberish_holder_text() {
        let noise = "(c) S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9) M0@9s J'@y DH@9Ih@y";
        assert!(!is_binary_string_copyright_candidate(noise));
    }

    /// A readable copyright notice is accepted as a candidate.
    #[test]
    fn test_binary_string_copyright_candidate_keeps_real_notice() {
        let real_notice = "Copyright nexB and others (c) 2012";
        assert!(is_binary_string_copyright_candidate(real_notice));
    }

    /// A gibberish address is not an email candidate.
    #[test]
    fn test_binary_string_email_candidate_rejects_gibberish() {
        assert!(!is_binary_string_email_candidate("6h@fo.lwft"));
    }

    /// The GNU coreutils bug address is accepted as an email candidate.
    #[test]
    fn test_binary_string_email_candidate_keeps_gnu_bug_address() {
        assert!(is_binary_string_email_candidate("bug-coreutils@gnu.org"));
    }

    /// `http://ftp.so/` is not a URL candidate.
    #[test]
    fn test_binary_string_url_candidate_rejects_short_fake_host() {
        assert!(!is_binary_string_url_candidate("http://ftp.so/"));
    }

    /// The GNU coreutils page is accepted as a URL candidate.
    #[test]
    fn test_binary_string_url_candidate_keeps_gnu_help_url() {
        let gnu_help_url = "https://www.gnu.org/software/coreutils/";
        assert!(is_binary_string_url_candidate(gnu_help_url));
    }

    /// `http://gmail.com/` is not a URL candidate.
    #[test]
    fn test_binary_string_url_candidate_rejects_bare_root_domain() {
        assert!(!is_binary_string_url_candidate("http://gmail.com/"));
    }

    /// One matched position in the three-word text "alpha MIT omega", where
    /// "omega" is absent from the index dictionary, yields 33.33 percent.
    #[test]
    fn test_compute_percentage_of_license_text_counts_unknown_tokens() {
        let index = create_test_index(&[("alpha", 0), ("mit", 1)], 2);
        let query = Query::from_extracted_text("alpha MIT omega", &index, false)
            .expect("query should build");

        let mut detection = make_detection("");
        {
            let only_match = &mut detection.matches[0];
            only_match.coordinates =
                MatchCoordinates::query_region(PositionSpan::from_positions(vec![1]));
            only_match.start_token = 1;
            only_match.end_token = 2;
        }

        let percentage = compute_percentage_of_license_text(&query, &[detection]);

        assert_eq!(percentage, 33.33);
    }

    /// Changing only `min_score` changes the scan-options fingerprint.
    #[test]
    fn test_scan_options_fingerprint_changes_with_license_score() {
        let text_options = TextDetectionOptions::default();
        let fingerprint_for = |min_score| {
            scan_options_fingerprint(
                &text_options,
                LicenseScanOptions {
                    min_score,
                    ..LicenseScanOptions::default()
                },
                None,
            )
        };

        let default_fingerprint = fingerprint_for(0);
        let filtered_fingerprint = fingerprint_for(70);

        assert_ne!(default_fingerprint, filtered_fingerprint);
    }

    /// A `*_test.go` file name is enough to classify the file as non-production.
    #[test]
    fn test_is_go_non_production_source_for_test_filename() {
        assert!(go_source_is_non_production(
            "scanner_test.go",
            "package scanner\n"
        ));
    }

    /// A `//go:build test` constraint classifies the file as non-production.
    #[test]
    fn test_is_go_non_production_source_for_build_tag() {
        assert!(go_source_is_non_production(
            "scanner.go",
            "//go:build test\n\npackage scanner\n"
        ));
    }

    /// A plain Go file with no test suffix or build tag is production source.
    #[test]
    fn test_is_go_non_production_source_for_regular_go_file() {
        assert!(!go_source_is_non_production(
            "scanner.go",
            "package scanner\n"
        ));
    }
}