1use crate::license_detection::LicenseDetectionEngine;
2use crate::parsers::{try_parse_compiled_bytes, try_parse_file};
3use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha1_git, calculate_sha256};
4use crate::utils::text::{
5 remove_verbatim_escape_sequences, should_remove_verbatim_escape_sequences,
6};
7use anyhow::Error;
8use rayon::prelude::*;
9use std::fs::{self, File};
10use std::io::{Read, Write};
11use std::path::Path;
12use std::sync::Arc;
13use std::time::{Duration, Instant};
14
15use crate::cache::{CachedScanFindings, read_cached_findings, write_cached_findings};
16use crate::copyright::{
17 self, AuthorDetection, CopyrightDetection, CopyrightDetectionOptions, HolderDetection,
18};
19use crate::finder::{self, DetectionConfig};
20use crate::license_detection::models::LicenseMatch as InternalLicenseMatch;
21use crate::license_detection::query::Query;
22use crate::models::{
23 Author, Copyright, DatasourceId, FileInfo, FileInfoBuilder, FileType, Holder, LicenseDetection,
24 Match, OutputEmail, OutputURL,
25};
26use crate::progress::ScanProgress;
27use crate::scanner::collect::CollectedPaths;
28use crate::scanner::{LicenseScanOptions, ProcessResult, TextDetectionOptions};
29use crate::utils::file::{
30 ExtractedTextKind, classify_file_info, extract_text_for_detection, get_creation_date,
31};
32use crate::utils::generated::generated_code_hints_from_bytes;
33use tempfile::TempDir;
34
/// Matching `BEGIN`/`END` marker lines that identify a PEM certificate block.
///
/// Each entry is a `(begin, end)` pair; both lines of a pair must be present
/// for a buffer to be treated as a certificate.
const PEM_CERTIFICATE_HEADERS: &[(&str, &str)] = &[
    // Plain X.509 certificate.
    ("-----BEGIN CERTIFICATE-----", "-----END CERTIFICATE-----"),
    // Certificate using the "TRUSTED CERTIFICATE" marker variant.
    (
        "-----BEGIN TRUSTED CERTIFICATE-----",
        "-----END TRUSTED CERTIFICATE-----",
    ),
];
42
43pub fn process_collected(
44 collected: &CollectedPaths,
45 progress: Arc<ScanProgress>,
46 license_engine: Option<Arc<LicenseDetectionEngine>>,
47 license_options: LicenseScanOptions,
48 text_options: &TextDetectionOptions,
49) -> ProcessResult {
50 let mut all_files: Vec<FileInfo> = collected
51 .files
52 .par_iter()
53 .map(|(path, metadata)| {
54 let file_entry = process_file(
55 path,
56 metadata,
57 license_engine.clone(),
58 license_options,
59 text_options,
60 );
61 progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
62 file_entry
63 })
64 .collect();
65
66 for (path, metadata) in &collected.directories {
67 all_files.push(process_directory(
68 path,
69 metadata,
70 text_options.collect_info,
71 license_engine.is_some(),
72 ));
73 }
74
75 ProcessResult {
76 files: all_files,
77 excluded_count: collected.excluded_count,
78 }
79}
80
81pub fn process_collected_with_memory_limit(
82 collected: &CollectedPaths,
83 progress: Arc<ScanProgress>,
84 license_engine: Option<Arc<LicenseDetectionEngine>>,
85 license_options: LicenseScanOptions,
86 text_options: &TextDetectionOptions,
87 max_in_memory: i64,
88) -> ProcessResult {
89 if max_in_memory == 0 {
90 return process_collected(
91 collected,
92 progress,
93 license_engine,
94 license_options,
95 text_options,
96 );
97 }
98
99 let memory_limit = if max_in_memory < 0 {
100 0
101 } else {
102 max_in_memory as usize
103 };
104 let chunk_size = if max_in_memory < 0 {
105 256
106 } else {
107 memory_limit.max(1)
108 };
109
110 let mut retained_files = Vec::new();
111 let mut spill_store = None;
112
113 for chunk in collected.files.chunks(chunk_size) {
114 let processed_chunk: Vec<FileInfo> = chunk
115 .par_iter()
116 .map(|(path, metadata)| {
117 let file_entry = process_file(
118 path,
119 metadata,
120 license_engine.clone(),
121 license_options,
122 text_options,
123 );
124 progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
125 file_entry
126 })
127 .collect();
128
129 retain_or_spill_chunk(
130 processed_chunk,
131 &mut retained_files,
132 &mut spill_store,
133 memory_limit,
134 );
135 }
136
137 for (path, metadata) in &collected.directories {
138 let entry = process_directory(
139 path,
140 metadata,
141 text_options.collect_info,
142 license_engine.is_some(),
143 );
144 retain_or_spill_chunk(
145 vec![entry],
146 &mut retained_files,
147 &mut spill_store,
148 memory_limit,
149 );
150 }
151
152 if let Some(spill_store) = spill_store {
153 retained_files.extend(spill_store.load_all());
154 }
155
156 ProcessResult {
157 files: retained_files,
158 excluded_count: collected.excluded_count,
159 }
160}
161
162fn retain_or_spill_chunk(
163 chunk: Vec<FileInfo>,
164 retained_files: &mut Vec<FileInfo>,
165 spill_store: &mut Option<FileInfoSpillStore>,
166 memory_limit: usize,
167) {
168 if memory_limit == 0 {
169 spill_store
170 .get_or_insert_with(FileInfoSpillStore::new)
171 .spill(chunk);
172 return;
173 }
174
175 let remaining_capacity = memory_limit.saturating_sub(retained_files.len());
176 if remaining_capacity >= chunk.len() && spill_store.is_none() {
177 retained_files.extend(chunk);
178 return;
179 }
180
181 let mut chunk_iter = chunk.into_iter();
182 retained_files.extend(chunk_iter.by_ref().take(remaining_capacity));
183 let overflow: Vec<FileInfo> = chunk_iter.collect();
184 if !overflow.is_empty() {
185 spill_store
186 .get_or_insert_with(FileInfoSpillStore::new)
187 .spill(overflow);
188 }
189}
190
191struct FileInfoSpillStore {
192 temp_dir: TempDir,
193 batch_index: usize,
194}
195
196impl FileInfoSpillStore {
197 fn new() -> Self {
198 Self {
199 temp_dir: TempDir::new().expect("create spill dir"),
200 batch_index: 0,
201 }
202 }
203
204 fn spill(&mut self, files: Vec<FileInfo>) {
205 let path = self
206 .temp_dir
207 .path()
208 .join(format!("batch-{:06}.json.zst", self.batch_index));
209 self.batch_index += 1;
210
211 let payload = serde_json::to_vec(&files).expect("encode spilled file batch");
212 let file = File::create(path).expect("create spill batch file");
213 let mut encoder = zstd::Encoder::new(file, 3).expect("create spill encoder");
214 encoder
215 .write_all(&payload)
216 .expect("write spilled file batch");
217 encoder.finish().expect("finish spill encoder");
218 }
219
220 fn load_all(self) -> Vec<FileInfo> {
221 let mut paths: Vec<_> = fs::read_dir(self.temp_dir.path())
222 .expect("read spill dir")
223 .filter_map(Result::ok)
224 .map(|entry| entry.path())
225 .collect();
226 paths.sort();
227
228 let mut files = Vec::new();
229 for path in paths {
230 let file = File::open(path).expect("open spill batch");
231 let mut decoder = zstd::Decoder::new(file).expect("create spill decoder");
232 let mut payload = Vec::new();
233 decoder.read_to_end(&mut payload).expect("read spill batch");
234 let mut batch: Vec<FileInfo> =
235 serde_json::from_slice(&payload).expect("decode spilled file batch");
236 files.append(&mut batch);
237 }
238 files
239 }
240}
241
242fn process_file(
243 path: &Path,
244 metadata: &fs::Metadata,
245 license_engine: Option<Arc<LicenseDetectionEngine>>,
246 license_options: LicenseScanOptions,
247 text_options: &TextDetectionOptions,
248) -> FileInfo {
249 let mut scan_errors: Vec<String> = vec![];
250 let mut file_info_builder = FileInfoBuilder::default();
251 let license_enabled = license_engine.is_some();
252
253 let started = Instant::now();
254
255 let mut generated_flag = None;
256 let mut is_source_file = false;
257 let mut cache_key_sha256 = None;
258 match extract_information_from_content(
259 &mut file_info_builder,
260 &mut scan_errors,
261 path,
262 license_engine,
263 license_options,
264 text_options,
265 ) {
266 Ok((is_generated, sha256, is_source)) => {
267 generated_flag = is_generated;
268 cache_key_sha256 = Some(sha256);
269 is_source_file = is_source;
270 }
271 Err(e) => scan_errors.push(e.to_string()),
272 };
273
274 if is_timeout_exceeded(started, text_options.timeout_seconds) {
275 scan_errors.push(format!(
276 "Processing interrupted due to timeout after {:.2} seconds",
277 text_options.timeout_seconds
278 ));
279 }
280
281 let mut file_info = file_info_builder
282 .name(path.file_name().unwrap().to_string_lossy().to_string())
283 .base_name(
284 path.file_stem()
285 .unwrap_or_default()
286 .to_string_lossy()
287 .to_string(),
288 )
289 .extension(
290 path.extension()
291 .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
292 )
293 .path(path.to_string_lossy().to_string())
294 .file_type(FileType::File)
295 .size(metadata.len())
296 .date(
297 text_options
298 .collect_info
299 .then(|| get_creation_date(metadata))
300 .flatten(),
301 )
302 .scan_errors(scan_errors)
303 .build()
304 .expect("FileInformationBuild not completely initialized");
305
306 if text_options.collect_info {
307 file_info.is_source = Some(is_source_file);
308 }
309
310 if file_info.programming_language.as_deref() == Some("Go")
311 && is_go_non_production_source(path).unwrap_or(false)
312 {
313 file_info.is_source = Some(false);
314 }
315
316 if text_options.detect_generated {
317 file_info.is_generated = Some(generated_flag.unwrap_or(false));
318 }
319
320 if file_info.percentage_of_license_text.is_none() && license_enabled {
321 file_info.percentage_of_license_text = Some(0.0);
322 }
323
324 if let (Some(scan_results_dir), Some(sha256)) = (
325 text_options.scan_cache_dir.as_deref(),
326 cache_key_sha256.as_deref(),
327 ) && file_info.scan_errors.is_empty()
328 {
329 let findings = CachedScanFindings::from_file_info(&file_info);
330 let options_fingerprint =
331 scan_cache_fingerprint(text_options, license_options, license_enabled);
332 if let Err(err) =
333 write_cached_findings(scan_results_dir, sha256, &options_fingerprint, &findings)
334 {
335 file_info
336 .scan_errors
337 .push(format!("Failed to write scan cache entry: {err}"));
338 }
339 }
340
341 file_info
342}
343
344fn extract_information_from_content(
345 file_info_builder: &mut FileInfoBuilder,
346 scan_errors: &mut Vec<String>,
347 path: &Path,
348 license_engine: Option<Arc<LicenseDetectionEngine>>,
349 license_options: LicenseScanOptions,
350 text_options: &TextDetectionOptions,
351) -> Result<(Option<bool>, String, bool), Error> {
352 let started = Instant::now();
353 let buffer = fs::read(path)?;
354 let license_enabled = license_engine.is_some();
355
356 if is_timeout_exceeded(started, text_options.timeout_seconds) {
357 return Err(Error::msg(format!(
358 "Timeout while reading file content (> {:.2}s)",
359 text_options.timeout_seconds
360 )));
361 }
362
363 let sha256 = calculate_sha256(&buffer);
364 let is_generated = text_options
365 .detect_generated
366 .then(|| !generated_code_hints_from_bytes(&buffer).is_empty());
367 let classification = classify_file_info(path, &buffer);
368
369 if text_options.collect_info {
370 file_info_builder
371 .sha1(Some(calculate_sha1(&buffer)))
372 .md5(Some(calculate_md5(&buffer)))
373 .sha256(Some(sha256.clone()))
374 .programming_language(classification.programming_language.clone())
375 .mime_type(Some(classification.mime_type.clone()))
376 .file_type_label(Some(classification.file_type.clone()))
377 .sha1_git(Some(calculate_sha1_git(&buffer)))
378 .is_binary(Some(classification.is_binary))
379 .is_text(Some(classification.is_text))
380 .is_archive(Some(classification.is_archive))
381 .is_media(Some(classification.is_media))
382 .is_source(Some(classification.is_source))
383 .is_script(Some(classification.is_script))
384 .files_count(Some(0))
385 .dirs_count(Some(0))
386 .size_count(Some(0));
387 }
388
389 if should_skip_text_detection(path, &buffer) {
390 return Ok((is_generated, sha256, classification.is_source));
391 }
392
393 if let Some(scan_results_dir) = text_options.scan_cache_dir.as_deref() {
394 let options_fingerprint =
395 scan_cache_fingerprint(text_options, license_options, license_enabled);
396 match read_cached_findings(scan_results_dir, &sha256, &options_fingerprint) {
397 Ok(Some(findings)) => {
398 file_info_builder
399 .package_data(findings.package_data)
400 .license_expression(findings.license_expression)
401 .license_detections(findings.license_detections)
402 .license_clues(findings.license_clues)
403 .percentage_of_license_text(findings.percentage_of_license_text)
404 .copyrights(findings.copyrights)
405 .holders(findings.holders)
406 .authors(findings.authors)
407 .emails(findings.emails)
408 .urls(findings.urls)
409 .programming_language(findings.programming_language);
410 return Ok((is_generated, sha256, classification.is_source));
411 }
412 Ok(None) => {}
413 Err(err) => {
414 scan_errors.push(format!("Failed to read scan cache for {:?}: {}", path, err));
415 }
416 }
417 }
418
419 if text_options.detect_packages {
422 let parse_result = try_parse_file(path).or_else(|| {
423 text_options
424 .detect_packages_in_compiled
425 .then(|| try_parse_compiled_bytes(&buffer))
426 .flatten()
427 });
428
429 if let Some(parse_result) = parse_result {
430 let packages = parse_result
431 .packages
432 .into_iter()
433 .filter(|package| {
434 let is_compiled_package = package
435 .datasource_id
436 .as_ref()
437 .is_some_and(is_compiled_datasource);
438 let is_system_package = package
439 .datasource_id
440 .as_ref()
441 .is_some_and(is_system_datasource);
442 if is_compiled_package {
443 text_options.detect_packages_in_compiled
444 } else if is_system_package {
445 text_options.detect_system_packages
446 } else {
447 text_options.detect_application_packages
448 }
449 })
450 .collect();
451 file_info_builder.package_data(packages);
452 scan_errors.extend(parse_result.scan_errors);
453 }
454 }
455
456 if is_timeout_exceeded(started, text_options.timeout_seconds) {
457 return Err(Error::msg(format!(
458 "Timeout while extracting package/text metadata (> {:.2}s)",
459 text_options.timeout_seconds
460 )));
461 }
462
463 let (text_content, text_kind) = extract_text_for_detection(path, &buffer);
464 let from_binary_strings = matches!(text_kind, ExtractedTextKind::BinaryStrings);
465
466 if is_timeout_exceeded(started, text_options.timeout_seconds) {
467 return Err(Error::msg(format!(
468 "Timeout while extracting text content (> {:.2}s)",
469 text_options.timeout_seconds
470 )));
471 }
472
473 if text_content.is_empty() {
474 return Ok((is_generated, sha256, classification.is_source));
475 }
476
477 if text_options.detect_copyrights {
478 extract_copyright_information(
479 file_info_builder,
480 path,
481 &text_content,
482 text_options.timeout_seconds,
483 from_binary_strings,
484 );
485 }
486 extract_email_url_information(file_info_builder, &text_content, text_options);
487
488 if is_timeout_exceeded(started, text_options.timeout_seconds) {
489 return Err(Error::msg(format!(
490 "Timeout before license scan (> {:.2}s)",
491 text_options.timeout_seconds
492 )));
493 }
494 let text_content_for_license_detection = if crate::utils::sourcemap::is_sourcemap(path) {
496 if let Some(sourcemap_content) =
497 crate::utils::sourcemap::extract_sourcemap_content(&text_content)
498 {
499 sourcemap_content
500 } else {
501 text_content
502 }
503 } else if should_remove_verbatim_escape_sequences(path, classification.is_source) {
504 remove_verbatim_escape_sequences(&text_content)
505 } else {
506 text_content
507 };
508
509 extract_license_information(
510 file_info_builder,
511 scan_errors,
512 path,
513 text_content_for_license_detection,
514 license_engine,
515 license_options,
516 from_binary_strings,
517 )?;
518
519 Ok((is_generated, sha256, classification.is_source))
520}
521
/// Reports whether more than `timeout_seconds` have elapsed since `started`.
///
/// A non-finite, zero or negative `timeout_seconds` disables the timeout, in
/// which case this always returns `false`.
fn is_timeout_exceeded(started: Instant, timeout_seconds: f64) -> bool {
    if !timeout_seconds.is_finite() || timeout_seconds <= 0.0 {
        return false;
    }
    started.elapsed().as_secs_f64() > timeout_seconds
}
527
528fn scan_cache_fingerprint(
529 text_options: &TextDetectionOptions,
530 license_options: LicenseScanOptions,
531 license_enabled: bool,
532) -> String {
533 format!(
534 "info={};packages={};app_packages={};system_packages={};compiled_packages={};copyrights={};emails={};urls={};max_emails={};max_urls={};timeout={:.6};license_enabled={};license_text={};license_text_diagnostics={};license_diagnostics={};unknown_licenses={};license_score={}",
535 text_options.collect_info,
536 text_options.detect_packages,
537 text_options.detect_application_packages,
538 text_options.detect_system_packages,
539 text_options.detect_packages_in_compiled,
540 text_options.detect_copyrights,
541 text_options.detect_emails,
542 text_options.detect_urls,
543 text_options.max_emails,
544 text_options.max_urls,
545 text_options.timeout_seconds,
546 license_enabled,
547 license_options.include_text,
548 license_options.include_text_diagnostics,
549 license_options.include_diagnostics,
550 license_options.unknown_licenses,
551 license_options.min_score,
552 )
553}
554
555fn is_system_datasource(datasource_id: &DatasourceId) -> bool {
556 matches!(
557 datasource_id,
558 DatasourceId::AlpineInstalledDb
559 | DatasourceId::DebianDistrolessInstalledDb
560 | DatasourceId::DebianInstalledFilesList
561 | DatasourceId::DebianInstalledMd5Sums
562 | DatasourceId::DebianInstalledStatusDb
563 | DatasourceId::FreebsdCompactManifest
564 | DatasourceId::RpmInstalledDatabaseBdb
565 | DatasourceId::RpmInstalledDatabaseNdb
566 | DatasourceId::RpmInstalledDatabaseSqlite
567 | DatasourceId::RpmYumdb
568 )
569}
570
571fn is_compiled_datasource(datasource_id: &DatasourceId) -> bool {
572 matches!(
573 datasource_id,
574 DatasourceId::GoBinary | DatasourceId::RustBinary
575 )
576}
577
578fn extract_copyright_information(
579 file_info_builder: &mut FileInfoBuilder,
580 path: &Path,
581 text_content: &str,
582 timeout_seconds: f64,
583 from_binary_strings: bool,
584) {
585 if copyright::is_credits_file(path) {
587 let author_detections = copyright::detect_credits_authors(text_content);
588 if !author_detections.is_empty() {
589 file_info_builder.authors(
590 author_detections
591 .into_iter()
592 .map(|a| Author {
593 author: a.author,
594 start_line: a.start_line,
595 end_line: a.end_line,
596 })
597 .collect(),
598 );
599 return;
600 }
601 }
602
603 let copyright_options = CopyrightDetectionOptions {
604 max_runtime: if timeout_seconds.is_finite() && timeout_seconds > 0.0 {
605 Some(Duration::from_secs_f64(timeout_seconds))
606 } else {
607 None
608 },
609 ..CopyrightDetectionOptions::default()
610 };
611
612 let (copyrights, holders, authors) =
613 copyright::detect_copyrights_with_options(text_content, ©right_options);
614 let (copyrights, holders, authors) = if from_binary_strings {
615 prune_binary_string_detections(copyrights, holders, authors)
616 } else {
617 (copyrights, holders, authors)
618 };
619
620 file_info_builder.copyrights(
621 copyrights
622 .into_iter()
623 .map(|c| Copyright {
624 copyright: c.copyright,
625 start_line: c.start_line,
626 end_line: c.end_line,
627 })
628 .collect::<Vec<Copyright>>(),
629 );
630 file_info_builder.holders(
631 holders
632 .into_iter()
633 .map(|h| Holder {
634 holder: h.holder,
635 start_line: h.start_line,
636 end_line: h.end_line,
637 })
638 .collect::<Vec<Holder>>(),
639 );
640 file_info_builder.authors(
641 authors
642 .into_iter()
643 .map(|a| Author {
644 author: a.author,
645 start_line: a.start_line,
646 end_line: a.end_line,
647 })
648 .collect::<Vec<Author>>(),
649 );
650}
651
652fn prune_binary_string_detections(
653 copyrights: Vec<CopyrightDetection>,
654 holders: Vec<HolderDetection>,
655 _authors: Vec<AuthorDetection>,
656) -> (
657 Vec<CopyrightDetection>,
658 Vec<HolderDetection>,
659 Vec<AuthorDetection>,
660) {
661 let kept_copyrights: Vec<CopyrightDetection> = copyrights
662 .into_iter()
663 .filter(|c| is_binary_string_copyright_candidate(&c.copyright))
664 .collect();
665
666 let kept_holders: Vec<HolderDetection> = holders
667 .into_iter()
668 .filter(|holder| {
669 kept_copyrights.iter().any(|copyright| {
670 ranges_overlap(
671 holder.start_line,
672 holder.end_line,
673 copyright.start_line,
674 copyright.end_line,
675 )
676 })
677 })
678 .collect();
679
680 (kept_copyrights, kept_holders, Vec::new())
681}
682
/// Returns `true` when the inclusive ranges `[a_start, a_end]` and
/// `[b_start, b_end]` share at least one value.
fn ranges_overlap(a_start: usize, a_end: usize, b_start: usize, b_end: usize) -> bool {
    // Two ranges are disjoint exactly when one starts after the other ends.
    let disjoint = a_start > b_end || b_start > a_end;
    !disjoint
}
686
687fn is_binary_string_copyright_candidate(text: &str) -> bool {
688 if has_explicit_copyright_marker(text) || contains_year(text) {
689 return true;
690 }
691
692 let lower = text.to_ascii_lowercase();
693 let Some(tail) = lower.strip_prefix("copyright") else {
694 return true;
695 };
696 let tail = tail.trim();
697 let alpha_tokens: Vec<&str> = tail
698 .split_whitespace()
699 .filter(|token| token.chars().any(|c| c.is_alphabetic()))
700 .collect();
701
702 if alpha_tokens.len() <= 1 {
703 return true;
704 }
705
706 if tail.contains(',') || tail.contains(" and ") || tail.contains('&') {
707 return true;
708 }
709
710 alpha_tokens
711 .iter()
712 .any(|token| is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric())))
713}
714
/// Returns `true` if `text` contains `(c)`, `©` or `copr`, ignoring ASCII case.
fn has_explicit_copyright_marker(text: &str) -> bool {
    let lowered = text.to_ascii_lowercase();
    ["(c)", "©", "copr"]
        .iter()
        .any(|marker| lowered.contains(*marker))
}
719
/// Returns `true` if `text` contains four consecutive ASCII digits that form
/// a year in the range 1900–2099.
///
/// The century is matched as a unit (`19` or `20`). Checking the first two
/// digits independently would also accept `10xx` and `29xx`, so ordinary
/// numbers such as `1024` would be mistaken for a year.
fn contains_year(text: &str) -> bool {
    text.as_bytes().windows(4).any(|window| {
        matches!(
            window,
            [b'1', b'9', tens, ones] | [b'2', b'0', tens, ones]
                if tens.is_ascii_digit() && ones.is_ascii_digit()
        )
    })
}
728
/// Returns `true` if `token` equals, ignoring ASCII case, one of the words
/// this module treats as an organization name suffix (e.g. `inc`, `ltd`).
fn is_company_like_suffix(token: &str) -> bool {
    const COMPANY_SUFFIXES: [&str; 14] = [
        "inc",
        "corp",
        "corporation",
        "co",
        "company",
        "ltd",
        "llc",
        "gmbh",
        "foundation",
        "project",
        "systems",
        "software",
        "technologies",
        "technology",
    ];

    COMPANY_SUFFIXES
        .iter()
        .any(|suffix| token.eq_ignore_ascii_case(suffix))
}
748
749fn extract_email_url_information(
750 file_info_builder: &mut FileInfoBuilder,
751 text_content: &str,
752 text_options: &TextDetectionOptions,
753) {
754 if !text_options.detect_emails && !text_options.detect_urls {
755 return;
756 }
757
758 if text_options.detect_emails {
759 let config = DetectionConfig {
760 max_emails: text_options.max_emails,
761 max_urls: text_options.max_urls,
762 unique: false,
763 };
764 let emails = finder::find_emails(text_content, &config)
765 .into_iter()
766 .map(|d| OutputEmail {
767 email: d.email,
768 start_line: d.start_line,
769 end_line: d.end_line,
770 })
771 .collect::<Vec<_>>();
772 file_info_builder.emails(emails);
773 }
774
775 if text_options.detect_urls {
776 let config = DetectionConfig {
777 max_emails: text_options.max_emails,
778 max_urls: text_options.max_urls,
779 unique: true,
780 };
781 let urls = finder::find_urls(text_content, &config)
782 .into_iter()
783 .map(|d| OutputURL {
784 url: d.url,
785 start_line: d.start_line,
786 end_line: d.end_line,
787 })
788 .collect::<Vec<_>>();
789 file_info_builder.urls(urls);
790 }
791}
792
793fn extract_license_information(
794 file_info_builder: &mut FileInfoBuilder,
795 scan_errors: &mut Vec<String>,
796 path: &Path,
797 text_content: String,
798 license_engine: Option<Arc<LicenseDetectionEngine>>,
799 license_options: LicenseScanOptions,
800 from_binary_strings: bool,
801) -> Result<(), Error> {
802 let Some(engine) = license_engine else {
803 return Ok(());
804 };
805
806 let detection_result = if license_options.min_score == 0 {
807 engine.detect_with_kind_and_source(
808 &text_content,
809 license_options.unknown_licenses,
810 from_binary_strings,
811 &path.to_string_lossy(),
812 )
813 } else {
814 engine.detect_with_kind_and_source_with_score(
815 &text_content,
816 license_options.unknown_licenses,
817 from_binary_strings,
818 &path.to_string_lossy(),
819 license_options.min_score as f32,
820 )
821 };
822
823 match detection_result {
824 Ok(detections) => {
825 let query =
826 Query::from_extracted_text(&text_content, engine.index(), from_binary_strings).ok();
827 let mut model_detections = Vec::new();
828 let mut model_clues = Vec::new();
829
830 for detection in &detections {
831 let (public_detection, clue_matches) = convert_detection_to_model(
832 detection,
833 license_options,
834 &text_content,
835 query.as_ref(),
836 );
837
838 if let Some(public_detection) = public_detection {
839 model_detections.push(public_detection);
840 }
841
842 model_clues.extend(clue_matches);
843 }
844
845 if !model_detections.is_empty() {
846 let expressions: Vec<String> = model_detections
847 .iter()
848 .filter(|d| !d.license_expression_spdx.is_empty())
849 .map(|d| d.license_expression_spdx.clone())
850 .collect();
851
852 if !expressions.is_empty() {
853 let combined = crate::utils::spdx::combine_license_expressions(expressions);
854 if let Some(expr) = combined {
855 file_info_builder.license_expression(Some(expr));
856 }
857 }
858 }
859
860 file_info_builder.license_detections(model_detections);
861 file_info_builder.license_clues(model_clues);
862 file_info_builder.percentage_of_license_text(
863 query
864 .as_ref()
865 .map(|query| compute_percentage_of_license_text(query, &detections)),
866 );
867 }
868 Err(e) => {
869 scan_errors.push(format!("License detection failed: {}", e));
870 }
871 }
872
873 Ok(())
874}
875
876fn convert_detection_to_model(
877 detection: &crate::license_detection::LicenseDetection,
878 license_options: LicenseScanOptions,
879 text_content: &str,
880 query: Option<&Query<'_>>,
881) -> (Option<LicenseDetection>, Vec<Match>) {
882 let matches: Vec<Match> = detection
883 .matches
884 .iter()
885 .map(|m| convert_match_to_model(m, license_options, text_content, query))
886 .collect();
887
888 if let Some(license_expression) = detection.license_expression.clone() {
889 (
890 Some(LicenseDetection {
891 license_expression,
892 license_expression_spdx: detection
893 .license_expression_spdx
894 .clone()
895 .unwrap_or_default(),
896 matches,
897 detection_log: if license_options.include_diagnostics {
898 detection.detection_log.clone()
899 } else {
900 Vec::new()
901 },
902 identifier: detection.identifier.clone(),
903 }),
904 Vec::new(),
905 )
906 } else {
907 (None, matches)
908 }
909}
910
911fn convert_match_to_model(
912 m: &crate::license_detection::models::LicenseMatch,
913 license_options: LicenseScanOptions,
914 text_content: &str,
915 query: Option<&Query<'_>>,
916) -> Match {
917 let output_metric = |value: f32| ((value as f64) * 100.0).round() / 100.0;
918 let rule_url = if m.rule_url.is_empty() {
919 None
920 } else {
921 Some(m.rule_url.clone())
922 };
923 let matched_text = if license_options.include_text {
924 m.matched_text.clone().or_else(|| {
925 Some(crate::license_detection::query::matched_text_from_text(
926 text_content,
927 m.start_line,
928 m.end_line,
929 ))
930 })
931 } else {
932 None
933 };
934 let matched_text_diagnostics = if license_options.include_text_diagnostics {
935 query.map(|query| matched_text_diagnostics_from_match(query, m))
936 } else {
937 None
938 };
939 Match {
940 license_expression: m.license_expression.clone(),
941 license_expression_spdx: m.license_expression_spdx.clone().unwrap_or_default(),
942 from_file: m.from_file.clone(),
943 start_line: m.start_line,
944 end_line: m.end_line,
945 matcher: Some(m.matcher.to_string()),
946 score: output_metric(m.score),
947 matched_length: Some(m.matched_length),
948 match_coverage: Some(output_metric(m.coverage())),
949 rule_relevance: Some(m.rule_relevance as usize),
950 rule_identifier: Some(m.rule_identifier.clone()),
951 rule_url,
952 matched_text,
953 referenced_filenames: m.referenced_filenames.clone(),
954 matched_text_diagnostics,
955 }
956}
957
958fn compute_percentage_of_license_text(
959 query: &Query<'_>,
960 detections: &[crate::license_detection::LicenseDetection],
961) -> f64 {
962 let matched_positions: std::collections::HashSet<usize> = detections
963 .iter()
964 .flat_map(|detection| detection.matches.iter())
965 .flat_map(InternalLicenseMatch::qspan)
966 .collect();
967
968 let query_tokens_length = query.tokens.len() + query.unknowns_by_pos.values().sum::<usize>();
969 if query_tokens_length == 0 {
970 return 0.0;
971 }
972
973 let percentage = (matched_positions.len() as f64 / query_tokens_length as f64) * 100.0;
974 (percentage * 100.0).round() / 100.0
975}
976
977fn matched_text_diagnostics_from_match(
978 query: &Query<'_>,
979 license_match: &InternalLicenseMatch,
980) -> String {
981 let matched_positions: std::collections::HashSet<usize> =
982 license_match.qspan().into_iter().collect();
983 let Some(start_pos) = matched_positions.iter().min().copied() else {
984 return crate::license_detection::query::matched_text_from_text(
985 &query.text,
986 license_match.start_line,
987 license_match.end_line,
988 );
989 };
990 let Some(end_pos) = matched_positions.iter().max().copied() else {
991 return crate::license_detection::query::matched_text_from_text(
992 &query.text,
993 license_match.start_line,
994 license_match.end_line,
995 );
996 };
997
998 crate::license_detection::query::matched_text_diagnostics_from_text(
999 &query.text,
1000 query,
1001 &matched_positions,
1002 start_pos,
1003 end_pos,
1004 license_match.start_line,
1005 license_match.end_line,
1006 )
1007}
1008
1009fn should_skip_text_detection(path: &Path, buffer: &[u8]) -> bool {
1010 is_pem_certificate_file(path, buffer)
1011}
1012
/// Reports whether a Go source file is test-only code.
///
/// A file qualifies when its name ends with `_test.go`, or when one of its
/// first ten lines is a build constraint (`//go:build` or `// +build`) that
/// contains the standalone token `test`. Paths without a `.go` extension
/// never qualify and are not opened.
///
/// Only the first ten lines are read, so content later in the file (invalid
/// UTF-8 included) cannot affect the result.
///
/// # Errors
/// Returns an error if the file cannot be opened, or if one of the inspected
/// lines cannot be read as UTF-8.
fn is_go_non_production_source(path: &Path) -> std::io::Result<bool> {
    if path.extension().and_then(|ext| ext.to_str()) != Some("go") {
        return Ok(false);
    }

    if path
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| name.ends_with("_test.go"))
    {
        return Ok(true);
    }

    let reader = std::io::BufReader::new(File::open(path)?);
    for line in std::io::BufRead::lines(reader).take(10) {
        let line = line?;
        let trimmed = line.trim();
        let is_build_constraint =
            trimmed.starts_with("//go:build") || trimmed.starts_with("// +build");
        if is_build_constraint && trimmed.split_whitespace().any(|token| token == "test") {
            return Ok(true);
        }
    }

    Ok(false)
}
1033
1034fn is_pem_certificate_file(_path: &Path, buffer: &[u8]) -> bool {
1035 let prefix_len = buffer.len().min(8192);
1036 let prefix = String::from_utf8_lossy(&buffer[..prefix_len]);
1037 let trimmed_lines: Vec<&str> = prefix
1038 .lines()
1039 .map(str::trim)
1040 .filter(|line| !line.is_empty())
1041 .take(64)
1042 .collect();
1043
1044 PEM_CERTIFICATE_HEADERS.iter().any(|(begin, end)| {
1045 trimmed_lines.iter().any(|line| line == begin)
1046 && trimmed_lines.iter().any(|line| line == end)
1047 })
1048}
1049
1050fn process_directory(
1051 path: &Path,
1052 _metadata: &fs::Metadata,
1053 collect_info: bool,
1054 license_enabled: bool,
1055) -> FileInfo {
1056 let name = path
1057 .file_name()
1058 .unwrap_or_default()
1059 .to_string_lossy()
1060 .to_string();
1061 let base_name = name.clone(); FileInfo {
1064 name,
1065 base_name,
1066 extension: "".to_string(),
1067 path: path.to_string_lossy().to_string(),
1068 file_type: FileType::Directory,
1069 mime_type: None,
1070 file_type_label: None,
1071 size: 0,
1072 date: None,
1073 sha1: None,
1074 md5: None,
1075 sha256: None,
1076 sha1_git: None,
1077 programming_language: None,
1078 package_data: Vec::new(),
1079 license_expression: None,
1080 license_detections: Vec::new(),
1081 license_clues: Vec::new(),
1082 percentage_of_license_text: license_enabled.then_some(0.0),
1083 copyrights: Vec::new(),
1084 holders: Vec::new(),
1085 authors: Vec::new(),
1086 emails: Vec::new(),
1087 urls: Vec::new(),
1088 for_packages: Vec::new(),
1089 scan_errors: Vec::new(),
1090 license_policy: None,
1091 is_binary: collect_info.then_some(false),
1092 is_text: collect_info.then_some(false),
1093 is_archive: collect_info.then_some(false),
1094 is_media: collect_info.then_some(false),
1095 is_source: collect_info.then_some(false),
1096 is_script: collect_info.then_some(false),
1097 files_count: collect_info.then_some(0),
1098 dirs_count: collect_info.then_some(0),
1099 size_count: collect_info.then_some(0),
1100 source_count: None,
1101 is_legal: false,
1102 is_manifest: false,
1103 is_readme: false,
1104 is_top_level: false,
1105 is_key_file: false,
1106 is_community: false,
1107 is_generated: None,
1108 facets: vec![],
1109 tallies: None,
1110 }
1111}
1112
#[cfg(test)]
mod tests {
    //! Unit tests for detection-to-model conversion, license-text percentage,
    //! scan-cache fingerprinting and Go non-production source detection.

    use super::{
        compute_percentage_of_license_text, convert_detection_to_model,
        is_go_non_production_source, scan_cache_fingerprint,
    };
    use crate::license_detection::LicenseDetection as InternalLicenseDetection;
    use crate::license_detection::index::LicenseIndex;
    use crate::license_detection::index::dictionary::TokenDictionary;
    use crate::license_detection::models::{LicenseMatch, MatcherKind, RuleKind};
    use crate::license_detection::query::Query;
    use crate::scanner::{LicenseScanOptions, TextDetectionOptions};
    use std::fs;
    use tempfile::tempdir;

    /// Rule URL of the MIT license text, shared by the fixture and its assertion.
    const MIT_LICENSE_URL: &str = "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE";

    /// Builds a single-line `mit` hash match against `mit.LICENSE` that carries
    /// the given `rule_url`; every optional span/position field is left unset.
    fn make_internal_match(rule_url: &str) -> LicenseMatch {
        LicenseMatch {
            // Matched rule.
            rid: 0,
            rule_identifier: String::from("mit.LICENSE"),
            rule_url: String::from(rule_url),
            rule_kind: RuleKind::Text,
            rule_length: 3,
            rule_relevance: 100,
            rule_start_token: 0,
            is_from_license: true,
            // Detected expression.
            license_expression: String::from("mit"),
            license_expression_spdx: Some(String::from("MIT")),
            from_file: None,
            referenced_filenames: None,
            // Location in the scanned text.
            start_line: 1,
            end_line: 1,
            start_token: 0,
            end_token: 1,
            matched_text: Some(String::from("MIT")),
            matched_length: 3,
            matched_token_positions: None,
            qspan_positions: None,
            ispan_positions: None,
            hispan_positions: None,
            hilen: 3,
            // Matcher and scoring.
            matcher: MatcherKind::Hash,
            score: 1.0,
            match_coverage: 100.0,
            candidate_resemblance: 0.0,
            candidate_containment: 0.0,
        }
    }

    /// Wraps one `make_internal_match(rule_url)` match in an `mit` detection
    /// identified as `mit-test`, with an empty detection log.
    fn make_detection(rule_url: &str) -> InternalLicenseDetection {
        let only_match = make_internal_match(rule_url);
        InternalLicenseDetection {
            identifier: Some(String::from("mit-test")),
            license_expression: Some(String::from("mit")),
            license_expression_spdx: Some(String::from("MIT")),
            matches: vec![only_match],
            detection_log: Vec::new(),
            file_regions: Vec::new(),
        }
    }

    /// Creates a `LicenseIndex` over a legalese dictionary built from `entries`
    /// and sets its `len_legalese` to the given value.
    fn create_test_index(entries: &[(&str, u16)], len_legalese: usize) -> LicenseIndex {
        let mut index = LicenseIndex::new(TokenDictionary::new_with_legalese(entries));
        index.len_legalese = len_legalese;
        index
    }

    /// Writes `contents` to `file_name` inside a fresh temporary directory and
    /// returns what `is_go_non_production_source` reports for that file.
    fn go_source_is_non_production(file_name: &str, contents: &str) -> bool {
        let temp_dir = tempdir().unwrap();
        let path = temp_dir.path().join(file_name);
        fs::write(&path, contents).unwrap();

        is_go_non_production_source(&path).unwrap()
    }

    /// A non-empty rule URL on the internal match is carried over unchanged.
    #[test]
    fn test_convert_detection_to_model_preserves_rule_url() {
        let detection = make_detection(MIT_LICENSE_URL);

        let (model, license_clues) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let model = model.expect("detection should convert");

        let first_match = &model.matches[0];
        assert_eq!(first_match.rule_url.as_deref(), Some(MIT_LICENSE_URL));
        assert!(license_clues.is_empty());
    }

    /// An empty rule URL on the internal match becomes `None` in the model.
    #[test]
    fn test_convert_detection_to_model_emits_null_for_empty_rule_url() {
        let detection = make_detection("");

        let (model, license_clues) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let model = model.expect("detection should convert");

        assert!(model.matches[0].rule_url.is_none());
        assert!(license_clues.is_empty());
    }

    /// A coverage of 33.334 is reported as 33.33 while the score stays 81.82.
    #[test]
    fn test_convert_detection_to_model_rounds_match_coverage() {
        let mut detection = make_detection("");
        let internal_match = &mut detection.matches[0];
        internal_match.score = 81.82;
        internal_match.match_coverage = 33.334;

        let (model, license_clues) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let model = model.expect("detection should convert");

        let first_match = &model.matches[0];
        assert_eq!(first_match.score, 81.82);
        assert_eq!(first_match.match_coverage, Some(33.33));
        assert!(license_clues.is_empty());
    }

    /// A detection without any expression yields no model detection; its match
    /// is returned as a license clue instead.
    #[test]
    fn test_convert_detection_to_model_routes_expressionless_detection_to_license_clues() {
        let mut detection = make_detection(
            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/license-clue_1.RULE",
        );
        detection.identifier = None;
        detection.license_expression = None;
        detection.license_expression_spdx = None;

        let clue_match = &mut detection.matches[0];
        clue_match.rule_kind = RuleKind::Clue;
        clue_match.rule_identifier = String::from("license-clue_1.RULE");
        clue_match.license_expression = String::from("unknown-license-reference");
        clue_match.license_expression_spdx = Some(String::from(
            "LicenseRef-scancode-unknown-license-reference",
        ));

        let options = LicenseScanOptions {
            include_text: true,
            min_score: 0,
            ..LicenseScanOptions::default()
        };
        let (model, license_clues) =
            convert_detection_to_model(&detection, options, "clue text", None);

        assert!(model.is_none());
        assert_eq!(license_clues.len(), 1);

        let clue = &license_clues[0];
        assert_eq!(clue.license_expression, "unknown-license-reference");
        assert_eq!(
            clue.license_expression_spdx,
            "LicenseRef-scancode-unknown-license-reference"
        );
        assert_eq!(clue.rule_identifier.as_deref(), Some("license-clue_1.RULE"));
        assert_eq!(clue.matched_text.as_deref(), Some("MIT"));
        assert!(clue.matched_text_diagnostics.is_none());
    }

    /// With text, text diagnostics and diagnostics all enabled, the model keeps
    /// the detection log, reports the trimmed input as matched text, and emits
    /// a bracketed diagnostics string that differs from that text.
    #[test]
    fn test_convert_detection_to_model_includes_diagnostics_when_enabled() {
        const NOTICE: &str = concat!(
            "Reproduction and distribution of this file, with or without modification, are\n",
            "permitted in any medium without royalties provided the copyright notice\n",
            "and this notice are preserved. This file is offered as-is, without any warranties.\n",
        );
        // Each word receives its position in this list as its dictionary id.
        const LEGALESE_WORDS: [&str; 14] = [
            "reproduction",
            "distribution",
            "file",
            "without",
            "modification",
            "permitted",
            "medium",
            "royalties",
            "provided",
            "copyright",
            "notice",
            "preserved",
            "offered",
            "warranties",
        ];

        let legalese: Vec<(&str, u16)> = LEGALESE_WORDS.iter().copied().zip(0u16..).collect();
        let index = create_test_index(&legalese, LEGALESE_WORDS.len());
        let query =
            Query::from_extracted_text(NOTICE, &index, false).expect("query should build");
        let token_count = query.tokens.len();

        let mut detection = make_detection(
            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/fsf-ap.LICENSE",
        );
        detection.identifier = Some(String::from("fsf_ap-test"));
        detection.detection_log = vec![String::from("imperfect-match-coverage")];

        let fsf_match = &mut detection.matches[0];
        fsf_match.license_expression = String::from("fsf-ap");
        fsf_match.license_expression_spdx = Some(String::from("FSFAP"));
        fsf_match.rule_identifier = String::from("fsf-ap.LICENSE");
        fsf_match.matched_text = None;
        fsf_match.start_line = 1;
        fsf_match.end_line = 3;
        fsf_match.start_token = 0;
        fsf_match.end_token = token_count;
        // Every query position except 9 counts as matched; presumably the gap is
        // what makes the diagnostics text differ from the plain matched text.
        fsf_match.qspan_positions = Some((0..token_count).filter(|&idx| idx != 9).collect());

        let options = LicenseScanOptions {
            include_text: true,
            include_text_diagnostics: true,
            include_diagnostics: true,
            unknown_licenses: false,
            min_score: 0,
        };
        let (model, license_clues) =
            convert_detection_to_model(&detection, options, NOTICE, Some(&query));
        let model = model.expect("detection should convert");

        assert!(license_clues.is_empty());
        assert_eq!(model.detection_log, vec!["imperfect-match-coverage"]);

        let first_match = &model.matches[0];
        assert_eq!(first_match.matched_text.as_deref(), Some(NOTICE.trim_end()));

        let diagnostics = first_match
            .matched_text_diagnostics
            .as_deref()
            .expect("diagnostics should be present");
        assert!(diagnostics.contains('['));
        assert!(diagnostics.contains(']'));
        assert_ne!(diagnostics, NOTICE.trim_end());
    }

    /// One matched position in the three-word text `alpha MIT omega` (where
    /// `omega` is absent from the test dictionary) is reported as 33.33.
    #[test]
    fn test_compute_percentage_of_license_text_counts_unknown_tokens() {
        let index = create_test_index(&[("alpha", 0), ("mit", 1)], 2);
        let query = Query::from_extracted_text("alpha MIT omega", &index, false)
            .expect("query should build");

        let mut detection = make_detection("");
        let mit_match = &mut detection.matches[0];
        mit_match.start_token = 1;
        mit_match.end_token = 2;
        mit_match.qspan_positions = Some(vec![1]);

        let percentage = compute_percentage_of_license_text(&query, &[detection]);

        assert_eq!(percentage, 33.33);
    }

    /// Fingerprints computed with `min_score` 0 and 70 must not be equal.
    #[test]
    fn test_scan_cache_fingerprint_changes_with_license_score() {
        let text_options = TextDetectionOptions::default();
        let fingerprint_for = |min_score| {
            scan_cache_fingerprint(
                &text_options,
                LicenseScanOptions {
                    min_score,
                    ..LicenseScanOptions::default()
                },
                true,
            )
        };

        let default_fingerprint = fingerprint_for(0);
        let filtered_fingerprint = fingerprint_for(70);

        assert_ne!(default_fingerprint, filtered_fingerprint);
    }

    /// A file named `scanner_test.go` is reported as non-production.
    #[test]
    fn test_is_go_non_production_source_for_test_filename() {
        assert!(go_source_is_non_production(
            "scanner_test.go",
            "package scanner\n"
        ));
    }

    /// A `//go:build test` line in `scanner.go` marks it as non-production.
    #[test]
    fn test_is_go_non_production_source_for_build_tag() {
        assert!(go_source_is_non_production(
            "scanner.go",
            "//go:build test\n\npackage scanner\n"
        ));
    }

    /// A plain `scanner.go` without a build tag is not reported as non-production.
    #[test]
    fn test_is_go_non_production_source_for_regular_go_file() {
        assert!(!go_source_is_non_production(
            "scanner.go",
            "package scanner\n"
        ));
    }
}