1use crate::license_detection::LicenseDetectionEngine;
2use crate::parsers::try_parse_file;
3use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha256};
4use crate::utils::language::detect_language;
5use crate::utils::text::{is_source, remove_verbatim_escape_sequences};
6use anyhow::Error;
7use log::warn;
8use mime_guess::from_path;
9use rayon::prelude::*;
10use std::fs::{self};
11use std::path::Path;
12use std::sync::Arc;
13use std::time::{Duration, Instant};
14
15use crate::cache::{CachedScanFindings, read_cached_findings, write_cached_findings};
16use crate::copyright::{
17 self, AuthorDetection, CopyrightDetection, CopyrightDetectionOptions, HolderDetection,
18};
19use crate::finder::{self, DetectionConfig};
20use crate::models::{
21 Author, Copyright, FileInfo, FileInfoBuilder, FileType, Holder, LicenseDetection, Match,
22 OutputEmail, OutputURL,
23};
24use crate::progress::ScanProgress;
25use crate::scanner::collect::CollectedPaths;
26use crate::scanner::{ProcessResult, TextDetectionOptions};
27use crate::utils::file::{ExtractedTextKind, extract_text_for_detection, get_creation_date};
28use crate::utils::generated::generated_code_hints_from_bytes;
29
/// BEGIN/END marker pairs identifying PEM-encoded certificates.
/// Files containing a matching pair are excluded from text detection
/// (see `is_pem_certificate_file`): their base64 bodies only produce noise.
const PEM_CERTIFICATE_HEADERS: &[(&str, &str)] = &[
    ("-----BEGIN CERTIFICATE-----", "-----END CERTIFICATE-----"),
    (
        "-----BEGIN TRUSTED CERTIFICATE-----",
        "-----END TRUSTED CERTIFICATE-----",
    ),
];
37
38pub fn process_collected(
39 collected: &CollectedPaths,
40 progress: Arc<ScanProgress>,
41 license_engine: Option<Arc<LicenseDetectionEngine>>,
42 include_text: bool,
43 text_options: &TextDetectionOptions,
44) -> ProcessResult {
45 let mut all_files: Vec<FileInfo> = collected
46 .files
47 .par_iter()
48 .map(|(path, metadata)| {
49 let file_entry = process_file(
50 path,
51 metadata,
52 license_engine.clone(),
53 include_text,
54 text_options,
55 );
56 progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
57 file_entry
58 })
59 .collect();
60
61 for (path, metadata) in &collected.directories {
62 all_files.push(process_directory(path, metadata));
63 }
64
65 ProcessResult {
66 files: all_files,
67 excluded_count: collected.excluded_count,
68 }
69}
70
71fn process_file(
72 path: &Path,
73 metadata: &fs::Metadata,
74 license_engine: Option<Arc<LicenseDetectionEngine>>,
75 include_text: bool,
76 text_options: &TextDetectionOptions,
77) -> FileInfo {
78 let mut scan_errors: Vec<String> = vec![];
79 let mut file_info_builder = FileInfoBuilder::default();
80
81 let started = Instant::now();
82
83 let mut generated_flag = None;
84 match extract_information_from_content(
85 &mut file_info_builder,
86 path,
87 license_engine,
88 include_text,
89 text_options,
90 ) {
91 Ok(is_generated) => generated_flag = is_generated,
92 Err(e) => scan_errors.push(e.to_string()),
93 };
94
95 if is_timeout_exceeded(started, text_options.timeout_seconds) {
96 scan_errors.push(format!(
97 "Processing interrupted due to timeout after {:.2} seconds",
98 text_options.timeout_seconds
99 ));
100 }
101
102 let mut file_info = file_info_builder
103 .name(path.file_name().unwrap().to_string_lossy().to_string())
104 .base_name(
105 path.file_stem()
106 .unwrap_or_default()
107 .to_string_lossy()
108 .to_string(),
109 )
110 .extension(
111 path.extension()
112 .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
113 )
114 .path(path.to_string_lossy().to_string())
115 .file_type(FileType::File)
116 .mime_type(Some(
117 from_path(path)
118 .first_or_octet_stream()
119 .essence_str()
120 .to_string(),
121 ))
122 .size(metadata.len())
123 .date(get_creation_date(metadata))
124 .scan_errors(scan_errors)
125 .build()
126 .expect("FileInformationBuild not completely initialized");
127
128 if file_info.programming_language.as_deref() == Some("Go")
129 && is_go_non_production_source(path).unwrap_or(false)
130 {
131 file_info.is_source = Some(false);
132 }
133
134 if text_options.detect_generated {
135 file_info.is_generated = Some(generated_flag.unwrap_or(false));
136 }
137
138 if let (Some(scan_results_dir), Some(sha256)) = (
139 text_options.scan_cache_dir.as_deref(),
140 file_info.sha256.as_deref(),
141 ) && file_info.scan_errors.is_empty()
142 {
143 let findings = CachedScanFindings::from_file_info(&file_info);
144 let options_fingerprint = scan_cache_fingerprint(text_options);
145 if let Err(err) =
146 write_cached_findings(scan_results_dir, sha256, &options_fingerprint, &findings)
147 {
148 file_info
149 .scan_errors
150 .push(format!("Failed to write scan cache entry: {err}"));
151 }
152 }
153
154 file_info
155}
156
/// Reads the file once and derives all content-based information: hashes,
/// programming language, generated-code hint, package metadata, copyrights,
/// emails/URLs, and license detections — writing results into the builder.
///
/// Returns the generated-code flag (`Some(bool)` only when generated-code
/// detection is enabled), or an error when the read fails or the per-file
/// timeout is exceeded between phases.
fn extract_information_from_content(
    file_info_builder: &mut FileInfoBuilder,
    path: &Path,
    license_engine: Option<Arc<LicenseDetectionEngine>>,
    include_text: bool,
    text_options: &TextDetectionOptions,
) -> Result<Option<bool>, Error> {
    let started = Instant::now();
    let buffer = fs::read(path)?;

    // Timeout is re-checked after each potentially expensive phase.
    if is_timeout_exceeded(started, text_options.timeout_seconds) {
        return Err(Error::msg(format!(
            "Timeout while reading file content (> {:.2}s)",
            text_options.timeout_seconds
        )));
    }

    let sha256 = calculate_sha256(&buffer);
    // `Some(flag)` only when generated-code detection was requested.
    let is_generated = text_options
        .detect_generated
        .then(|| !generated_code_hints_from_bytes(&buffer).is_empty());

    file_info_builder
        .sha1(Some(calculate_sha1(&buffer)))
        .md5(Some(calculate_md5(&buffer)))
        .sha256(Some(sha256.clone()))
        .programming_language(Some(detect_language(path, &buffer)));

    // Some file kinds (currently PEM certificates) skip text detection.
    if should_skip_text_detection(path, &buffer) {
        return Ok(is_generated);
    }

    // Scan cache: reuse earlier findings keyed by content hash + options
    // fingerprint. A hit replays all detection results and returns early.
    if let Some(scan_results_dir) = text_options.scan_cache_dir.as_deref() {
        let options_fingerprint = scan_cache_fingerprint(text_options);
        match read_cached_findings(scan_results_dir, &sha256, &options_fingerprint) {
            Ok(Some(findings)) => {
                file_info_builder
                    .package_data(findings.package_data)
                    .license_expression(findings.license_expression)
                    .license_detections(findings.license_detections)
                    .copyrights(findings.copyrights)
                    .holders(findings.holders)
                    .authors(findings.authors)
                    .emails(findings.emails)
                    .urls(findings.urls)
                    .programming_language(findings.programming_language);
                return Ok(is_generated);
            }
            Ok(None) => {}
            Err(err) => {
                // Cache trouble is non-fatal: fall through to a fresh scan.
                warn!("Failed to read scan cache for {:?}: {}", path, err);
            }
        }
    }

    if text_options.detect_packages
        && let Some(package_data) = try_parse_file(path)
    {
        file_info_builder.package_data(package_data);
    }

    if is_timeout_exceeded(started, text_options.timeout_seconds) {
        return Err(Error::msg(format!(
            "Timeout while extracting package/text metadata (> {:.2}s)",
            text_options.timeout_seconds
        )));
    }

    // For binary files this extracts printable strings; the kind flag feeds
    // the noisier-detection pruning below.
    let (text_content, text_kind) = extract_text_for_detection(path, &buffer);
    let from_binary_strings = matches!(text_kind, ExtractedTextKind::BinaryStrings);

    if is_timeout_exceeded(started, text_options.timeout_seconds) {
        return Err(Error::msg(format!(
            "Timeout while extracting text content (> {:.2}s)",
            text_options.timeout_seconds
        )));
    }

    if text_content.is_empty() {
        return Ok(is_generated);
    }

    if text_options.detect_copyrights {
        extract_copyright_information(
            file_info_builder,
            path,
            &text_content,
            text_options.timeout_seconds,
            from_binary_strings,
        );
    }
    extract_email_url_information(file_info_builder, &text_content, text_options);

    if is_timeout_exceeded(started, text_options.timeout_seconds) {
        return Err(Error::msg(format!(
            "Timeout before license scan (> {:.2}s)",
            text_options.timeout_seconds
        )));
    }
    // Pre-clean the text for license matching: sourcemaps are unpacked to
    // their embedded content; source files get verbatim escapes stripped.
    let text_content_for_license_detection = if crate::utils::sourcemap::is_sourcemap(path) {
        if let Some(sourcemap_content) =
            crate::utils::sourcemap::extract_sourcemap_content(&text_content)
        {
            sourcemap_content
        } else {
            text_content
        }
    } else if is_source(path) {
        remove_verbatim_escape_sequences(&text_content)
    } else {
        text_content
    };

    extract_license_information(
        file_info_builder,
        text_content_for_license_detection,
        license_engine,
        include_text,
        from_binary_strings,
    )?;

    Ok(is_generated)
}
283
/// Returns true when a positive, finite timeout has elapsed since `started`.
/// A non-positive, NaN, or infinite timeout disables the check entirely.
fn is_timeout_exceeded(started: Instant, timeout_seconds: f64) -> bool {
    if !(timeout_seconds.is_finite() && timeout_seconds > 0.0) {
        return false;
    }
    started.elapsed().as_secs_f64() > timeout_seconds
}
289
290fn scan_cache_fingerprint(text_options: &TextDetectionOptions) -> String {
291 format!(
292 "packages={};copyrights={};emails={};urls={};max_emails={};max_urls={};timeout={:.6}",
293 text_options.detect_packages,
294 text_options.detect_copyrights,
295 text_options.detect_emails,
296 text_options.detect_urls,
297 text_options.max_emails,
298 text_options.max_urls,
299 text_options.timeout_seconds,
300 )
301}
302
303fn extract_copyright_information(
304 file_info_builder: &mut FileInfoBuilder,
305 path: &Path,
306 text_content: &str,
307 timeout_seconds: f64,
308 from_binary_strings: bool,
309) {
310 if copyright::is_credits_file(path) {
312 let author_detections = copyright::detect_credits_authors(text_content);
313 if !author_detections.is_empty() {
314 file_info_builder.authors(
315 author_detections
316 .into_iter()
317 .map(|a| Author {
318 author: a.author,
319 start_line: a.start_line,
320 end_line: a.end_line,
321 })
322 .collect(),
323 );
324 return;
325 }
326 }
327
328 let copyright_options = CopyrightDetectionOptions {
329 max_runtime: if timeout_seconds.is_finite() && timeout_seconds > 0.0 {
330 Some(Duration::from_secs_f64(timeout_seconds))
331 } else {
332 None
333 },
334 ..CopyrightDetectionOptions::default()
335 };
336
337 let (copyrights, holders, authors) =
338 copyright::detect_copyrights_with_options(text_content, ©right_options);
339 let (copyrights, holders, authors) = if from_binary_strings {
340 prune_binary_string_detections(copyrights, holders, authors)
341 } else {
342 (copyrights, holders, authors)
343 };
344
345 file_info_builder.copyrights(
346 copyrights
347 .into_iter()
348 .map(|c| Copyright {
349 copyright: c.copyright,
350 start_line: c.start_line,
351 end_line: c.end_line,
352 })
353 .collect::<Vec<Copyright>>(),
354 );
355 file_info_builder.holders(
356 holders
357 .into_iter()
358 .map(|h| Holder {
359 holder: h.holder,
360 start_line: h.start_line,
361 end_line: h.end_line,
362 })
363 .collect::<Vec<Holder>>(),
364 );
365 file_info_builder.authors(
366 authors
367 .into_iter()
368 .map(|a| Author {
369 author: a.author,
370 start_line: a.start_line,
371 end_line: a.end_line,
372 })
373 .collect::<Vec<Author>>(),
374 );
375}
376
377fn prune_binary_string_detections(
378 copyrights: Vec<CopyrightDetection>,
379 holders: Vec<HolderDetection>,
380 _authors: Vec<AuthorDetection>,
381) -> (
382 Vec<CopyrightDetection>,
383 Vec<HolderDetection>,
384 Vec<AuthorDetection>,
385) {
386 let kept_copyrights: Vec<CopyrightDetection> = copyrights
387 .into_iter()
388 .filter(|c| is_binary_string_copyright_candidate(&c.copyright))
389 .collect();
390
391 let kept_holders: Vec<HolderDetection> = holders
392 .into_iter()
393 .filter(|holder| {
394 kept_copyrights.iter().any(|copyright| {
395 ranges_overlap(
396 holder.start_line,
397 holder.end_line,
398 copyright.start_line,
399 copyright.end_line,
400 )
401 })
402 })
403 .collect();
404
405 (kept_copyrights, kept_holders, Vec::new())
406}
407
/// True when the inclusive ranges [a_start, a_end] and [b_start, b_end]
/// share at least one value.
fn ranges_overlap(a_start: usize, a_end: usize, b_start: usize, b_end: usize) -> bool {
    !(a_end < b_start || b_end < a_start)
}
411
412fn is_binary_string_copyright_candidate(text: &str) -> bool {
413 if has_explicit_copyright_marker(text) || contains_year(text) {
414 return true;
415 }
416
417 let lower = text.to_ascii_lowercase();
418 let Some(tail) = lower.strip_prefix("copyright") else {
419 return true;
420 };
421 let tail = tail.trim();
422 let alpha_tokens: Vec<&str> = tail
423 .split_whitespace()
424 .filter(|token| token.chars().any(|c| c.is_alphabetic()))
425 .collect();
426
427 if alpha_tokens.len() <= 1 {
428 return true;
429 }
430
431 if tail.contains(',') || tail.contains(" and ") || tail.contains('&') {
432 return true;
433 }
434
435 alpha_tokens
436 .iter()
437 .any(|token| is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric())))
438}
439
/// True when the text carries an explicit copyright marker: "(c)", the ©
/// sign, or a "copr" abbreviation (matched case-insensitively).
fn has_explicit_copyright_marker(text: &str) -> bool {
    let lower = text.to_ascii_lowercase();
    ["(c)", "copr"].iter().any(|marker| lower.contains(marker)) || lower.contains('©')
}
444
/// Returns true if the text contains a four-digit run that looks like a
/// modern year (19xx or 20xx).
///
/// The previous check tested the first two digits independently
/// (`{1,2}` then `{9,0}`), which also accepted "10xx" and "29xx";
/// the pair is now matched as a unit.
fn contains_year(text: &str) -> bool {
    let bytes = text.as_bytes();
    bytes.windows(4).any(|window| {
        window.iter().all(|b| b.is_ascii_digit())
            && ((window[0] == b'1' && window[1] == b'9')
                || (window[0] == b'2' && window[1] == b'0'))
    })
}
453
/// Case-insensitively checks whether a token is a common company or
/// organization suffix such as "inc", "ltd", or "gmbh".
fn is_company_like_suffix(token: &str) -> bool {
    const COMPANY_SUFFIXES: &[&str] = &[
        "inc",
        "corp",
        "corporation",
        "co",
        "company",
        "ltd",
        "llc",
        "gmbh",
        "foundation",
        "project",
        "systems",
        "software",
        "technologies",
        "technology",
    ];
    COMPANY_SUFFIXES.contains(&token.to_ascii_lowercase().as_str())
}
473
474fn extract_email_url_information(
475 file_info_builder: &mut FileInfoBuilder,
476 text_content: &str,
477 text_options: &TextDetectionOptions,
478) {
479 if !text_options.detect_emails && !text_options.detect_urls {
480 return;
481 }
482
483 if text_options.detect_emails {
484 let config = DetectionConfig {
485 max_emails: text_options.max_emails,
486 max_urls: text_options.max_urls,
487 unique: false,
488 };
489 let emails = finder::find_emails(text_content, &config)
490 .into_iter()
491 .map(|d| OutputEmail {
492 email: d.email,
493 start_line: d.start_line,
494 end_line: d.end_line,
495 })
496 .collect::<Vec<_>>();
497 file_info_builder.emails(emails);
498 }
499
500 if text_options.detect_urls {
501 let config = DetectionConfig {
502 max_emails: text_options.max_emails,
503 max_urls: text_options.max_urls,
504 unique: true,
505 };
506 let urls = finder::find_urls(text_content, &config)
507 .into_iter()
508 .map(|d| OutputURL {
509 url: d.url,
510 start_line: d.start_line,
511 end_line: d.end_line,
512 })
513 .collect::<Vec<_>>();
514 file_info_builder.urls(urls);
515 }
516}
517
518fn extract_license_information(
519 file_info_builder: &mut FileInfoBuilder,
520 text_content: String,
521 license_engine: Option<Arc<LicenseDetectionEngine>>,
522 include_text: bool,
523 from_binary_strings: bool,
524) -> Result<(), Error> {
525 let Some(engine) = license_engine else {
526 return Ok(());
527 };
528
529 match engine.detect_with_kind(&text_content, false, from_binary_strings) {
530 Ok(detections) => {
531 let model_detections: Vec<LicenseDetection> = detections
532 .into_iter()
533 .filter_map(|d| convert_detection_to_model(d, include_text, &text_content))
534 .collect();
535
536 if !model_detections.is_empty() {
537 let expressions: Vec<String> = model_detections
538 .iter()
539 .filter(|d| !d.license_expression_spdx.is_empty())
540 .map(|d| d.license_expression_spdx.clone())
541 .collect();
542
543 if !expressions.is_empty() {
544 let combined = crate::utils::spdx::combine_license_expressions(expressions);
545 if let Some(expr) = combined {
546 file_info_builder.license_expression(Some(expr));
547 }
548 }
549 }
550
551 file_info_builder.license_detections(model_detections);
552 }
553 Err(e) => {
554 warn!("License detection failed: {}", e);
555 }
556 }
557
558 Ok(())
559}
560
561fn convert_detection_to_model(
562 detection: crate::license_detection::LicenseDetection,
563 include_text: bool,
564 text_content: &str,
565) -> Option<LicenseDetection> {
566 let license_expression = detection.license_expression?;
567 let license_expression_spdx = detection.license_expression_spdx.unwrap_or_default();
568
569 let matches: Vec<Match> = detection
570 .matches
571 .into_iter()
572 .map(|m| {
573 let matched_text = if include_text {
574 m.matched_text.or_else(|| {
575 Some(crate::license_detection::query::matched_text_from_text(
576 text_content,
577 m.start_line,
578 m.end_line,
579 ))
580 })
581 } else {
582 None
583 };
584 Match {
585 license_expression: m.license_expression,
586 license_expression_spdx: m.license_expression_spdx.unwrap_or_default(),
587 from_file: m.from_file,
588 start_line: m.start_line,
589 end_line: m.end_line,
590 matcher: Some(m.matcher.to_string()),
591 score: m.score as f64,
592 matched_length: Some(m.matched_length),
593 match_coverage: Some(m.match_coverage as f64),
594 rule_relevance: Some(m.rule_relevance as usize),
595 rule_identifier: Some(m.rule_identifier),
596 rule_url: Some(m.rule_url),
597 matched_text,
598 }
599 })
600 .collect();
601
602 Some(LicenseDetection {
603 license_expression,
604 license_expression_spdx,
605 matches,
606 identifier: detection.identifier,
607 })
608}
609
/// Decides whether text-based detection should be skipped for this file.
/// Currently skips only PEM certificate files, whose base64 bodies produce
/// spurious detections.
fn should_skip_text_detection(path: &Path, buffer: &[u8]) -> bool {
    is_pem_certificate_file(path, buffer)
}
613
/// Classifies a Go file as non-production when it is a test file
/// (`*_test.go`) or when one of its first ten lines carries a build
/// constraint mentioning the `test` tag.
///
/// Returns `Ok(false)` for non-`.go` paths without touching the filesystem;
/// I/O errors from reading the file are propagated.
fn is_go_non_production_source(path: &Path) -> std::io::Result<bool> {
    let is_go = path.extension().and_then(|ext| ext.to_str()) == Some("go");
    if !is_go {
        return Ok(false);
    }

    let file_name = path.file_name().and_then(|name| name.to_str());
    if matches!(file_name, Some(name) if name.ends_with("_test.go")) {
        return Ok(true);
    }

    // Build constraints must appear near the top of the file, so only the
    // first ten lines are inspected.
    let content = fs::read_to_string(path)?;
    for line in content.lines().take(10) {
        let trimmed = line.trim();
        let is_constraint =
            trimmed.starts_with("//go:build") || trimmed.starts_with("// +build");
        if is_constraint && trimmed.split_whitespace().any(|token| token == "test") {
            return Ok(true);
        }
    }
    Ok(false)
}
634
635fn is_pem_certificate_file(_path: &Path, buffer: &[u8]) -> bool {
636 let prefix_len = buffer.len().min(8192);
637 let prefix = String::from_utf8_lossy(&buffer[..prefix_len]);
638 let trimmed_lines: Vec<&str> = prefix
639 .lines()
640 .map(str::trim)
641 .filter(|line| !line.is_empty())
642 .take(64)
643 .collect();
644
645 PEM_CERTIFICATE_HEADERS.iter().any(|(begin, end)| {
646 trimmed_lines.iter().any(|line| line == begin)
647 && trimmed_lines.iter().any(|line| line == end)
648 })
649}
650
/// Builds a minimal `FileInfo` record for a directory entry.
///
/// Directories carry no content-derived fields: hashes, language, and all
/// detection results are left empty/None, and size is reported as 0
/// (the metadata length is not used for directories).
fn process_directory(path: &Path, metadata: &fs::Metadata) -> FileInfo {
    let name = path
        .file_name()
        .unwrap_or_default()
        .to_string_lossy()
        .to_string();
    // Directories have no extension; the base name equals the full name.
    let base_name = name.clone();
    FileInfo {
        name,
        base_name,
        extension: "".to_string(),
        path: path.to_string_lossy().to_string(),
        file_type: FileType::Directory,
        mime_type: None,
        size: 0,
        date: get_creation_date(metadata),
        sha1: None,
        md5: None,
        sha256: None,
        programming_language: None,
        package_data: Vec::new(),
        license_expression: None,
        copyrights: Vec::new(),
        holders: Vec::new(),
        authors: Vec::new(),
        emails: Vec::new(),
        license_detections: Vec::new(),
        urls: Vec::new(),
        for_packages: Vec::new(),
        scan_errors: Vec::new(),
        is_source: None,
        source_count: None,
        is_legal: false,
        is_manifest: false,
        is_readme: false,
        is_top_level: false,
        is_key_file: false,
        is_community: false,
        is_generated: None,
        facets: vec![],
        tallies: None,
    }
}
695
#[cfg(test)]
mod tests {
    use super::is_go_non_production_source;
    use std::fs;
    use tempfile::tempdir;

    // A *_test.go filename alone marks the file as non-production.
    #[test]
    fn test_is_go_non_production_source_for_test_filename() {
        let dir = tempdir().unwrap();
        let file = dir.path().join("scanner_test.go");
        fs::write(&file, "package scanner\n").unwrap();

        assert!(is_go_non_production_source(&file).unwrap());
    }

    // A `test` build tag in the header marks the file as non-production.
    #[test]
    fn test_is_go_non_production_source_for_build_tag() {
        let dir = tempdir().unwrap();
        let file = dir.path().join("scanner.go");
        fs::write(&file, "//go:build test\n\npackage scanner\n").unwrap();

        assert!(is_go_non_production_source(&file).unwrap());
    }

    // A plain Go source file is production code.
    #[test]
    fn test_is_go_non_production_source_for_regular_go_file() {
        let dir = tempdir().unwrap();
        let file = dir.path().join("scanner.go");
        fs::write(&file, "package scanner\n").unwrap();

        assert!(!is_go_non_production_source(&file).unwrap());
    }
}
728}