1use std::fs;
2use std::path::Path;
3use std::sync::Arc;
4use std::time::{Duration, Instant};
5
6use anyhow::Error;
7use glob::Pattern;
8use log::warn;
9use mime_guess::from_path;
10use rayon::prelude::*;
11
12use crate::askalono::{ScanStrategy, TextData};
13use crate::cache::{CachedScanFindings, read_cached_findings, write_cached_findings};
14use crate::copyright::{
15 self, AuthorDetection, CopyrightDetection, CopyrightDetectionOptions, HolderDetection,
16};
17use crate::finder::{self, DetectionConfig};
18use crate::models::{
19 Author, Copyright, FileInfo, FileInfoBuilder, FileType, Holder, LicenseDetection, Match,
20 OutputEmail, OutputURL,
21};
22use crate::parsers::try_parse_file;
23use crate::progress::ScanProgress;
24use crate::scanner::{ProcessResult, TextDetectionOptions};
25use crate::utils::file::{
26 ExtractedTextKind, extract_text_for_detection, get_creation_date, is_path_excluded,
27};
28use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha256};
29use crate::utils::language::detect_language;
30
/// PEM begin/end marker pairs that identify certificate payloads.
/// Files containing a matching pair are skipped by text detection
/// (see `is_pem_certificate_file`).
const PEM_CERTIFICATE_HEADERS: &[(&str, &str)] = &[
    ("-----BEGIN CERTIFICATE-----", "-----END CERTIFICATE-----"),
    (
        "-----BEGIN TRUSTED CERTIFICATE-----",
        "-----END TRUSTED CERTIFICATE-----",
    ),
];
38
/// Recursively scans `path` using the default [`TextDetectionOptions`].
///
/// Thin wrapper around [`process_with_options`]; see that function for the
/// meaning of the remaining parameters.
pub fn process<P: AsRef<Path>>(
    path: P,
    max_depth: usize,
    progress: Arc<ScanProgress>,
    exclude_patterns: &[Pattern],
    scan_strategy: &ScanStrategy,
) -> Result<ProcessResult, Error> {
    process_with_options(
        path,
        max_depth,
        progress,
        exclude_patterns,
        scan_strategy,
        &TextDetectionOptions::default(),
    )
}
59
/// Recursively scans `path`, producing one `FileInfo` per file and directory.
///
/// `max_depth` limits recursion depth below `path`; `0` means unlimited.
/// `exclude_patterns` are glob patterns whose matches are skipped and only
/// counted in `ProcessResult::excluded_count`. `progress` receives per-file
/// completion and runtime-error notifications. `text_options` controls which
/// text detections run, their limits, and the per-file timeout.
pub fn process_with_options<P: AsRef<Path>>(
    path: P,
    max_depth: usize,
    progress: Arc<ScanProgress>,
    exclude_patterns: &[Pattern],
    scan_strategy: &ScanStrategy,
    text_options: &TextDetectionOptions,
) -> Result<ProcessResult, Error> {
    // `0` from the CLI is translated to "no depth limit".
    let depth_limit = depth_limit_from_cli(max_depth);
    process_with_options_internal(
        path.as_ref(),
        depth_limit,
        progress,
        exclude_patterns,
        scan_strategy,
        text_options,
    )
}
78
/// Converts the CLI `max_depth` value into an internal depth limit:
/// `0` means "no limit" (`None`), any other value becomes `Some(value)`.
fn depth_limit_from_cli(max_depth: usize) -> Option<usize> {
    (max_depth != 0).then_some(max_depth)
}
86
/// Recursive worker behind [`process_with_options`].
///
/// Scans the directory at `path`: regular files are processed in parallel
/// with rayon, subdirectories are handled sequentially and recursed into
/// while `depth_limit` allows (`None` means unlimited). Exclusion patterns
/// apply both to `path` itself and to every entry; excluded entries are only
/// counted. Failures inside a subdirectory are reported via `progress`
/// rather than aborting the rest of the scan.
fn process_with_options_internal(
    path: &Path,
    depth_limit: Option<usize>,
    progress: Arc<ScanProgress>,
    exclude_patterns: &[Pattern],
    scan_strategy: &ScanStrategy,
    text_options: &TextDetectionOptions,
) -> Result<ProcessResult, Error> {
    // An excluded root yields no files and counts as a single excluded entry.
    if is_path_excluded(path, exclude_patterns) {
        return Ok(ProcessResult {
            files: Vec::new(),
            excluded_count: 1,
        });
    }

    let mut all_files = Vec::new();
    let mut total_excluded = 0;

    // Unreadable directory entries are dropped silently; failure to read the
    // directory itself propagates via `?`.
    let entries: Vec<_> = fs::read_dir(path)?.filter_map(Result::ok).collect();

    let mut file_entries = Vec::new();
    let mut dir_entries = Vec::new();

    for entry in entries {
        let path = entry.path();

        if is_path_excluded(&path, exclude_patterns) {
            total_excluded += 1;
            continue;
        }

        // Entries whose metadata cannot be read (e.g. broken symlinks) are
        // skipped entirely.
        match fs::metadata(&path) {
            Ok(metadata) if metadata.is_file() => file_entries.push((path, metadata)),
            Ok(metadata) if path.is_dir() => dir_entries.push((path, metadata)),
            _ => continue,
        }
    }

    // Process this directory's regular files in parallel, reporting each
    // completion (and any per-file scan errors) to the progress tracker.
    all_files.append(
        &mut file_entries
            .par_iter()
            .map(|(path, metadata)| {
                let file_entry = process_file(path, metadata, scan_strategy, text_options);
                progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
                file_entry
            })
            .collect(),
    );

    for (path, metadata) in dir_entries {
        // A directory gets its own FileInfo record even when recursion stops
        // at the depth limit.
        all_files.push(process_directory(&path, &metadata));

        let should_recurse = match depth_limit {
            None => true,
            Some(remaining_depth) => remaining_depth > 0,
        };

        if should_recurse {
            let next_depth_limit = depth_limit.map(|remaining_depth| remaining_depth - 1);
            match process_with_options_internal(
                &path,
                next_depth_limit,
                progress.clone(),
                exclude_patterns,
                scan_strategy,
                text_options,
            ) {
                Ok(mut result) => {
                    all_files.append(&mut result.files);
                    total_excluded += result.excluded_count;
                }
                // A failing subtree is recorded as a runtime error but does
                // not abort the scan of its siblings.
                Err(e) => progress.record_runtime_error(&path, &e.to_string()),
            }
        }
    }

    Ok(ProcessResult {
        files: all_files,
        excluded_count: total_excluded,
    })
}
172
/// Scans a single regular file and assembles its `FileInfo` record.
///
/// Content-derived data (hashes, language, package/license/copyright/email/
/// URL detections) is extracted first; any error there is recorded in
/// `scan_errors` instead of failing the file. Name/path/MIME/size/date
/// metadata is always filled in. Go sources that look test-only are demoted
/// to `is_source = Some(false)`, and — when a scan cache directory is
/// configured — error-free results are persisted keyed by the file's SHA-256
/// plus an options fingerprint.
fn process_file(
    path: &Path,
    metadata: &fs::Metadata,
    scan_strategy: &ScanStrategy,
    text_options: &TextDetectionOptions,
) -> FileInfo {
    let mut scan_errors: Vec<String> = vec![];
    let mut file_info_builder = FileInfoBuilder::default();

    let started = Instant::now();

    // Content-extraction failures become scan errors on the record rather
    // than aborting the file.
    if let Err(e) =
        extract_information_from_content(&mut file_info_builder, path, scan_strategy, text_options)
    {
        scan_errors.push(e.to_string());
    };

    if is_timeout_exceeded(started, text_options.timeout_seconds) {
        scan_errors.push(format!(
            "Processing interrupted due to timeout after {:.2} seconds",
            text_options.timeout_seconds
        ));
    }

    let mut file_info = file_info_builder
        // NOTE(review): `file_name()` is assumed non-None — callers pass
        // paths obtained from directory entries; confirm no root paths reach
        // this function.
        .name(path.file_name().unwrap().to_string_lossy().to_string())
        .base_name(
            path.file_stem()
                .unwrap_or_default()
                .to_string_lossy()
                .to_string(),
        )
        .extension(
            path.extension()
                .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
        )
        .path(path.to_string_lossy().to_string())
        .file_type(FileType::File)
        .mime_type(Some(
            from_path(path)
                .first_or_octet_stream()
                .essence_str()
                .to_string(),
        ))
        .size(metadata.len())
        .date(get_creation_date(metadata))
        .scan_errors(scan_errors)
        .build()
        .expect("FileInformationBuild not completely initialized");

    // Go test files and test-tagged sources do not count as production code.
    if file_info.programming_language.as_deref() == Some("Go")
        && is_go_non_production_source(path).unwrap_or(false)
    {
        file_info.is_source = Some(false);
    }

    // Persist findings to the scan cache only when the scan was error-free,
    // so failed scans are retried on the next run.
    if let (Some(scan_results_dir), Some(sha256)) = (
        text_options.scan_cache_dir.as_deref(),
        file_info.sha256.as_deref(),
    ) && file_info.scan_errors.is_empty()
    {
        let findings = CachedScanFindings::from_file_info(&file_info);
        let options_fingerprint = scan_cache_fingerprint(text_options);
        if let Err(err) =
            write_cached_findings(scan_results_dir, sha256, &options_fingerprint, &findings)
        {
            file_info
                .scan_errors
                .push(format!("Failed to write scan cache entry: {err}"));
        }
    }

    file_info
}
247
/// Extracts all content-derived information for `path` into the builder:
/// hashes, programming language, cached or freshly computed package data,
/// copyrights, emails/URLs, and license detections.
///
/// The timeout is checked between the expensive phases; exceeding it aborts
/// with an error naming the phase that was running. PEM certificate files
/// skip text detection entirely, and a scan-cache hit short-circuits all
/// detection work.
fn extract_information_from_content(
    file_info_builder: &mut FileInfoBuilder,
    path: &Path,
    scan_strategy: &ScanStrategy,
    text_options: &TextDetectionOptions,
) -> Result<(), Error> {
    let started = Instant::now();
    let buffer = fs::read(path)?;

    if is_timeout_exceeded(started, text_options.timeout_seconds) {
        return Err(Error::msg(format!(
            "Timeout while reading file content (> {:.2}s)",
            text_options.timeout_seconds
        )));
    }

    let sha256 = calculate_sha256(&buffer);

    // Hashes and language are always recorded, even for files that skip
    // text detection below.
    file_info_builder
        .sha1(Some(calculate_sha1(&buffer)))
        .md5(Some(calculate_md5(&buffer)))
        .sha256(Some(sha256.clone()))
        .programming_language(Some(detect_language(path, &buffer)));

    if should_skip_text_detection(path, &buffer) {
        return Ok(());
    }

    // A cache hit (keyed by content hash + options fingerprint) replaces all
    // detection work; cache read failures only log a warning and fall
    // through to a fresh scan.
    if let Some(scan_results_dir) = text_options.scan_cache_dir.as_deref() {
        let options_fingerprint = scan_cache_fingerprint(text_options);
        match read_cached_findings(scan_results_dir, &sha256, &options_fingerprint) {
            Ok(Some(findings)) => {
                file_info_builder
                    .package_data(findings.package_data)
                    .license_expression(findings.license_expression)
                    .license_detections(findings.license_detections)
                    .copyrights(findings.copyrights)
                    .holders(findings.holders)
                    .authors(findings.authors)
                    .emails(findings.emails)
                    .urls(findings.urls)
                    .programming_language(findings.programming_language);
                return Ok(());
            }
            Ok(None) => {}
            Err(err) => {
                warn!("Failed to read scan cache for {:?}: {}", path, err);
            }
        }
    }

    // Recognized package manifests contribute parsed package data.
    if let Some(package_data) = try_parse_file(path) {
        file_info_builder.package_data(package_data);
    }

    if is_timeout_exceeded(started, text_options.timeout_seconds) {
        return Err(Error::msg(format!(
            "Timeout while extracting package/text metadata (> {:.2}s)",
            text_options.timeout_seconds
        )));
    }

    // Binary files yield printable-string extractions, which are noisier and
    // receive stricter post-filtering during copyright detection.
    let (text_content, text_kind) = extract_text_for_detection(path, &buffer);
    let from_binary_strings = matches!(text_kind, ExtractedTextKind::BinaryStrings);

    if is_timeout_exceeded(started, text_options.timeout_seconds) {
        return Err(Error::msg(format!(
            "Timeout while extracting text content (> {:.2}s)",
            text_options.timeout_seconds
        )));
    }

    if text_content.is_empty() {
        return Ok(());
    }

    if text_options.detect_copyrights {
        extract_copyright_information(
            file_info_builder,
            path,
            &text_content,
            text_options.timeout_seconds,
            from_binary_strings,
        );
    }
    extract_email_url_information(file_info_builder, &text_content, text_options);

    if is_timeout_exceeded(started, text_options.timeout_seconds) {
        return Err(Error::msg(format!(
            "Timeout before license scan (> {:.2}s)",
            text_options.timeout_seconds
        )));
    }

    extract_license_information(file_info_builder, text_content, scan_strategy)
}
346
/// Returns `true` once more than `timeout_seconds` have elapsed since
/// `started`. Non-positive, NaN, or infinite timeouts disable the check.
fn is_timeout_exceeded(started: Instant, timeout_seconds: f64) -> bool {
    if !timeout_seconds.is_finite() || timeout_seconds <= 0.0 {
        return false;
    }
    started.elapsed().as_secs_f64() > timeout_seconds
}
352
/// Builds the fingerprint string that keys scan-cache entries alongside the
/// file hash, so cached findings are reused only when they were produced
/// with the same detection options (toggles, limits, and timeout).
///
/// The format string is part of the cache key and must stay stable.
fn scan_cache_fingerprint(text_options: &TextDetectionOptions) -> String {
    format!(
        "copyrights={};emails={};urls={};max_emails={};max_urls={};timeout={:.6}",
        text_options.detect_copyrights,
        text_options.detect_emails,
        text_options.detect_urls,
        text_options.max_emails,
        text_options.max_urls,
        text_options.timeout_seconds,
    )
}
364
/// Detects copyrights, holders, and authors in `text_content` and stores
/// them on the builder.
///
/// Credits-style files are handled by a dedicated author extractor first; if
/// it finds authors, generic copyright detection is skipped. Detections from
/// binary "strings" extractions are pruned with stricter heuristics. A
/// finite positive `timeout_seconds` caps the detector's runtime.
fn extract_copyright_information(
    file_info_builder: &mut FileInfoBuilder,
    path: &Path,
    text_content: &str,
    timeout_seconds: f64,
    from_binary_strings: bool,
) {
    if copyright::is_credits_file(path) {
        let author_detections = copyright::detect_credits_authors(text_content);
        if !author_detections.is_empty() {
            file_info_builder.authors(
                author_detections
                    .into_iter()
                    .map(|a| Author {
                        author: a.author,
                        start_line: a.start_line,
                        end_line: a.end_line,
                    })
                    .collect(),
            );
            // Credits parsing succeeded — skip generic copyright detection.
            return;
        }
    }

    let copyright_options = CopyrightDetectionOptions {
        // Only finite positive timeouts are forwarded; anything else means
        // "no runtime limit".
        max_runtime: if timeout_seconds.is_finite() && timeout_seconds > 0.0 {
            Some(Duration::from_secs_f64(timeout_seconds))
        } else {
            None
        },
        ..CopyrightDetectionOptions::default()
    };

    let (copyrights, holders, authors) =
        copyright::detect_copyrights_with_options(text_content, &copyright_options);
    // Binary-string extractions are noisy, so their detections are filtered
    // down to plausible copyright statements.
    let (copyrights, holders, authors) = if from_binary_strings {
        prune_binary_string_detections(copyrights, holders, authors)
    } else {
        (copyrights, holders, authors)
    };

    file_info_builder.copyrights(
        copyrights
            .into_iter()
            .map(|c| Copyright {
                copyright: c.copyright,
                start_line: c.start_line,
                end_line: c.end_line,
            })
            .collect::<Vec<Copyright>>(),
    );
    file_info_builder.holders(
        holders
            .into_iter()
            .map(|h| Holder {
                holder: h.holder,
                start_line: h.start_line,
                end_line: h.end_line,
            })
            .collect::<Vec<Holder>>(),
    );
    file_info_builder.authors(
        authors
            .into_iter()
            .map(|a| Author {
                author: a.author,
                start_line: a.start_line,
                end_line: a.end_line,
            })
            .collect::<Vec<Author>>(),
    );
}
438
439fn prune_binary_string_detections(
440 copyrights: Vec<CopyrightDetection>,
441 holders: Vec<HolderDetection>,
442 _authors: Vec<AuthorDetection>,
443) -> (
444 Vec<CopyrightDetection>,
445 Vec<HolderDetection>,
446 Vec<AuthorDetection>,
447) {
448 let kept_copyrights: Vec<CopyrightDetection> = copyrights
449 .into_iter()
450 .filter(|c| is_binary_string_copyright_candidate(&c.copyright))
451 .collect();
452
453 let kept_holders: Vec<HolderDetection> = holders
454 .into_iter()
455 .filter(|holder| {
456 kept_copyrights.iter().any(|copyright| {
457 ranges_overlap(
458 holder.start_line,
459 holder.end_line,
460 copyright.start_line,
461 copyright.end_line,
462 )
463 })
464 })
465 .collect();
466
467 (kept_copyrights, kept_holders, Vec::new())
468}
469
/// True when the inclusive line ranges `[a_start, a_end]` and
/// `[b_start, b_end]` share at least one line.
fn ranges_overlap(a_start: usize, a_end: usize, b_start: usize, b_end: usize) -> bool {
    !(a_end < b_start || b_end < a_start)
}
473
474fn is_binary_string_copyright_candidate(text: &str) -> bool {
475 if has_explicit_copyright_marker(text) || contains_year(text) {
476 return true;
477 }
478
479 let lower = text.to_ascii_lowercase();
480 let Some(tail) = lower.strip_prefix("copyright") else {
481 return true;
482 };
483 let tail = tail.trim();
484 let alpha_tokens: Vec<&str> = tail
485 .split_whitespace()
486 .filter(|token| token.chars().any(|c| c.is_alphabetic()))
487 .collect();
488
489 if alpha_tokens.len() <= 1 {
490 return true;
491 }
492
493 if tail.contains(',') || tail.contains(" and ") || tail.contains('&') {
494 return true;
495 }
496
497 alpha_tokens
498 .iter()
499 .any(|token| is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric())))
500}
501
/// True when the text carries an explicit copyright marker: "(c)", the ©
/// sign, or a "copr" abbreviation (matched case-insensitively).
fn has_explicit_copyright_marker(text: &str) -> bool {
    let lowered = text.to_ascii_lowercase();
    ["(c)", "©", "copr"]
        .iter()
        .any(|&marker| lowered.contains(marker))
}
506
/// True when `text` contains a four-digit run that looks like a plausible
/// copyright year (19xx or 20xx).
///
/// The previous implementation tested the first two digits independently
/// (`1|2` then `9|0`), so non-years such as "1023" or "2999" were accepted;
/// the prefix is now matched as the pair "19" or "20".
fn contains_year(text: &str) -> bool {
    text.as_bytes().windows(4).any(|window| {
        window.iter().all(u8::is_ascii_digit)
            && matches!((window[0], window[1]), (b'1', b'9') | (b'2', b'0'))
    })
}
515
/// True when `token` (compared case-insensitively) is a common company or
/// organization suffix such as "Inc", "GmbH", or "Foundation".
fn is_company_like_suffix(token: &str) -> bool {
    const COMPANY_SUFFIXES: &[&str] = &[
        "inc",
        "corp",
        "corporation",
        "co",
        "company",
        "ltd",
        "llc",
        "gmbh",
        "foundation",
        "project",
        "systems",
        "software",
        "technologies",
        "technology",
    ];
    COMPANY_SUFFIXES.contains(&token.to_ascii_lowercase().as_str())
}
535
536fn extract_email_url_information(
537 file_info_builder: &mut FileInfoBuilder,
538 text_content: &str,
539 text_options: &TextDetectionOptions,
540) {
541 if !text_options.detect_emails && !text_options.detect_urls {
542 return;
543 }
544
545 if text_options.detect_emails {
546 let config = DetectionConfig {
547 max_emails: text_options.max_emails,
548 max_urls: text_options.max_urls,
549 unique: false,
550 };
551 let emails = finder::find_emails(text_content, &config)
552 .into_iter()
553 .map(|d| OutputEmail {
554 email: d.email,
555 start_line: d.start_line,
556 end_line: d.end_line,
557 })
558 .collect::<Vec<_>>();
559 file_info_builder.emails(emails);
560 }
561
562 if text_options.detect_urls {
563 let config = DetectionConfig {
564 max_emails: text_options.max_emails,
565 max_urls: text_options.max_urls,
566 unique: true,
567 };
568 let urls = finder::find_urls(text_content, &config)
569 .into_iter()
570 .map(|d| OutputURL {
571 url: d.url,
572 start_line: d.start_line,
573 end_line: d.end_line,
574 })
575 .collect::<Vec<_>>();
576 file_info_builder.urls(urls);
577 }
578}
579
580fn extract_license_information(
581 file_info_builder: &mut FileInfoBuilder,
582 text_content: String,
583 scan_strategy: &ScanStrategy,
584) -> Result<(), Error> {
585 if text_content.is_empty() || !scan_strategy.store_has_licenses() {
586 return Ok(());
587 }
588
589 let license_result = scan_strategy.scan(&TextData::from(text_content.as_str()))?;
590 let license_expr = license_result.license.map(|x| x.name.to_string());
591
592 let license_detections = license_result
593 .containing
594 .iter()
595 .map(|detection| {
596 let license_lower = detection.license.name.to_lowercase();
597 LicenseDetection {
598 license_expression: license_lower.clone(),
599 license_expression_spdx: detection.license.name.to_string(),
600 matches: vec![Match {
601 license_expression: license_lower.clone(),
602 license_expression_spdx: detection.license.name.to_string(),
603 from_file: None,
604 score: detection.score as f64,
605 start_line: detection.line_range.0,
606 end_line: detection.line_range.1,
607 matcher: Some("2-aho".to_string()),
608 matched_length: None,
609 match_coverage: None,
610 rule_relevance: None,
611 rule_identifier: None,
612 rule_url: None,
613 matched_text: None,
614 }],
615 identifier: None,
616 }
617 })
618 .collect::<Vec<_>>();
619
620 file_info_builder
621 .license_expression(license_expr)
622 .license_detections(license_detections);
623
624 Ok(())
625}
626
627fn should_skip_text_detection(path: &Path, buffer: &[u8]) -> bool {
628 is_pem_certificate_file(path, buffer)
629}
630
/// Reports whether a Go file is non-production source: either a
/// `*_test.go` file, or a file carrying a `test` build constraint
/// (`//go:build` or legacy `// +build`) within its first ten lines.
///
/// Non-`.go` paths return `Ok(false)` without touching the filesystem;
/// reading the file itself may fail with an I/O error.
fn is_go_non_production_source(path: &Path) -> std::io::Result<bool> {
    let is_go_file = path
        .extension()
        .and_then(|ext| ext.to_str())
        .map_or(false, |ext| ext == "go");
    if !is_go_file {
        return Ok(false);
    }

    let file_name = path.file_name().and_then(|name| name.to_str());
    if file_name.is_some_and(|name| name.ends_with("_test.go")) {
        return Ok(true);
    }

    let content = fs::read_to_string(path)?;
    for line in content.lines().take(10) {
        let line = line.trim();
        let is_build_tag = line.starts_with("//go:build") || line.starts_with("// +build");
        if is_build_tag && line.split_whitespace().any(|token| token == "test") {
            return Ok(true);
        }
    }
    Ok(false)
}
651
652fn is_pem_certificate_file(_path: &Path, buffer: &[u8]) -> bool {
653 let prefix_len = buffer.len().min(8192);
654 let prefix = String::from_utf8_lossy(&buffer[..prefix_len]);
655 let trimmed_lines: Vec<&str> = prefix
656 .lines()
657 .map(str::trim)
658 .filter(|line| !line.is_empty())
659 .take(64)
660 .collect();
661
662 PEM_CERTIFICATE_HEADERS.iter().any(|(begin, end)| {
663 trimmed_lines.iter().any(|line| line == begin)
664 && trimmed_lines.iter().any(|line| line == end)
665 })
666}
667
668fn process_directory(path: &Path, metadata: &fs::Metadata) -> FileInfo {
669 let name = path
670 .file_name()
671 .unwrap_or_default()
672 .to_string_lossy()
673 .to_string();
674 let base_name = name.clone(); FileInfo {
677 name,
678 base_name,
679 extension: "".to_string(),
680 path: path.to_string_lossy().to_string(),
681 file_type: FileType::Directory,
682 mime_type: None,
683 size: 0,
684 date: get_creation_date(metadata),
685 sha1: None,
686 md5: None,
687 sha256: None,
688 programming_language: None,
689 package_data: Vec::new(), license_expression: None,
691 copyrights: Vec::new(), holders: Vec::new(), authors: Vec::new(), emails: Vec::new(), license_detections: Vec::new(), urls: Vec::new(), for_packages: Vec::new(),
698 scan_errors: Vec::new(),
699 is_source: None,
700 source_count: None,
701 is_legal: false,
702 is_manifest: false,
703 is_readme: false,
704 is_top_level: false,
705 is_key_file: false,
706 }
707}
708
#[cfg(test)]
mod tests {
    use super::is_go_non_production_source;
    use std::fs;
    use tempfile::tempdir;

    // A `*_test.go` filename alone marks a file as non-production.
    #[test]
    fn test_is_go_non_production_source_for_test_filename() {
        let dir = tempdir().unwrap();
        let file = dir.path().join("scanner_test.go");
        fs::write(&file, "package scanner\n").unwrap();

        assert!(is_go_non_production_source(&file).unwrap());
    }

    // A `//go:build test` constraint marks a regular filename as
    // non-production.
    #[test]
    fn test_is_go_non_production_source_for_build_tag() {
        let dir = tempdir().unwrap();
        let file = dir.path().join("scanner.go");
        fs::write(&file, "//go:build test\n\npackage scanner\n").unwrap();

        assert!(is_go_non_production_source(&file).unwrap());
    }

    // Plain Go sources without test markers stay production code.
    #[test]
    fn test_is_go_non_production_source_for_regular_go_file() {
        let dir = tempdir().unwrap();
        let file = dir.path().join("scanner.go");
        fs::write(&file, "package scanner\n").unwrap();

        assert!(!is_go_non_production_source(&file).unwrap());
    }
}