// provenant/scanner/process.rs
use std::fs;
use std::io::{BufRead, BufReader};
use std::path::Path;
use std::sync::Arc;
use std::time::{Duration, Instant};

use anyhow::Error;
use glob::Pattern;
use log::warn;
use mime_guess::from_path;
use rayon::prelude::*;

use crate::askalono::{ScanStrategy, TextData};
use crate::cache::{CachedScanFindings, read_cached_findings, write_cached_findings};
use crate::copyright::{
    self, AuthorDetection, CopyrightDetection, CopyrightDetectionOptions, HolderDetection,
};
use crate::finder::{self, DetectionConfig};
use crate::models::{
    Author, Copyright, FileInfo, FileInfoBuilder, FileType, Holder, LicenseDetection, Match,
    OutputEmail, OutputURL,
};
use crate::parsers::try_parse_file;
use crate::progress::ScanProgress;
use crate::scanner::{ProcessResult, TextDetectionOptions};
use crate::utils::file::{
    ExtractedTextKind, extract_text_for_detection, get_creation_date, is_path_excluded,
};
use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha256};
use crate::utils::language::detect_language;
30
/// PEM begin/end marker pairs used to recognize certificate files; files that
/// match are skipped by text-based license/copyright detection entirely
/// (see `should_skip_text_detection` / `is_pem_certificate_file`).
const PEM_CERTIFICATE_HEADERS: &[(&str, &str)] = &[
    ("-----BEGIN CERTIFICATE-----", "-----END CERTIFICATE-----"),
    (
        "-----BEGIN TRUSTED CERTIFICATE-----",
        "-----END TRUSTED CERTIFICATE-----",
    ),
];
38
39/// Scan a directory tree and produce [`ProcessResult`] entries.
40///
41/// This traverses files/directories up to `max_depth`, applies exclusion
42/// patterns, extracts metadata, and performs license/copyright parsing.
43pub fn process<P: AsRef<Path>>(
44    path: P,
45    max_depth: usize,
46    progress: Arc<ScanProgress>,
47    exclude_patterns: &[Pattern],
48    scan_strategy: &ScanStrategy,
49) -> Result<ProcessResult, Error> {
50    process_with_options(
51        path,
52        max_depth,
53        progress,
54        exclude_patterns,
55        scan_strategy,
56        &TextDetectionOptions::default(),
57    )
58}
59
60pub fn process_with_options<P: AsRef<Path>>(
61    path: P,
62    max_depth: usize,
63    progress: Arc<ScanProgress>,
64    exclude_patterns: &[Pattern],
65    scan_strategy: &ScanStrategy,
66    text_options: &TextDetectionOptions,
67) -> Result<ProcessResult, Error> {
68    let depth_limit = depth_limit_from_cli(max_depth);
69    process_with_options_internal(
70        path.as_ref(),
71        depth_limit,
72        progress,
73        exclude_patterns,
74        scan_strategy,
75        text_options,
76    )
77}
78
/// Translate the CLI-style `max_depth` value into an optional recursion
/// budget: `0` means "no limit" (`None`), anything else is a finite limit.
fn depth_limit_from_cli(max_depth: usize) -> Option<usize> {
    (max_depth > 0).then_some(max_depth)
}
86
87fn process_with_options_internal(
88    path: &Path,
89    depth_limit: Option<usize>,
90    progress: Arc<ScanProgress>,
91    exclude_patterns: &[Pattern],
92    scan_strategy: &ScanStrategy,
93    text_options: &TextDetectionOptions,
94) -> Result<ProcessResult, Error> {
95    if is_path_excluded(path, exclude_patterns) {
96        return Ok(ProcessResult {
97            files: Vec::new(),
98            excluded_count: 1,
99        });
100    }
101
102    let mut all_files = Vec::new();
103    let mut total_excluded = 0;
104
105    // Read directory entries and group by exclusion status and type
106    let entries: Vec<_> = fs::read_dir(path)?.filter_map(Result::ok).collect();
107
108    let mut file_entries = Vec::new();
109    let mut dir_entries = Vec::new();
110
111    for entry in entries {
112        let path = entry.path();
113
114        // Check exclusion only once per path
115        if is_path_excluded(&path, exclude_patterns) {
116            total_excluded += 1;
117            continue;
118        }
119
120        match fs::metadata(&path) {
121            Ok(metadata) if metadata.is_file() => file_entries.push((path, metadata)),
122            Ok(metadata) if path.is_dir() => dir_entries.push((path, metadata)),
123            _ => continue,
124        }
125    }
126
127    // Process files in parallel
128    all_files.append(
129        &mut file_entries
130            .par_iter()
131            .map(|(path, metadata)| {
132                let file_entry = process_file(path, metadata, scan_strategy, text_options);
133                progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
134                file_entry
135            })
136            .collect(),
137    );
138
139    // Process directories
140    for (path, metadata) in dir_entries {
141        all_files.push(process_directory(&path, &metadata));
142
143        let should_recurse = match depth_limit {
144            None => true,
145            Some(remaining_depth) => remaining_depth > 0,
146        };
147
148        if should_recurse {
149            let next_depth_limit = depth_limit.map(|remaining_depth| remaining_depth - 1);
150            match process_with_options_internal(
151                &path,
152                next_depth_limit,
153                progress.clone(),
154                exclude_patterns,
155                scan_strategy,
156                text_options,
157            ) {
158                Ok(mut result) => {
159                    all_files.append(&mut result.files);
160                    total_excluded += result.excluded_count;
161                }
162                Err(e) => progress.record_runtime_error(&path, &e.to_string()),
163            }
164        }
165    }
166
167    Ok(ProcessResult {
168        files: all_files,
169        excluded_count: total_excluded,
170    })
171}
172
/// Build the complete [`FileInfo`] record for a single regular file.
///
/// Content-level extraction failures and timeout overruns are collected into
/// the record's `scan_errors` instead of aborting the scan.
fn process_file(
    path: &Path,
    metadata: &fs::Metadata,
    scan_strategy: &ScanStrategy,
    text_options: &TextDetectionOptions,
) -> FileInfo {
    let mut scan_errors: Vec<String> = vec![];
    let mut file_info_builder = FileInfoBuilder::default();

    let started = Instant::now();

    if let Err(e) =
        extract_information_from_content(&mut file_info_builder, path, scan_strategy, text_options)
    {
        scan_errors.push(e.to_string());
    };

    // Timeout is detected after the fact; partial results gathered so far are kept.
    if is_timeout_exceeded(started, text_options.timeout_seconds) {
        scan_errors.push(format!(
            "Processing interrupted due to timeout after {:.2} seconds",
            text_options.timeout_seconds
        ));
    }

    let mut file_info = file_info_builder
        // `file_name()` is only `None` for paths ending in `..` — the
        // directory walk presumably never produces such a path; confirm.
        .name(path.file_name().unwrap().to_string_lossy().to_string())
        .base_name(
            path.file_stem()
                .unwrap_or_default()
                .to_string_lossy()
                .to_string(),
        )
        // Extension is stored with a leading dot, or "" when absent.
        .extension(
            path.extension()
                .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
        )
        .path(path.to_string_lossy().to_string())
        .file_type(FileType::File)
        .mime_type(Some(
            from_path(path)
                .first_or_octet_stream()
                .essence_str()
                .to_string(),
        ))
        .size(metadata.len())
        .date(get_creation_date(metadata))
        .scan_errors(scan_errors)
        .build()
        .expect("FileInformationBuild not completely initialized");

    // Go test files and test-tagged sources are not production source code.
    if file_info.programming_language.as_deref() == Some("Go")
        && is_go_non_production_source(path).unwrap_or(false)
    {
        file_info.is_source = Some(false);
    }

    // Persist findings to the scan cache (keyed by content hash + options
    // fingerprint) — but only for clean scans, so errors are never cached.
    if let (Some(scan_results_dir), Some(sha256)) = (
        text_options.scan_cache_dir.as_deref(),
        file_info.sha256.as_deref(),
    ) && file_info.scan_errors.is_empty()
    {
        let findings = CachedScanFindings::from_file_info(&file_info);
        let options_fingerprint = scan_cache_fingerprint(text_options);
        if let Err(err) =
            write_cached_findings(scan_results_dir, sha256, &options_fingerprint, &findings)
        {
            file_info
                .scan_errors
                .push(format!("Failed to write scan cache entry: {err}"));
        }
    }

    file_info
}
247
/// Populate the content-derived fields (hashes, language, package data, and
/// text detections) on the builder for the file at `path`.
///
/// Stages run in order: read + hash, cache lookup, package parsing, text
/// extraction, copyright/email/url detection, license detection. A timeout
/// check sits between stages; on timeout an error is returned but whatever
/// was already written to the builder is kept.
fn extract_information_from_content(
    file_info_builder: &mut FileInfoBuilder,
    path: &Path,
    scan_strategy: &ScanStrategy,
    text_options: &TextDetectionOptions,
) -> Result<(), Error> {
    let started = Instant::now();
    let buffer = fs::read(path)?;

    if is_timeout_exceeded(started, text_options.timeout_seconds) {
        return Err(Error::msg(format!(
            "Timeout while reading file content (> {:.2}s)",
            text_options.timeout_seconds
        )));
    }

    let sha256 = calculate_sha256(&buffer);

    // Hashes and language are always computed, even when the scan cache hits.
    file_info_builder
        .sha1(Some(calculate_sha1(&buffer)))
        .md5(Some(calculate_md5(&buffer)))
        .sha256(Some(sha256.clone()))
        .programming_language(Some(detect_language(path, &buffer)));

    // e.g. PEM certificates: no license/copyright text worth scanning.
    if should_skip_text_detection(path, &buffer) {
        return Ok(());
    }

    // Try the scan cache first, keyed by content hash + options fingerprint.
    if let Some(scan_results_dir) = text_options.scan_cache_dir.as_deref() {
        let options_fingerprint = scan_cache_fingerprint(text_options);
        match read_cached_findings(scan_results_dir, &sha256, &options_fingerprint) {
            Ok(Some(findings)) => {
                file_info_builder
                    .package_data(findings.package_data)
                    .license_expression(findings.license_expression)
                    .license_detections(findings.license_detections)
                    .copyrights(findings.copyrights)
                    .holders(findings.holders)
                    .authors(findings.authors)
                    .emails(findings.emails)
                    .urls(findings.urls)
                    .programming_language(findings.programming_language);
                return Ok(());
            }
            Ok(None) => {}
            // Cache problems are non-fatal: log and fall through to a full scan.
            Err(err) => {
                warn!("Failed to read scan cache for {:?}: {}", path, err);
            }
        }
    }

    // Package parsing and text-based detection (copyright, license) are independent.
    // Python ScanCode runs all enabled plugins on every file, so we do the same.
    if let Some(package_data) = try_parse_file(path) {
        file_info_builder.package_data(package_data);
    }

    if is_timeout_exceeded(started, text_options.timeout_seconds) {
        return Err(Error::msg(format!(
            "Timeout while extracting package/text metadata (> {:.2}s)",
            text_options.timeout_seconds
        )));
    }

    let (text_content, text_kind) = extract_text_for_detection(path, &buffer);
    let from_binary_strings = matches!(text_kind, ExtractedTextKind::BinaryStrings);

    if is_timeout_exceeded(started, text_options.timeout_seconds) {
        return Err(Error::msg(format!(
            "Timeout while extracting text content (> {:.2}s)",
            text_options.timeout_seconds
        )));
    }

    if text_content.is_empty() {
        return Ok(());
    }

    if text_options.detect_copyrights {
        extract_copyright_information(
            file_info_builder,
            path,
            &text_content,
            text_options.timeout_seconds,
            from_binary_strings,
        );
    }
    extract_email_url_information(file_info_builder, &text_content, text_options);

    if is_timeout_exceeded(started, text_options.timeout_seconds) {
        return Err(Error::msg(format!(
            "Timeout before license scan (> {:.2}s)",
            text_options.timeout_seconds
        )));
    }

    extract_license_information(file_info_builder, text_content, scan_strategy)
}
346
/// Whether the elapsed time since `started` exceeds `timeout_seconds`.
///
/// A non-finite or non-positive timeout disables the check (always `false`).
fn is_timeout_exceeded(started: Instant, timeout_seconds: f64) -> bool {
    if !timeout_seconds.is_finite() || timeout_seconds <= 0.0 {
        return false;
    }
    started.elapsed().as_secs_f64() > timeout_seconds
}
352
353fn scan_cache_fingerprint(text_options: &TextDetectionOptions) -> String {
354    format!(
355        "copyrights={};emails={};urls={};max_emails={};max_urls={};timeout={:.6}",
356        text_options.detect_copyrights,
357        text_options.detect_emails,
358        text_options.detect_urls,
359        text_options.max_emails,
360        text_options.max_urls,
361        text_options.timeout_seconds,
362    )
363}
364
365fn extract_copyright_information(
366    file_info_builder: &mut FileInfoBuilder,
367    path: &Path,
368    text_content: &str,
369    timeout_seconds: f64,
370    from_binary_strings: bool,
371) {
372    // CREDITS files get special handling (Linux kernel style).
373    if copyright::is_credits_file(path) {
374        let author_detections = copyright::detect_credits_authors(text_content);
375        if !author_detections.is_empty() {
376            file_info_builder.authors(
377                author_detections
378                    .into_iter()
379                    .map(|a| Author {
380                        author: a.author,
381                        start_line: a.start_line,
382                        end_line: a.end_line,
383                    })
384                    .collect(),
385            );
386            return;
387        }
388    }
389
390    let copyright_options = CopyrightDetectionOptions {
391        max_runtime: if timeout_seconds.is_finite() && timeout_seconds > 0.0 {
392            Some(Duration::from_secs_f64(timeout_seconds))
393        } else {
394            None
395        },
396        ..CopyrightDetectionOptions::default()
397    };
398
399    let (copyrights, holders, authors) =
400        copyright::detect_copyrights_with_options(text_content, &copyright_options);
401    let (copyrights, holders, authors) = if from_binary_strings {
402        prune_binary_string_detections(copyrights, holders, authors)
403    } else {
404        (copyrights, holders, authors)
405    };
406
407    file_info_builder.copyrights(
408        copyrights
409            .into_iter()
410            .map(|c| Copyright {
411                copyright: c.copyright,
412                start_line: c.start_line,
413                end_line: c.end_line,
414            })
415            .collect::<Vec<Copyright>>(),
416    );
417    file_info_builder.holders(
418        holders
419            .into_iter()
420            .map(|h| Holder {
421                holder: h.holder,
422                start_line: h.start_line,
423                end_line: h.end_line,
424            })
425            .collect::<Vec<Holder>>(),
426    );
427    file_info_builder.authors(
428        authors
429            .into_iter()
430            .map(|a| Author {
431                author: a.author,
432                start_line: a.start_line,
433                end_line: a.end_line,
434            })
435            .collect::<Vec<Author>>(),
436    );
437}
438
439fn prune_binary_string_detections(
440    copyrights: Vec<CopyrightDetection>,
441    holders: Vec<HolderDetection>,
442    _authors: Vec<AuthorDetection>,
443) -> (
444    Vec<CopyrightDetection>,
445    Vec<HolderDetection>,
446    Vec<AuthorDetection>,
447) {
448    let kept_copyrights: Vec<CopyrightDetection> = copyrights
449        .into_iter()
450        .filter(|c| is_binary_string_copyright_candidate(&c.copyright))
451        .collect();
452
453    let kept_holders: Vec<HolderDetection> = holders
454        .into_iter()
455        .filter(|holder| {
456            kept_copyrights.iter().any(|copyright| {
457                ranges_overlap(
458                    holder.start_line,
459                    holder.end_line,
460                    copyright.start_line,
461                    copyright.end_line,
462                )
463            })
464        })
465        .collect();
466
467    (kept_copyrights, kept_holders, Vec::new())
468}
469
/// Whether the inclusive line ranges `[a_start, a_end]` and
/// `[b_start, b_end]` share at least one line.
fn ranges_overlap(a_start: usize, a_end: usize, b_start: usize, b_end: usize) -> bool {
    !(a_end < b_start || b_end < a_start)
}
473
474fn is_binary_string_copyright_candidate(text: &str) -> bool {
475    if has_explicit_copyright_marker(text) || contains_year(text) {
476        return true;
477    }
478
479    let lower = text.to_ascii_lowercase();
480    let Some(tail) = lower.strip_prefix("copyright") else {
481        return true;
482    };
483    let tail = tail.trim();
484    let alpha_tokens: Vec<&str> = tail
485        .split_whitespace()
486        .filter(|token| token.chars().any(|c| c.is_alphabetic()))
487        .collect();
488
489    if alpha_tokens.len() <= 1 {
490        return true;
491    }
492
493    if tail.contains(',') || tail.contains(" and ") || tail.contains('&') {
494        return true;
495    }
496
497    alpha_tokens
498        .iter()
499        .any(|token| is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric())))
500}
501
/// Whether `text` carries an explicit copyright marker: "(c)", the © sign,
/// or a "copr" abbreviation (case-insensitive).
fn has_explicit_copyright_marker(text: &str) -> bool {
    let lowered = text.to_ascii_lowercase();
    ["(c)", "copr"]
        .iter()
        .any(|marker| lowered.contains(*marker))
        || lowered.contains('©')
}
506
/// Whether `text` contains something that looks like a copyright year:
/// four consecutive ASCII digits starting with "19" or "20".
///
/// The previous implementation checked the first two digits independently
/// (`1|2` then `9|0`), which wrongly accepted "10xx" and "29xx" as years.
fn contains_year(text: &str) -> bool {
    text.as_bytes().windows(4).any(|window| {
        window.iter().all(|b| b.is_ascii_digit())
            && (window.starts_with(b"19") || window.starts_with(b"20"))
    })
}
515
/// Whether `token` (case-insensitive) is a common company/organization
/// suffix such as "Inc", "GmbH", or "Foundation".
fn is_company_like_suffix(token: &str) -> bool {
    const COMPANY_SUFFIXES: &[&str] = &[
        "inc",
        "corp",
        "corporation",
        "co",
        "company",
        "ltd",
        "llc",
        "gmbh",
        "foundation",
        "project",
        "systems",
        "software",
        "technologies",
        "technology",
    ];
    COMPANY_SUFFIXES.contains(&token.to_ascii_lowercase().as_str())
}
535
536fn extract_email_url_information(
537    file_info_builder: &mut FileInfoBuilder,
538    text_content: &str,
539    text_options: &TextDetectionOptions,
540) {
541    if !text_options.detect_emails && !text_options.detect_urls {
542        return;
543    }
544
545    if text_options.detect_emails {
546        let config = DetectionConfig {
547            max_emails: text_options.max_emails,
548            max_urls: text_options.max_urls,
549            unique: false,
550        };
551        let emails = finder::find_emails(text_content, &config)
552            .into_iter()
553            .map(|d| OutputEmail {
554                email: d.email,
555                start_line: d.start_line,
556                end_line: d.end_line,
557            })
558            .collect::<Vec<_>>();
559        file_info_builder.emails(emails);
560    }
561
562    if text_options.detect_urls {
563        let config = DetectionConfig {
564            max_emails: text_options.max_emails,
565            max_urls: text_options.max_urls,
566            unique: true,
567        };
568        let urls = finder::find_urls(text_content, &config)
569            .into_iter()
570            .map(|d| OutputURL {
571                url: d.url,
572                start_line: d.start_line,
573                end_line: d.end_line,
574            })
575            .collect::<Vec<_>>();
576        file_info_builder.urls(urls);
577    }
578}
579
/// Run askalono license detection over `text_content` and record the overall
/// license expression plus one [`LicenseDetection`] per matched region.
///
/// No-op when the text is empty or the strategy's license store is empty.
///
/// # Errors
/// Propagates failures from the underlying license scan.
fn extract_license_information(
    file_info_builder: &mut FileInfoBuilder,
    text_content: String,
    scan_strategy: &ScanStrategy,
) -> Result<(), Error> {
    if text_content.is_empty() || !scan_strategy.store_has_licenses() {
        return Ok(());
    }

    let license_result = scan_strategy.scan(&TextData::from(text_content.as_str()))?;
    // Overall expression: the top-level license name, if one was identified.
    let license_expr = license_result.license.map(|x| x.name.to_string());

    let license_detections = license_result
        .containing
        .iter()
        .map(|detection| {
            // The lowercase name is used as the generic expression; the
            // original name is reported as the SPDX expression.
            let license_lower = detection.license.name.to_lowercase();
            LicenseDetection {
                license_expression: license_lower.clone(),
                license_expression_spdx: detection.license.name.to_string(),
                matches: vec![Match {
                    license_expression: license_lower.clone(),
                    license_expression_spdx: detection.license.name.to_string(),
                    from_file: None,
                    score: detection.score as f64,
                    start_line: detection.line_range.0,
                    end_line: detection.line_range.1,
                    // NOTE(review): "2-aho" presumably mirrors ScanCode's
                    // matcher naming — confirm against upstream output.
                    matcher: Some("2-aho".to_string()),
                    matched_length: None,
                    match_coverage: None,
                    rule_relevance: None,
                    rule_identifier: None,
                    rule_url: None,
                    matched_text: None,
                }],
                identifier: None,
            }
        })
        .collect::<Vec<_>>();

    file_info_builder
        .license_expression(license_expr)
        .license_detections(license_detections);

    Ok(())
}
626
/// Whether text-based detection (license/copyright/email/url) should be
/// skipped for this file entirely; currently true only for PEM certificates.
fn should_skip_text_detection(path: &Path, buffer: &[u8]) -> bool {
    is_pem_certificate_file(path, buffer)
}
630
/// Heuristically decide whether a Go file is non-production source: either a
/// `_test.go` file, or a file whose leading build constraints include the
/// `test` tag.
///
/// Non-`.go` paths always yield `Ok(false)` without touching the filesystem.
///
/// # Errors
/// Returns any I/O error from opening or reading the file.
fn is_go_non_production_source(path: &Path) -> std::io::Result<bool> {
    if path.extension().and_then(|ext| ext.to_str()) != Some("go") {
        return Ok(false);
    }

    if path
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| name.ends_with("_test.go"))
    {
        return Ok(true);
    }

    // Build constraints must appear near the top of the file, so stream only
    // the first 10 lines instead of slurping the whole file into memory.
    let reader = BufReader::new(fs::File::open(path)?);
    for line in reader.lines().take(10) {
        let line = line?;
        let trimmed = line.trim();
        if (trimmed.starts_with("//go:build") || trimmed.starts_with("// +build"))
            && trimmed.split_whitespace().any(|token| token == "test")
        {
            return Ok(true);
        }
    }

    Ok(false)
}
651
652fn is_pem_certificate_file(_path: &Path, buffer: &[u8]) -> bool {
653    let prefix_len = buffer.len().min(8192);
654    let prefix = String::from_utf8_lossy(&buffer[..prefix_len]);
655    let trimmed_lines: Vec<&str> = prefix
656        .lines()
657        .map(str::trim)
658        .filter(|line| !line.is_empty())
659        .take(64)
660        .collect();
661
662    PEM_CERTIFICATE_HEADERS.iter().any(|(begin, end)| {
663        trimmed_lines.iter().any(|line| line == begin)
664            && trimmed_lines.iter().any(|line| line == end)
665    })
666}
667
668fn process_directory(path: &Path, metadata: &fs::Metadata) -> FileInfo {
669    let name = path
670        .file_name()
671        .unwrap_or_default()
672        .to_string_lossy()
673        .to_string();
674    let base_name = name.clone(); // For directories, base_name is the same as name
675
676    FileInfo {
677        name,
678        base_name,
679        extension: "".to_string(),
680        path: path.to_string_lossy().to_string(),
681        file_type: FileType::Directory,
682        mime_type: None,
683        size: 0,
684        date: get_creation_date(metadata),
685        sha1: None,
686        md5: None,
687        sha256: None,
688        programming_language: None,
689        package_data: Vec::new(), // TODO: implement
690        license_expression: None,
691        copyrights: Vec::new(),         // TODO: implement
692        holders: Vec::new(),            // TODO: implement
693        authors: Vec::new(),            // TODO: implement
694        emails: Vec::new(),             // TODO: implement
695        license_detections: Vec::new(), // TODO: implement
696        urls: Vec::new(),               // TODO: implement
697        for_packages: Vec::new(),
698        scan_errors: Vec::new(),
699        is_source: None,
700        source_count: None,
701        is_legal: false,
702        is_manifest: false,
703        is_readme: false,
704        is_top_level: false,
705        is_key_file: false,
706    }
707}
708
#[cfg(test)]
mod tests {
    use super::is_go_non_production_source;
    use std::fs;
    use tempfile::tempdir;

    /// Write `content` into `file_name` under a fresh temp dir and run the check.
    fn check(file_name: &str, content: &str) -> bool {
        let temp_dir = tempdir().unwrap();
        let path = temp_dir.path().join(file_name);
        fs::write(&path, content).unwrap();
        is_go_non_production_source(&path).unwrap()
    }

    #[test]
    fn test_is_go_non_production_source_for_test_filename() {
        assert!(check("scanner_test.go", "package scanner\n"));
    }

    #[test]
    fn test_is_go_non_production_source_for_build_tag() {
        assert!(check("scanner.go", "//go:build test\n\npackage scanner\n"));
    }

    #[test]
    fn test_is_go_non_production_source_for_regular_go_file() {
        assert!(!check("scanner.go", "package scanner\n"));
    }
}