Skip to main content

provenant/scanner/
process.rs

1use crate::license_detection::LicenseDetectionEngine;
2use crate::parsers::try_parse_file;
3use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha256};
4use crate::utils::language::detect_language;
5use crate::utils::text::{is_source, remove_verbatim_escape_sequences};
6use anyhow::Error;
7use log::warn;
8use mime_guess::from_path;
9use rayon::prelude::*;
10use std::fs::{self};
11use std::path::Path;
12use std::sync::Arc;
13use std::time::{Duration, Instant};
14
15use crate::cache::{CachedScanFindings, read_cached_findings, write_cached_findings};
16use crate::copyright::{
17    self, AuthorDetection, CopyrightDetection, CopyrightDetectionOptions, HolderDetection,
18};
19use crate::finder::{self, DetectionConfig};
20use crate::models::{
21    Author, Copyright, FileInfo, FileInfoBuilder, FileType, Holder, LicenseDetection, Match,
22    OutputEmail, OutputURL,
23};
24use crate::progress::ScanProgress;
25use crate::scanner::collect::CollectedPaths;
26use crate::scanner::{ProcessResult, TextDetectionOptions};
27use crate::utils::file::{ExtractedTextKind, extract_text_for_detection, get_creation_date};
28use crate::utils::generated::generated_code_hints_from_bytes;
29
/// BEGIN/END armor-line pairs used to recognise PEM-encoded certificates.
///
/// Each entry is `(begin_line, end_line)`. A file is treated as a certificate when both lines
/// of any single pair are found among its leading lines.
const PEM_CERTIFICATE_HEADERS: &[(&str, &str)] = &[
    ("-----BEGIN CERTIFICATE-----", "-----END CERTIFICATE-----"),
    (
        "-----BEGIN TRUSTED CERTIFICATE-----",
        "-----END TRUSTED CERTIFICATE-----",
    ),
];
37
38pub fn process_collected(
39    collected: &CollectedPaths,
40    progress: Arc<ScanProgress>,
41    license_engine: Option<Arc<LicenseDetectionEngine>>,
42    include_text: bool,
43    text_options: &TextDetectionOptions,
44) -> ProcessResult {
45    let mut all_files: Vec<FileInfo> = collected
46        .files
47        .par_iter()
48        .map(|(path, metadata)| {
49            let file_entry = process_file(
50                path,
51                metadata,
52                license_engine.clone(),
53                include_text,
54                text_options,
55            );
56            progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
57            file_entry
58        })
59        .collect();
60
61    for (path, metadata) in &collected.directories {
62        all_files.push(process_directory(path, metadata));
63    }
64
65    ProcessResult {
66        files: all_files,
67        excluded_count: collected.excluded_count,
68    }
69}
70
71fn process_file(
72    path: &Path,
73    metadata: &fs::Metadata,
74    license_engine: Option<Arc<LicenseDetectionEngine>>,
75    include_text: bool,
76    text_options: &TextDetectionOptions,
77) -> FileInfo {
78    let mut scan_errors: Vec<String> = vec![];
79    let mut file_info_builder = FileInfoBuilder::default();
80
81    let started = Instant::now();
82
83    let mut generated_flag = None;
84    match extract_information_from_content(
85        &mut file_info_builder,
86        path,
87        license_engine,
88        include_text,
89        text_options,
90    ) {
91        Ok(is_generated) => generated_flag = is_generated,
92        Err(e) => scan_errors.push(e.to_string()),
93    };
94
95    if is_timeout_exceeded(started, text_options.timeout_seconds) {
96        scan_errors.push(format!(
97            "Processing interrupted due to timeout after {:.2} seconds",
98            text_options.timeout_seconds
99        ));
100    }
101
102    let mut file_info = file_info_builder
103        .name(path.file_name().unwrap().to_string_lossy().to_string())
104        .base_name(
105            path.file_stem()
106                .unwrap_or_default()
107                .to_string_lossy()
108                .to_string(),
109        )
110        .extension(
111            path.extension()
112                .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
113        )
114        .path(path.to_string_lossy().to_string())
115        .file_type(FileType::File)
116        .mime_type(Some(
117            from_path(path)
118                .first_or_octet_stream()
119                .essence_str()
120                .to_string(),
121        ))
122        .size(metadata.len())
123        .date(get_creation_date(metadata))
124        .scan_errors(scan_errors)
125        .build()
126        .expect("FileInformationBuild not completely initialized");
127
128    if file_info.programming_language.as_deref() == Some("Go")
129        && is_go_non_production_source(path).unwrap_or(false)
130    {
131        file_info.is_source = Some(false);
132    }
133
134    if text_options.detect_generated {
135        file_info.is_generated = Some(generated_flag.unwrap_or(false));
136    }
137
138    if let (Some(scan_results_dir), Some(sha256)) = (
139        text_options.scan_cache_dir.as_deref(),
140        file_info.sha256.as_deref(),
141    ) && file_info.scan_errors.is_empty()
142    {
143        let findings = CachedScanFindings::from_file_info(&file_info);
144        let options_fingerprint = scan_cache_fingerprint(text_options);
145        if let Err(err) =
146            write_cached_findings(scan_results_dir, sha256, &options_fingerprint, &findings)
147        {
148            file_info
149                .scan_errors
150                .push(format!("Failed to write scan cache entry: {err}"));
151        }
152    }
153
154    file_info
155}
156
157fn extract_information_from_content(
158    file_info_builder: &mut FileInfoBuilder,
159    path: &Path,
160    license_engine: Option<Arc<LicenseDetectionEngine>>,
161    include_text: bool,
162    text_options: &TextDetectionOptions,
163) -> Result<Option<bool>, Error> {
164    let started = Instant::now();
165    let buffer = fs::read(path)?;
166
167    if is_timeout_exceeded(started, text_options.timeout_seconds) {
168        return Err(Error::msg(format!(
169            "Timeout while reading file content (> {:.2}s)",
170            text_options.timeout_seconds
171        )));
172    }
173
174    let sha256 = calculate_sha256(&buffer);
175    let is_generated = text_options
176        .detect_generated
177        .then(|| !generated_code_hints_from_bytes(&buffer).is_empty());
178
179    file_info_builder
180        .sha1(Some(calculate_sha1(&buffer)))
181        .md5(Some(calculate_md5(&buffer)))
182        .sha256(Some(sha256.clone()))
183        .programming_language(Some(detect_language(path, &buffer)));
184
185    if should_skip_text_detection(path, &buffer) {
186        return Ok(is_generated);
187    }
188
189    if let Some(scan_results_dir) = text_options.scan_cache_dir.as_deref() {
190        let options_fingerprint = scan_cache_fingerprint(text_options);
191        match read_cached_findings(scan_results_dir, &sha256, &options_fingerprint) {
192            Ok(Some(findings)) => {
193                file_info_builder
194                    .package_data(findings.package_data)
195                    .license_expression(findings.license_expression)
196                    .license_detections(findings.license_detections)
197                    .copyrights(findings.copyrights)
198                    .holders(findings.holders)
199                    .authors(findings.authors)
200                    .emails(findings.emails)
201                    .urls(findings.urls)
202                    .programming_language(findings.programming_language);
203                return Ok(is_generated);
204            }
205            Ok(None) => {}
206            Err(err) => {
207                warn!("Failed to read scan cache for {:?}: {}", path, err);
208            }
209        }
210    }
211
212    // Package parsing and text-based detection (copyright, license) are independent.
213    // Python ScanCode runs all enabled plugins on every file, so we do the same.
214    if text_options.detect_packages
215        && let Some(package_data) = try_parse_file(path)
216    {
217        file_info_builder.package_data(package_data);
218    }
219
220    if is_timeout_exceeded(started, text_options.timeout_seconds) {
221        return Err(Error::msg(format!(
222            "Timeout while extracting package/text metadata (> {:.2}s)",
223            text_options.timeout_seconds
224        )));
225    }
226
227    let (text_content, text_kind) = extract_text_for_detection(path, &buffer);
228    let from_binary_strings = matches!(text_kind, ExtractedTextKind::BinaryStrings);
229
230    if is_timeout_exceeded(started, text_options.timeout_seconds) {
231        return Err(Error::msg(format!(
232            "Timeout while extracting text content (> {:.2}s)",
233            text_options.timeout_seconds
234        )));
235    }
236
237    if text_content.is_empty() {
238        return Ok(is_generated);
239    }
240
241    if text_options.detect_copyrights {
242        extract_copyright_information(
243            file_info_builder,
244            path,
245            &text_content,
246            text_options.timeout_seconds,
247            from_binary_strings,
248        );
249    }
250    extract_email_url_information(file_info_builder, &text_content, text_options);
251
252    if is_timeout_exceeded(started, text_options.timeout_seconds) {
253        return Err(Error::msg(format!(
254            "Timeout before license scan (> {:.2}s)",
255            text_options.timeout_seconds
256        )));
257    }
258    // Handle source map files specially
259    let text_content_for_license_detection = if crate::utils::sourcemap::is_sourcemap(path) {
260        if let Some(sourcemap_content) =
261            crate::utils::sourcemap::extract_sourcemap_content(&text_content)
262        {
263            sourcemap_content
264        } else {
265            text_content
266        }
267    } else if is_source(path) {
268        remove_verbatim_escape_sequences(&text_content)
269    } else {
270        text_content
271    };
272
273    extract_license_information(
274        file_info_builder,
275        text_content_for_license_detection,
276        license_engine,
277        include_text,
278        from_binary_strings,
279    )?;
280
281    Ok(is_generated)
282}
283
/// Reports whether more than `timeout_seconds` have elapsed since `started`.
///
/// A non-finite, zero or negative `timeout_seconds` disables the limit, so the function
/// returns `false` for those values regardless of elapsed time.
fn is_timeout_exceeded(started: Instant, timeout_seconds: f64) -> bool {
    if !timeout_seconds.is_finite() || timeout_seconds <= 0.0 {
        return false;
    }
    started.elapsed().as_secs_f64() > timeout_seconds
}
289
290fn scan_cache_fingerprint(text_options: &TextDetectionOptions) -> String {
291    format!(
292        "packages={};copyrights={};emails={};urls={};max_emails={};max_urls={};timeout={:.6}",
293        text_options.detect_packages,
294        text_options.detect_copyrights,
295        text_options.detect_emails,
296        text_options.detect_urls,
297        text_options.max_emails,
298        text_options.max_urls,
299        text_options.timeout_seconds,
300    )
301}
302
303fn extract_copyright_information(
304    file_info_builder: &mut FileInfoBuilder,
305    path: &Path,
306    text_content: &str,
307    timeout_seconds: f64,
308    from_binary_strings: bool,
309) {
310    // CREDITS files get special handling (Linux kernel style).
311    if copyright::is_credits_file(path) {
312        let author_detections = copyright::detect_credits_authors(text_content);
313        if !author_detections.is_empty() {
314            file_info_builder.authors(
315                author_detections
316                    .into_iter()
317                    .map(|a| Author {
318                        author: a.author,
319                        start_line: a.start_line,
320                        end_line: a.end_line,
321                    })
322                    .collect(),
323            );
324            return;
325        }
326    }
327
328    let copyright_options = CopyrightDetectionOptions {
329        max_runtime: if timeout_seconds.is_finite() && timeout_seconds > 0.0 {
330            Some(Duration::from_secs_f64(timeout_seconds))
331        } else {
332            None
333        },
334        ..CopyrightDetectionOptions::default()
335    };
336
337    let (copyrights, holders, authors) =
338        copyright::detect_copyrights_with_options(text_content, &copyright_options);
339    let (copyrights, holders, authors) = if from_binary_strings {
340        prune_binary_string_detections(copyrights, holders, authors)
341    } else {
342        (copyrights, holders, authors)
343    };
344
345    file_info_builder.copyrights(
346        copyrights
347            .into_iter()
348            .map(|c| Copyright {
349                copyright: c.copyright,
350                start_line: c.start_line,
351                end_line: c.end_line,
352            })
353            .collect::<Vec<Copyright>>(),
354    );
355    file_info_builder.holders(
356        holders
357            .into_iter()
358            .map(|h| Holder {
359                holder: h.holder,
360                start_line: h.start_line,
361                end_line: h.end_line,
362            })
363            .collect::<Vec<Holder>>(),
364    );
365    file_info_builder.authors(
366        authors
367            .into_iter()
368            .map(|a| Author {
369                author: a.author,
370                start_line: a.start_line,
371                end_line: a.end_line,
372            })
373            .collect::<Vec<Author>>(),
374    );
375}
376
377fn prune_binary_string_detections(
378    copyrights: Vec<CopyrightDetection>,
379    holders: Vec<HolderDetection>,
380    _authors: Vec<AuthorDetection>,
381) -> (
382    Vec<CopyrightDetection>,
383    Vec<HolderDetection>,
384    Vec<AuthorDetection>,
385) {
386    let kept_copyrights: Vec<CopyrightDetection> = copyrights
387        .into_iter()
388        .filter(|c| is_binary_string_copyright_candidate(&c.copyright))
389        .collect();
390
391    let kept_holders: Vec<HolderDetection> = holders
392        .into_iter()
393        .filter(|holder| {
394            kept_copyrights.iter().any(|copyright| {
395                ranges_overlap(
396                    holder.start_line,
397                    holder.end_line,
398                    copyright.start_line,
399                    copyright.end_line,
400                )
401            })
402        })
403        .collect();
404
405    (kept_copyrights, kept_holders, Vec::new())
406}
407
/// Returns `true` when the inclusive ranges `[a_start, a_end]` and `[b_start, b_end]` share at
/// least one value.
fn ranges_overlap(a_start: usize, a_end: usize, b_start: usize, b_end: usize) -> bool {
    // Two ranges are disjoint exactly when one starts after the other ends.
    let a_after_b = a_start > b_end;
    let b_after_a = b_start > a_end;
    !(a_after_b || b_after_a)
}
411
412fn is_binary_string_copyright_candidate(text: &str) -> bool {
413    if has_explicit_copyright_marker(text) || contains_year(text) {
414        return true;
415    }
416
417    let lower = text.to_ascii_lowercase();
418    let Some(tail) = lower.strip_prefix("copyright") else {
419        return true;
420    };
421    let tail = tail.trim();
422    let alpha_tokens: Vec<&str> = tail
423        .split_whitespace()
424        .filter(|token| token.chars().any(|c| c.is_alphabetic()))
425        .collect();
426
427    if alpha_tokens.len() <= 1 {
428        return true;
429    }
430
431    if tail.contains(',') || tail.contains(" and ") || tail.contains('&') {
432        return true;
433    }
434
435    alpha_tokens
436        .iter()
437        .any(|token| is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric())))
438}
439
/// Returns `true` when `text`, compared ASCII-case-insensitively, contains `(c)`, `©` or
/// `copr`.
fn has_explicit_copyright_marker(text: &str) -> bool {
    let folded = text.to_ascii_lowercase();
    ["(c)", "©", "copr"]
        .iter()
        .any(|marker| folded.contains(*marker))
}
444
/// Returns `true` when `text` contains four consecutive ASCII digits of the form `19xx` or
/// `20xx`.
///
/// The century is matched as a pair, so digit runs such as `1055` or `2999` are not treated as
/// years. The digits may be embedded in a longer run or directly adjoin other characters.
fn contains_year(text: &str) -> bool {
    text.as_bytes().windows(4).any(|window| {
        matches!(window, [b'1', b'9', _, _] | [b'2', b'0', _, _])
            && window.iter().all(u8::is_ascii_digit)
    })
}
453
/// Returns `true` when `token` equals, ignoring ASCII case, one of the organisation words
/// listed below (for example `inc`, `ltd` or `foundation`).
fn is_company_like_suffix(token: &str) -> bool {
    const COMPANY_WORDS: &[&str] = &[
        "inc",
        "corp",
        "corporation",
        "co",
        "company",
        "ltd",
        "llc",
        "gmbh",
        "foundation",
        "project",
        "systems",
        "software",
        "technologies",
        "technology",
    ];

    COMPANY_WORDS
        .iter()
        .any(|word| token.eq_ignore_ascii_case(word))
}
473
474fn extract_email_url_information(
475    file_info_builder: &mut FileInfoBuilder,
476    text_content: &str,
477    text_options: &TextDetectionOptions,
478) {
479    if !text_options.detect_emails && !text_options.detect_urls {
480        return;
481    }
482
483    if text_options.detect_emails {
484        let config = DetectionConfig {
485            max_emails: text_options.max_emails,
486            max_urls: text_options.max_urls,
487            unique: false,
488        };
489        let emails = finder::find_emails(text_content, &config)
490            .into_iter()
491            .map(|d| OutputEmail {
492                email: d.email,
493                start_line: d.start_line,
494                end_line: d.end_line,
495            })
496            .collect::<Vec<_>>();
497        file_info_builder.emails(emails);
498    }
499
500    if text_options.detect_urls {
501        let config = DetectionConfig {
502            max_emails: text_options.max_emails,
503            max_urls: text_options.max_urls,
504            unique: true,
505        };
506        let urls = finder::find_urls(text_content, &config)
507            .into_iter()
508            .map(|d| OutputURL {
509                url: d.url,
510                start_line: d.start_line,
511                end_line: d.end_line,
512            })
513            .collect::<Vec<_>>();
514        file_info_builder.urls(urls);
515    }
516}
517
518fn extract_license_information(
519    file_info_builder: &mut FileInfoBuilder,
520    text_content: String,
521    license_engine: Option<Arc<LicenseDetectionEngine>>,
522    include_text: bool,
523    from_binary_strings: bool,
524) -> Result<(), Error> {
525    let Some(engine) = license_engine else {
526        return Ok(());
527    };
528
529    match engine.detect_with_kind(&text_content, false, from_binary_strings) {
530        Ok(detections) => {
531            let model_detections: Vec<LicenseDetection> = detections
532                .into_iter()
533                .filter_map(|d| convert_detection_to_model(d, include_text, &text_content))
534                .collect();
535
536            if !model_detections.is_empty() {
537                let expressions: Vec<String> = model_detections
538                    .iter()
539                    .filter(|d| !d.license_expression_spdx.is_empty())
540                    .map(|d| d.license_expression_spdx.clone())
541                    .collect();
542
543                if !expressions.is_empty() {
544                    let combined = crate::utils::spdx::combine_license_expressions(expressions);
545                    if let Some(expr) = combined {
546                        file_info_builder.license_expression(Some(expr));
547                    }
548                }
549            }
550
551            file_info_builder.license_detections(model_detections);
552        }
553        Err(e) => {
554            warn!("License detection failed: {}", e);
555        }
556    }
557
558    Ok(())
559}
560
561fn convert_detection_to_model(
562    detection: crate::license_detection::LicenseDetection,
563    include_text: bool,
564    text_content: &str,
565) -> Option<LicenseDetection> {
566    let license_expression = detection.license_expression?;
567    let license_expression_spdx = detection.license_expression_spdx.unwrap_or_default();
568
569    let matches: Vec<Match> = detection
570        .matches
571        .into_iter()
572        .map(|m| {
573            let matched_text = if include_text {
574                m.matched_text.or_else(|| {
575                    Some(crate::license_detection::query::matched_text_from_text(
576                        text_content,
577                        m.start_line,
578                        m.end_line,
579                    ))
580                })
581            } else {
582                None
583            };
584            Match {
585                license_expression: m.license_expression,
586                license_expression_spdx: m.license_expression_spdx.unwrap_or_default(),
587                from_file: m.from_file,
588                start_line: m.start_line,
589                end_line: m.end_line,
590                matcher: Some(m.matcher.to_string()),
591                score: m.score as f64,
592                matched_length: Some(m.matched_length),
593                match_coverage: Some(m.match_coverage as f64),
594                rule_relevance: Some(m.rule_relevance as usize),
595                rule_identifier: Some(m.rule_identifier),
596                rule_url: Some(m.rule_url),
597                matched_text,
598            }
599        })
600        .collect();
601
602    Some(LicenseDetection {
603        license_expression,
604        license_expression_spdx,
605        matches,
606        identifier: detection.identifier,
607    })
608}
609
610fn should_skip_text_detection(path: &Path, buffer: &[u8]) -> bool {
611    is_pem_certificate_file(path, buffer)
612}
613
/// Reports whether a Go file is test-only code rather than production source.
///
/// Returns `Ok(false)` for paths whose extension is not `go` and `Ok(true)` for names ending
/// in `_test.go`; neither case touches the file system. Otherwise the first ten lines are
/// inspected for a `//go:build` or `// +build` line that has `test` as a
/// whitespace-separated token.
///
/// Only the inspected lines are read, so large files are not loaded in full and bytes that
/// are not valid UTF-8 beyond those lines do not cause an error.
///
/// # Errors
///
/// Returns an I/O error when the file cannot be opened or one of the inspected lines cannot be
/// read as UTF-8.
fn is_go_non_production_source(path: &Path) -> std::io::Result<bool> {
    use std::io::{BufRead, BufReader};

    if path.extension().and_then(|ext| ext.to_str()) != Some("go") {
        return Ok(false);
    }

    if path
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| name.ends_with("_test.go"))
    {
        return Ok(true);
    }

    let reader = BufReader::new(fs::File::open(path)?);
    for line in reader.lines().take(10) {
        let line = line?;
        let trimmed = line.trim();
        let is_build_constraint =
            trimmed.starts_with("//go:build") || trimmed.starts_with("// +build");
        if is_build_constraint && trimmed.split_whitespace().any(|token| token == "test") {
            return Ok(true);
        }
    }

    Ok(false)
}
634
635fn is_pem_certificate_file(_path: &Path, buffer: &[u8]) -> bool {
636    let prefix_len = buffer.len().min(8192);
637    let prefix = String::from_utf8_lossy(&buffer[..prefix_len]);
638    let trimmed_lines: Vec<&str> = prefix
639        .lines()
640        .map(str::trim)
641        .filter(|line| !line.is_empty())
642        .take(64)
643        .collect();
644
645    PEM_CERTIFICATE_HEADERS.iter().any(|(begin, end)| {
646        trimmed_lines.iter().any(|line| line == begin)
647            && trimmed_lines.iter().any(|line| line == end)
648    })
649}
650
651fn process_directory(path: &Path, metadata: &fs::Metadata) -> FileInfo {
652    let name = path
653        .file_name()
654        .unwrap_or_default()
655        .to_string_lossy()
656        .to_string();
657    let base_name = name.clone(); // For directories, base_name is the same as name
658
659    FileInfo {
660        name,
661        base_name,
662        extension: "".to_string(),
663        path: path.to_string_lossy().to_string(),
664        file_type: FileType::Directory,
665        mime_type: None,
666        size: 0,
667        date: get_creation_date(metadata),
668        sha1: None,
669        md5: None,
670        sha256: None,
671        programming_language: None,
672        package_data: Vec::new(), // TODO: implement
673        license_expression: None,
674        copyrights: Vec::new(),         // TODO: implement
675        holders: Vec::new(),            // TODO: implement
676        authors: Vec::new(),            // TODO: implement
677        emails: Vec::new(),             // TODO: implement
678        license_detections: Vec::new(), // TODO: implement
679        urls: Vec::new(),               // TODO: implement
680        for_packages: Vec::new(),
681        scan_errors: Vec::new(),
682        is_source: None,
683        source_count: None,
684        is_legal: false,
685        is_manifest: false,
686        is_readme: false,
687        is_top_level: false,
688        is_key_file: false,
689        is_community: false,
690        is_generated: None,
691        facets: vec![],
692        tallies: None,
693    }
694}
695
#[cfg(test)]
mod tests {
    use super::is_go_non_production_source;
    use std::fs;
    use tempfile::tempdir;

    /// Writes `contents` to `file_name` inside a fresh temporary directory and returns the
    /// classification of that file.
    fn classify(file_name: &str, contents: &str) -> bool {
        let workspace = tempdir().unwrap();
        let go_file = workspace.path().join(file_name);
        fs::write(&go_file, contents).unwrap();

        is_go_non_production_source(&go_file).unwrap()
    }

    #[test]
    fn test_is_go_non_production_source_for_test_filename() {
        assert!(classify("scanner_test.go", "package scanner\n"));
    }

    #[test]
    fn test_is_go_non_production_source_for_build_tag() {
        assert!(classify(
            "scanner.go",
            "//go:build test\n\npackage scanner\n"
        ));
    }

    #[test]
    fn test_is_go_non_production_source_for_regular_go_file() {
        assert!(!classify("scanner.go", "package scanner\n"));
    }
}