Skip to main content

provenant/scanner/
process.rs

1use crate::license_detection::LicenseDetectionEngine;
2use crate::parsers::{try_parse_compiled_bytes, try_parse_file};
3use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha1_git, calculate_sha256};
4use crate::utils::text::{
5    remove_verbatim_escape_sequences, should_remove_verbatim_escape_sequences,
6};
7use anyhow::Error;
8use rayon::prelude::*;
9use std::fs::{self, File};
10use std::io::{Read, Write};
11use std::path::Path;
12use std::sync::Arc;
13use std::time::{Duration, Instant};
14
15use crate::cache::{CachedScanFindings, read_cached_findings, write_cached_findings};
16use crate::copyright::{
17    self, AuthorDetection, CopyrightDetection, CopyrightDetectionOptions, HolderDetection,
18};
19use crate::finder::{self, DetectionConfig};
20use crate::license_detection::models::LicenseMatch as InternalLicenseMatch;
21use crate::license_detection::query::Query;
22use crate::models::{
23    Author, Copyright, DatasourceId, FileInfo, FileInfoBuilder, FileType, Holder, LicenseDetection,
24    Match, OutputEmail, OutputURL,
25};
26use crate::progress::ScanProgress;
27use crate::scanner::collect::CollectedPaths;
28use crate::scanner::{LicenseScanOptions, ProcessResult, TextDetectionOptions};
29use crate::utils::file::{
30    ExtractedTextKind, classify_file_info, extract_text_for_detection, get_creation_date,
31};
32use crate::utils::generated::generated_code_hints_from_bytes;
33use tempfile::TempDir;
34
/// `(begin, end)` marker pairs delimiting PEM-armored certificates, covering both the plain
/// `CERTIFICATE` and the `TRUSTED CERTIFICATE` labels.
///
/// NOTE(review): no use of this table is visible in this part of the file; presumably it is
/// consulted by the text-detection skip logic defined further down — confirm.
const PEM_CERTIFICATE_HEADERS: &[(&str, &str)] = &[
    ("-----BEGIN CERTIFICATE-----", "-----END CERTIFICATE-----"),
    (
        "-----BEGIN TRUSTED CERTIFICATE-----",
        "-----END TRUSTED CERTIFICATE-----",
    ),
];
42
43pub fn process_collected(
44    collected: &CollectedPaths,
45    progress: Arc<ScanProgress>,
46    license_engine: Option<Arc<LicenseDetectionEngine>>,
47    license_options: LicenseScanOptions,
48    text_options: &TextDetectionOptions,
49) -> ProcessResult {
50    let mut all_files: Vec<FileInfo> = collected
51        .files
52        .par_iter()
53        .map(|(path, metadata)| {
54            let file_entry = process_file(
55                path,
56                metadata,
57                license_engine.clone(),
58                license_options,
59                text_options,
60            );
61            progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
62            file_entry
63        })
64        .collect();
65
66    for (path, metadata) in &collected.directories {
67        all_files.push(process_directory(
68            path,
69            metadata,
70            text_options.collect_info,
71            license_engine.is_some(),
72        ));
73    }
74
75    ProcessResult {
76        files: all_files,
77        excluded_count: collected.excluded_count,
78    }
79}
80
81pub fn process_collected_with_memory_limit(
82    collected: &CollectedPaths,
83    progress: Arc<ScanProgress>,
84    license_engine: Option<Arc<LicenseDetectionEngine>>,
85    license_options: LicenseScanOptions,
86    text_options: &TextDetectionOptions,
87    max_in_memory: i64,
88) -> ProcessResult {
89    if max_in_memory == 0 {
90        return process_collected(
91            collected,
92            progress,
93            license_engine,
94            license_options,
95            text_options,
96        );
97    }
98
99    let memory_limit = if max_in_memory < 0 {
100        0
101    } else {
102        max_in_memory as usize
103    };
104    let chunk_size = if max_in_memory < 0 {
105        256
106    } else {
107        memory_limit.max(1)
108    };
109
110    let mut retained_files = Vec::new();
111    let mut spill_store = None;
112
113    for chunk in collected.files.chunks(chunk_size) {
114        let processed_chunk: Vec<FileInfo> = chunk
115            .par_iter()
116            .map(|(path, metadata)| {
117                let file_entry = process_file(
118                    path,
119                    metadata,
120                    license_engine.clone(),
121                    license_options,
122                    text_options,
123                );
124                progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
125                file_entry
126            })
127            .collect();
128
129        retain_or_spill_chunk(
130            processed_chunk,
131            &mut retained_files,
132            &mut spill_store,
133            memory_limit,
134        );
135    }
136
137    for (path, metadata) in &collected.directories {
138        let entry = process_directory(
139            path,
140            metadata,
141            text_options.collect_info,
142            license_engine.is_some(),
143        );
144        retain_or_spill_chunk(
145            vec![entry],
146            &mut retained_files,
147            &mut spill_store,
148            memory_limit,
149        );
150    }
151
152    if let Some(spill_store) = spill_store {
153        retained_files.extend(spill_store.load_all());
154    }
155
156    ProcessResult {
157        files: retained_files,
158        excluded_count: collected.excluded_count,
159    }
160}
161
162fn retain_or_spill_chunk(
163    chunk: Vec<FileInfo>,
164    retained_files: &mut Vec<FileInfo>,
165    spill_store: &mut Option<FileInfoSpillStore>,
166    memory_limit: usize,
167) {
168    if memory_limit == 0 {
169        spill_store
170            .get_or_insert_with(FileInfoSpillStore::new)
171            .spill(chunk);
172        return;
173    }
174
175    let remaining_capacity = memory_limit.saturating_sub(retained_files.len());
176    if remaining_capacity >= chunk.len() && spill_store.is_none() {
177        retained_files.extend(chunk);
178        return;
179    }
180
181    let mut chunk_iter = chunk.into_iter();
182    retained_files.extend(chunk_iter.by_ref().take(remaining_capacity));
183    let overflow: Vec<FileInfo> = chunk_iter.collect();
184    if !overflow.is_empty() {
185        spill_store
186            .get_or_insert_with(FileInfoSpillStore::new)
187            .spill(overflow);
188    }
189}
190
191struct FileInfoSpillStore {
192    temp_dir: TempDir,
193    batch_index: usize,
194}
195
196impl FileInfoSpillStore {
197    fn new() -> Self {
198        Self {
199            temp_dir: TempDir::new().expect("create spill dir"),
200            batch_index: 0,
201        }
202    }
203
204    fn spill(&mut self, files: Vec<FileInfo>) {
205        let path = self
206            .temp_dir
207            .path()
208            .join(format!("batch-{:06}.json.zst", self.batch_index));
209        self.batch_index += 1;
210
211        let payload = serde_json::to_vec(&files).expect("encode spilled file batch");
212        let file = File::create(path).expect("create spill batch file");
213        let mut encoder = zstd::Encoder::new(file, 3).expect("create spill encoder");
214        encoder
215            .write_all(&payload)
216            .expect("write spilled file batch");
217        encoder.finish().expect("finish spill encoder");
218    }
219
220    fn load_all(self) -> Vec<FileInfo> {
221        let mut paths: Vec<_> = fs::read_dir(self.temp_dir.path())
222            .expect("read spill dir")
223            .filter_map(Result::ok)
224            .map(|entry| entry.path())
225            .collect();
226        paths.sort();
227
228        let mut files = Vec::new();
229        for path in paths {
230            let file = File::open(path).expect("open spill batch");
231            let mut decoder = zstd::Decoder::new(file).expect("create spill decoder");
232            let mut payload = Vec::new();
233            decoder.read_to_end(&mut payload).expect("read spill batch");
234            let mut batch: Vec<FileInfo> =
235                serde_json::from_slice(&payload).expect("decode spilled file batch");
236            files.append(&mut batch);
237        }
238        files
239    }
240}
241
242fn process_file(
243    path: &Path,
244    metadata: &fs::Metadata,
245    license_engine: Option<Arc<LicenseDetectionEngine>>,
246    license_options: LicenseScanOptions,
247    text_options: &TextDetectionOptions,
248) -> FileInfo {
249    let mut scan_errors: Vec<String> = vec![];
250    let mut file_info_builder = FileInfoBuilder::default();
251    let license_enabled = license_engine.is_some();
252
253    let started = Instant::now();
254
255    let mut generated_flag = None;
256    let mut is_source_file = false;
257    let mut cache_key_sha256 = None;
258    match extract_information_from_content(
259        &mut file_info_builder,
260        &mut scan_errors,
261        path,
262        license_engine,
263        license_options,
264        text_options,
265    ) {
266        Ok((is_generated, sha256, is_source)) => {
267            generated_flag = is_generated;
268            cache_key_sha256 = Some(sha256);
269            is_source_file = is_source;
270        }
271        Err(e) => scan_errors.push(e.to_string()),
272    };
273
274    if is_timeout_exceeded(started, text_options.timeout_seconds) {
275        scan_errors.push(format!(
276            "Processing interrupted due to timeout after {:.2} seconds",
277            text_options.timeout_seconds
278        ));
279    }
280
281    let mut file_info = file_info_builder
282        .name(path.file_name().unwrap().to_string_lossy().to_string())
283        .base_name(
284            path.file_stem()
285                .unwrap_or_default()
286                .to_string_lossy()
287                .to_string(),
288        )
289        .extension(
290            path.extension()
291                .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
292        )
293        .path(path.to_string_lossy().to_string())
294        .file_type(FileType::File)
295        .size(metadata.len())
296        .date(
297            text_options
298                .collect_info
299                .then(|| get_creation_date(metadata))
300                .flatten(),
301        )
302        .scan_errors(scan_errors)
303        .build()
304        .expect("FileInformationBuild not completely initialized");
305
306    if text_options.collect_info {
307        file_info.is_source = Some(is_source_file);
308    }
309
310    if file_info.programming_language.as_deref() == Some("Go")
311        && is_go_non_production_source(path).unwrap_or(false)
312    {
313        file_info.is_source = Some(false);
314    }
315
316    if text_options.detect_generated {
317        file_info.is_generated = Some(generated_flag.unwrap_or(false));
318    }
319
320    if file_info.percentage_of_license_text.is_none() && license_enabled {
321        file_info.percentage_of_license_text = Some(0.0);
322    }
323
324    if let (Some(scan_results_dir), Some(sha256)) = (
325        text_options.scan_cache_dir.as_deref(),
326        cache_key_sha256.as_deref(),
327    ) && file_info.scan_errors.is_empty()
328    {
329        let findings = CachedScanFindings::from_file_info(&file_info);
330        let options_fingerprint =
331            scan_cache_fingerprint(text_options, license_options, license_enabled);
332        if let Err(err) =
333            write_cached_findings(scan_results_dir, sha256, &options_fingerprint, &findings)
334        {
335            file_info
336                .scan_errors
337                .push(format!("Failed to write scan cache entry: {err}"));
338        }
339    }
340
341    file_info
342}
343
344fn extract_information_from_content(
345    file_info_builder: &mut FileInfoBuilder,
346    scan_errors: &mut Vec<String>,
347    path: &Path,
348    license_engine: Option<Arc<LicenseDetectionEngine>>,
349    license_options: LicenseScanOptions,
350    text_options: &TextDetectionOptions,
351) -> Result<(Option<bool>, String, bool), Error> {
352    let started = Instant::now();
353    let buffer = fs::read(path)?;
354    let license_enabled = license_engine.is_some();
355
356    if is_timeout_exceeded(started, text_options.timeout_seconds) {
357        return Err(Error::msg(format!(
358            "Timeout while reading file content (> {:.2}s)",
359            text_options.timeout_seconds
360        )));
361    }
362
363    let sha256 = calculate_sha256(&buffer);
364    let is_generated = text_options
365        .detect_generated
366        .then(|| !generated_code_hints_from_bytes(&buffer).is_empty());
367    let classification = classify_file_info(path, &buffer);
368
369    if text_options.collect_info {
370        file_info_builder
371            .sha1(Some(calculate_sha1(&buffer)))
372            .md5(Some(calculate_md5(&buffer)))
373            .sha256(Some(sha256.clone()))
374            .programming_language(classification.programming_language.clone())
375            .mime_type(Some(classification.mime_type.clone()))
376            .file_type_label(Some(classification.file_type.clone()))
377            .sha1_git(Some(calculate_sha1_git(&buffer)))
378            .is_binary(Some(classification.is_binary))
379            .is_text(Some(classification.is_text))
380            .is_archive(Some(classification.is_archive))
381            .is_media(Some(classification.is_media))
382            .is_source(Some(classification.is_source))
383            .is_script(Some(classification.is_script))
384            .files_count(Some(0))
385            .dirs_count(Some(0))
386            .size_count(Some(0));
387    }
388
389    if should_skip_text_detection(path, &buffer) {
390        return Ok((is_generated, sha256, classification.is_source));
391    }
392
393    if let Some(scan_results_dir) = text_options.scan_cache_dir.as_deref() {
394        let options_fingerprint =
395            scan_cache_fingerprint(text_options, license_options, license_enabled);
396        match read_cached_findings(scan_results_dir, &sha256, &options_fingerprint) {
397            Ok(Some(findings)) => {
398                file_info_builder
399                    .package_data(findings.package_data)
400                    .license_expression(findings.license_expression)
401                    .license_detections(findings.license_detections)
402                    .license_clues(findings.license_clues)
403                    .percentage_of_license_text(findings.percentage_of_license_text)
404                    .copyrights(findings.copyrights)
405                    .holders(findings.holders)
406                    .authors(findings.authors)
407                    .emails(findings.emails)
408                    .urls(findings.urls)
409                    .programming_language(findings.programming_language);
410                return Ok((is_generated, sha256, classification.is_source));
411            }
412            Ok(None) => {}
413            Err(err) => {
414                scan_errors.push(format!("Failed to read scan cache for {:?}: {}", path, err));
415            }
416        }
417    }
418
419    // Package parsing and text-based detection (copyright, license) are independent.
420    // Python ScanCode runs all enabled plugins on every file, so we do the same.
421    if text_options.detect_packages {
422        let parse_result = try_parse_file(path).or_else(|| {
423            text_options
424                .detect_packages_in_compiled
425                .then(|| try_parse_compiled_bytes(&buffer))
426                .flatten()
427        });
428
429        if let Some(parse_result) = parse_result {
430            let packages = parse_result
431                .packages
432                .into_iter()
433                .filter(|package| {
434                    let is_compiled_package = package
435                        .datasource_id
436                        .as_ref()
437                        .is_some_and(is_compiled_datasource);
438                    let is_system_package = package
439                        .datasource_id
440                        .as_ref()
441                        .is_some_and(is_system_datasource);
442                    if is_compiled_package {
443                        text_options.detect_packages_in_compiled
444                    } else if is_system_package {
445                        text_options.detect_system_packages
446                    } else {
447                        text_options.detect_application_packages
448                    }
449                })
450                .collect();
451            file_info_builder.package_data(packages);
452            scan_errors.extend(parse_result.scan_errors);
453        }
454    }
455
456    if is_timeout_exceeded(started, text_options.timeout_seconds) {
457        return Err(Error::msg(format!(
458            "Timeout while extracting package/text metadata (> {:.2}s)",
459            text_options.timeout_seconds
460        )));
461    }
462
463    let (text_content, text_kind) = extract_text_for_detection(path, &buffer);
464    let from_binary_strings = matches!(text_kind, ExtractedTextKind::BinaryStrings);
465
466    if is_timeout_exceeded(started, text_options.timeout_seconds) {
467        return Err(Error::msg(format!(
468            "Timeout while extracting text content (> {:.2}s)",
469            text_options.timeout_seconds
470        )));
471    }
472
473    if text_content.is_empty() {
474        return Ok((is_generated, sha256, classification.is_source));
475    }
476
477    if text_options.detect_copyrights {
478        extract_copyright_information(
479            file_info_builder,
480            path,
481            &text_content,
482            text_options.timeout_seconds,
483            from_binary_strings,
484        );
485    }
486    extract_email_url_information(file_info_builder, &text_content, text_options);
487
488    if is_timeout_exceeded(started, text_options.timeout_seconds) {
489        return Err(Error::msg(format!(
490            "Timeout before license scan (> {:.2}s)",
491            text_options.timeout_seconds
492        )));
493    }
494    // Handle source map files specially
495    let text_content_for_license_detection = if crate::utils::sourcemap::is_sourcemap(path) {
496        if let Some(sourcemap_content) =
497            crate::utils::sourcemap::extract_sourcemap_content(&text_content)
498        {
499            sourcemap_content
500        } else {
501            text_content
502        }
503    } else if should_remove_verbatim_escape_sequences(path, classification.is_source) {
504        remove_verbatim_escape_sequences(&text_content)
505    } else {
506        text_content
507    };
508
509    extract_license_information(
510        file_info_builder,
511        scan_errors,
512        path,
513        text_content_for_license_detection,
514        license_engine,
515        license_options,
516        from_binary_strings,
517    )?;
518
519    Ok((is_generated, sha256, classification.is_source))
520}
521
/// Reports whether more than `timeout_seconds` have elapsed since `started`.
///
/// A timeout that is zero, negative, NaN or infinite means "no timeout" and never trips.
fn is_timeout_exceeded(started: Instant, timeout_seconds: f64) -> bool {
    if !timeout_seconds.is_finite() || timeout_seconds <= 0.0 {
        return false;
    }
    let elapsed_seconds = started.elapsed().as_secs_f64();
    elapsed_seconds > timeout_seconds
}
527
528fn scan_cache_fingerprint(
529    text_options: &TextDetectionOptions,
530    license_options: LicenseScanOptions,
531    license_enabled: bool,
532) -> String {
533    format!(
534        "info={};packages={};app_packages={};system_packages={};compiled_packages={};copyrights={};emails={};urls={};max_emails={};max_urls={};timeout={:.6};license_enabled={};license_text={};license_text_diagnostics={};license_diagnostics={};unknown_licenses={};license_score={}",
535        text_options.collect_info,
536        text_options.detect_packages,
537        text_options.detect_application_packages,
538        text_options.detect_system_packages,
539        text_options.detect_packages_in_compiled,
540        text_options.detect_copyrights,
541        text_options.detect_emails,
542        text_options.detect_urls,
543        text_options.max_emails,
544        text_options.max_urls,
545        text_options.timeout_seconds,
546        license_enabled,
547        license_options.include_text,
548        license_options.include_text_diagnostics,
549        license_options.include_diagnostics,
550        license_options.unknown_licenses,
551        license_options.min_score,
552    )
553}
554
555fn is_system_datasource(datasource_id: &DatasourceId) -> bool {
556    matches!(
557        datasource_id,
558        DatasourceId::AlpineInstalledDb
559            | DatasourceId::DebianDistrolessInstalledDb
560            | DatasourceId::DebianInstalledFilesList
561            | DatasourceId::DebianInstalledMd5Sums
562            | DatasourceId::DebianInstalledStatusDb
563            | DatasourceId::FreebsdCompactManifest
564            | DatasourceId::RpmInstalledDatabaseBdb
565            | DatasourceId::RpmInstalledDatabaseNdb
566            | DatasourceId::RpmInstalledDatabaseSqlite
567            | DatasourceId::RpmYumdb
568    )
569}
570
571fn is_compiled_datasource(datasource_id: &DatasourceId) -> bool {
572    matches!(
573        datasource_id,
574        DatasourceId::GoBinary | DatasourceId::RustBinary
575    )
576}
577
578fn extract_copyright_information(
579    file_info_builder: &mut FileInfoBuilder,
580    path: &Path,
581    text_content: &str,
582    timeout_seconds: f64,
583    from_binary_strings: bool,
584) {
585    // CREDITS files get special handling (Linux kernel style).
586    if copyright::is_credits_file(path) {
587        let author_detections = copyright::detect_credits_authors(text_content);
588        if !author_detections.is_empty() {
589            file_info_builder.authors(
590                author_detections
591                    .into_iter()
592                    .map(|a| Author {
593                        author: a.author,
594                        start_line: a.start_line,
595                        end_line: a.end_line,
596                    })
597                    .collect(),
598            );
599            return;
600        }
601    }
602
603    let copyright_options = CopyrightDetectionOptions {
604        max_runtime: if timeout_seconds.is_finite() && timeout_seconds > 0.0 {
605            Some(Duration::from_secs_f64(timeout_seconds))
606        } else {
607            None
608        },
609        ..CopyrightDetectionOptions::default()
610    };
611
612    let (copyrights, holders, authors) =
613        copyright::detect_copyrights_with_options(text_content, &copyright_options);
614    let (copyrights, holders, authors) = if from_binary_strings {
615        prune_binary_string_detections(copyrights, holders, authors)
616    } else {
617        (copyrights, holders, authors)
618    };
619
620    file_info_builder.copyrights(
621        copyrights
622            .into_iter()
623            .map(|c| Copyright {
624                copyright: c.copyright,
625                start_line: c.start_line,
626                end_line: c.end_line,
627            })
628            .collect::<Vec<Copyright>>(),
629    );
630    file_info_builder.holders(
631        holders
632            .into_iter()
633            .map(|h| Holder {
634                holder: h.holder,
635                start_line: h.start_line,
636                end_line: h.end_line,
637            })
638            .collect::<Vec<Holder>>(),
639    );
640    file_info_builder.authors(
641        authors
642            .into_iter()
643            .map(|a| Author {
644                author: a.author,
645                start_line: a.start_line,
646                end_line: a.end_line,
647            })
648            .collect::<Vec<Author>>(),
649    );
650}
651
652fn prune_binary_string_detections(
653    copyrights: Vec<CopyrightDetection>,
654    holders: Vec<HolderDetection>,
655    _authors: Vec<AuthorDetection>,
656) -> (
657    Vec<CopyrightDetection>,
658    Vec<HolderDetection>,
659    Vec<AuthorDetection>,
660) {
661    let kept_copyrights: Vec<CopyrightDetection> = copyrights
662        .into_iter()
663        .filter(|c| is_binary_string_copyright_candidate(&c.copyright))
664        .collect();
665
666    let kept_holders: Vec<HolderDetection> = holders
667        .into_iter()
668        .filter(|holder| {
669            kept_copyrights.iter().any(|copyright| {
670                ranges_overlap(
671                    holder.start_line,
672                    holder.end_line,
673                    copyright.start_line,
674                    copyright.end_line,
675                )
676            })
677        })
678        .collect();
679
680    (kept_copyrights, kept_holders, Vec::new())
681}
682
/// Returns `true` when the inclusive ranges `[a_start, a_end]` and `[b_start, b_end]` share
/// at least one point, i.e. neither range starts after the other one ends.
fn ranges_overlap(a_start: usize, a_end: usize, b_start: usize, b_end: usize) -> bool {
    let a_begins_after_b = a_start > b_end;
    let b_begins_after_a = b_start > a_end;
    !(a_begins_after_b || b_begins_after_a)
}
686
687fn is_binary_string_copyright_candidate(text: &str) -> bool {
688    if has_explicit_copyright_marker(text) || contains_year(text) {
689        return true;
690    }
691
692    let lower = text.to_ascii_lowercase();
693    let Some(tail) = lower.strip_prefix("copyright") else {
694        return true;
695    };
696    let tail = tail.trim();
697    let alpha_tokens: Vec<&str> = tail
698        .split_whitespace()
699        .filter(|token| token.chars().any(|c| c.is_alphabetic()))
700        .collect();
701
702    if alpha_tokens.len() <= 1 {
703        return true;
704    }
705
706    if tail.contains(',') || tail.contains(" and ") || tail.contains('&') {
707        return true;
708    }
709
710    alpha_tokens
711        .iter()
712        .any(|token| is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric())))
713}
714
/// Returns `true` when `text` contains `(c)`, `©` or `copr`, ignoring ASCII case.
fn has_explicit_copyright_marker(text: &str) -> bool {
    const MARKERS: &[&str] = &["(c)", "©", "copr"];
    let lowered = text.to_ascii_lowercase();
    MARKERS.iter().any(|&marker| lowered.contains(marker))
}
719
/// Returns `true` when `text` contains four consecutive ASCII digits that read as a year
/// in the 1900s or 2000s (`19xx` or `20xx`).
///
/// The century digits are matched as a pair. Checking them independently would also accept
/// `10xx` and `29xx`, so plain numbers such as `1024` would be mistaken for years.
fn contains_year(text: &str) -> bool {
    text.as_bytes().windows(4).any(|window| {
        let century_ok = matches!((window[0], window[1]), (b'1', b'9') | (b'2', b'0'));
        century_ok && window[2].is_ascii_digit() && window[3].is_ascii_digit()
    })
}
728
/// Returns `true` when `token`, compared without regard to ASCII case, is exactly one of
/// the organisation-style words in the list below (for example `inc`, `ltd`, `foundation`).
fn is_company_like_suffix(token: &str) -> bool {
    const SUFFIXES: &[&str] = &[
        "inc",
        "corp",
        "corporation",
        "co",
        "company",
        "ltd",
        "llc",
        "gmbh",
        "foundation",
        "project",
        "systems",
        "software",
        "technologies",
        "technology",
    ];
    SUFFIXES
        .iter()
        .any(|suffix| token.eq_ignore_ascii_case(suffix))
}
748
749fn extract_email_url_information(
750    file_info_builder: &mut FileInfoBuilder,
751    text_content: &str,
752    text_options: &TextDetectionOptions,
753) {
754    if !text_options.detect_emails && !text_options.detect_urls {
755        return;
756    }
757
758    if text_options.detect_emails {
759        let config = DetectionConfig {
760            max_emails: text_options.max_emails,
761            max_urls: text_options.max_urls,
762            unique: false,
763        };
764        let emails = finder::find_emails(text_content, &config)
765            .into_iter()
766            .map(|d| OutputEmail {
767                email: d.email,
768                start_line: d.start_line,
769                end_line: d.end_line,
770            })
771            .collect::<Vec<_>>();
772        file_info_builder.emails(emails);
773    }
774
775    if text_options.detect_urls {
776        let config = DetectionConfig {
777            max_emails: text_options.max_emails,
778            max_urls: text_options.max_urls,
779            unique: true,
780        };
781        let urls = finder::find_urls(text_content, &config)
782            .into_iter()
783            .map(|d| OutputURL {
784                url: d.url,
785                start_line: d.start_line,
786                end_line: d.end_line,
787            })
788            .collect::<Vec<_>>();
789        file_info_builder.urls(urls);
790    }
791}
792
793fn extract_license_information(
794    file_info_builder: &mut FileInfoBuilder,
795    scan_errors: &mut Vec<String>,
796    path: &Path,
797    text_content: String,
798    license_engine: Option<Arc<LicenseDetectionEngine>>,
799    license_options: LicenseScanOptions,
800    from_binary_strings: bool,
801) -> Result<(), Error> {
802    let Some(engine) = license_engine else {
803        return Ok(());
804    };
805
806    let detection_result = if license_options.min_score == 0 {
807        engine.detect_with_kind_and_source(
808            &text_content,
809            license_options.unknown_licenses,
810            from_binary_strings,
811            &path.to_string_lossy(),
812        )
813    } else {
814        engine.detect_with_kind_and_source_with_score(
815            &text_content,
816            license_options.unknown_licenses,
817            from_binary_strings,
818            &path.to_string_lossy(),
819            license_options.min_score as f32,
820        )
821    };
822
823    match detection_result {
824        Ok(detections) => {
825            let query =
826                Query::from_extracted_text(&text_content, engine.index(), from_binary_strings).ok();
827            let mut model_detections = Vec::new();
828            let mut model_clues = Vec::new();
829
830            for detection in &detections {
831                let (public_detection, clue_matches) = convert_detection_to_model(
832                    detection,
833                    license_options,
834                    &text_content,
835                    query.as_ref(),
836                );
837
838                if let Some(public_detection) = public_detection {
839                    model_detections.push(public_detection);
840                }
841
842                model_clues.extend(clue_matches);
843            }
844
845            if !model_detections.is_empty() {
846                let expressions: Vec<String> = model_detections
847                    .iter()
848                    .filter(|d| !d.license_expression_spdx.is_empty())
849                    .map(|d| d.license_expression_spdx.clone())
850                    .collect();
851
852                if !expressions.is_empty() {
853                    let combined = crate::utils::spdx::combine_license_expressions(expressions);
854                    if let Some(expr) = combined {
855                        file_info_builder.license_expression(Some(expr));
856                    }
857                }
858            }
859
860            file_info_builder.license_detections(model_detections);
861            file_info_builder.license_clues(model_clues);
862            file_info_builder.percentage_of_license_text(
863                query
864                    .as_ref()
865                    .map(|query| compute_percentage_of_license_text(query, &detections)),
866            );
867        }
868        Err(e) => {
869            scan_errors.push(format!("License detection failed: {}", e));
870        }
871    }
872
873    Ok(())
874}
875
876fn convert_detection_to_model(
877    detection: &crate::license_detection::LicenseDetection,
878    license_options: LicenseScanOptions,
879    text_content: &str,
880    query: Option<&Query<'_>>,
881) -> (Option<LicenseDetection>, Vec<Match>) {
882    let matches: Vec<Match> = detection
883        .matches
884        .iter()
885        .map(|m| convert_match_to_model(m, license_options, text_content, query))
886        .collect();
887
888    if let Some(license_expression) = detection.license_expression.clone() {
889        (
890            Some(LicenseDetection {
891                license_expression,
892                license_expression_spdx: detection
893                    .license_expression_spdx
894                    .clone()
895                    .unwrap_or_default(),
896                matches,
897                detection_log: if license_options.include_diagnostics {
898                    detection.detection_log.clone()
899                } else {
900                    Vec::new()
901                },
902                identifier: detection.identifier.clone(),
903            }),
904            Vec::new(),
905        )
906    } else {
907        (None, matches)
908    }
909}
910
911fn convert_match_to_model(
912    m: &crate::license_detection::models::LicenseMatch,
913    license_options: LicenseScanOptions,
914    text_content: &str,
915    query: Option<&Query<'_>>,
916) -> Match {
917    let output_metric = |value: f32| ((value as f64) * 100.0).round() / 100.0;
918    let rule_url = if m.rule_url.is_empty() {
919        None
920    } else {
921        Some(m.rule_url.clone())
922    };
923    let matched_text = if license_options.include_text {
924        m.matched_text.clone().or_else(|| {
925            Some(crate::license_detection::query::matched_text_from_text(
926                text_content,
927                m.start_line,
928                m.end_line,
929            ))
930        })
931    } else {
932        None
933    };
934    let matched_text_diagnostics = if license_options.include_text_diagnostics {
935        query.map(|query| matched_text_diagnostics_from_match(query, m))
936    } else {
937        None
938    };
939    Match {
940        license_expression: m.license_expression.clone(),
941        license_expression_spdx: m.license_expression_spdx.clone().unwrap_or_default(),
942        from_file: m.from_file.clone(),
943        start_line: m.start_line,
944        end_line: m.end_line,
945        matcher: Some(m.matcher.to_string()),
946        score: output_metric(m.score),
947        matched_length: Some(m.matched_length),
948        match_coverage: Some(output_metric(m.coverage())),
949        rule_relevance: Some(m.rule_relevance as usize),
950        rule_identifier: Some(m.rule_identifier.clone()),
951        rule_url,
952        matched_text,
953        referenced_filenames: m.referenced_filenames.clone(),
954        matched_text_diagnostics,
955    }
956}
957
958fn compute_percentage_of_license_text(
959    query: &Query<'_>,
960    detections: &[crate::license_detection::LicenseDetection],
961) -> f64 {
962    let matched_positions: std::collections::HashSet<usize> = detections
963        .iter()
964        .flat_map(|detection| detection.matches.iter())
965        .flat_map(InternalLicenseMatch::qspan)
966        .collect();
967
968    let query_tokens_length = query.tokens.len() + query.unknowns_by_pos.values().sum::<usize>();
969    if query_tokens_length == 0 {
970        return 0.0;
971    }
972
973    let percentage = (matched_positions.len() as f64 / query_tokens_length as f64) * 100.0;
974    (percentage * 100.0).round() / 100.0
975}
976
977fn matched_text_diagnostics_from_match(
978    query: &Query<'_>,
979    license_match: &InternalLicenseMatch,
980) -> String {
981    let matched_positions: std::collections::HashSet<usize> =
982        license_match.qspan().into_iter().collect();
983    let Some(start_pos) = matched_positions.iter().min().copied() else {
984        return crate::license_detection::query::matched_text_from_text(
985            &query.text,
986            license_match.start_line,
987            license_match.end_line,
988        );
989    };
990    let Some(end_pos) = matched_positions.iter().max().copied() else {
991        return crate::license_detection::query::matched_text_from_text(
992            &query.text,
993            license_match.start_line,
994            license_match.end_line,
995        );
996    };
997
998    crate::license_detection::query::matched_text_diagnostics_from_text(
999        &query.text,
1000        query,
1001        &matched_positions,
1002        start_pos,
1003        end_pos,
1004        license_match.start_line,
1005        license_match.end_line,
1006    )
1007}
1008
1009fn should_skip_text_detection(path: &Path, buffer: &[u8]) -> bool {
1010    is_pem_certificate_file(path, buffer)
1011}
1012
/// Reports whether `path` is a Go source file that is not production code.
///
/// A file qualifies when its extension is `go` and either
///
/// * its file name ends with `_test.go` (decided without touching the disk), or
/// * one of its first ten lines is a build-constraint comment (starting with
///   `//go:build` or `// +build` after trimming) that contains the standalone,
///   whitespace-separated token `test`.
///
/// Only the leading lines are streamed from disk. Content beyond them is never
/// read or validated, so large files are not loaded in full and bytes outside
/// the inspected header cannot make the check fail.
///
/// # Errors
///
/// Returns the underlying I/O error when the file cannot be opened, or when one
/// of the inspected lines cannot be read (for example because it is not valid
/// UTF-8).
fn is_go_non_production_source(path: &Path) -> std::io::Result<bool> {
    /// Number of leading lines searched for a build constraint.
    const HEADER_LINES: usize = 10;

    if path.extension().and_then(|ext| ext.to_str()) != Some("go") {
        return Ok(false);
    }

    let is_test_file = path
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| name.ends_with("_test.go"));
    if is_test_file {
        return Ok(true);
    }

    // Fully qualified paths keep this block independent of the file's imports.
    let reader = std::io::BufReader::new(File::open(path)?);
    for line in std::io::BufRead::lines(reader).take(HEADER_LINES) {
        let line = line?;
        let trimmed = line.trim();
        let is_build_constraint =
            trimmed.starts_with("//go:build") || trimmed.starts_with("// +build");
        if is_build_constraint && trimmed.split_whitespace().any(|token| token == "test") {
            return Ok(true);
        }
    }

    Ok(false)
}
1033
1034fn is_pem_certificate_file(_path: &Path, buffer: &[u8]) -> bool {
1035    let prefix_len = buffer.len().min(8192);
1036    let prefix = String::from_utf8_lossy(&buffer[..prefix_len]);
1037    let trimmed_lines: Vec<&str> = prefix
1038        .lines()
1039        .map(str::trim)
1040        .filter(|line| !line.is_empty())
1041        .take(64)
1042        .collect();
1043
1044    PEM_CERTIFICATE_HEADERS.iter().any(|(begin, end)| {
1045        trimmed_lines.iter().any(|line| line == begin)
1046            && trimmed_lines.iter().any(|line| line == end)
1047    })
1048}
1049
1050fn process_directory(
1051    path: &Path,
1052    _metadata: &fs::Metadata,
1053    collect_info: bool,
1054    license_enabled: bool,
1055) -> FileInfo {
1056    let name = path
1057        .file_name()
1058        .unwrap_or_default()
1059        .to_string_lossy()
1060        .to_string();
1061    let base_name = name.clone(); // For directories, base_name is the same as name
1062
1063    FileInfo {
1064        name,
1065        base_name,
1066        extension: "".to_string(),
1067        path: path.to_string_lossy().to_string(),
1068        file_type: FileType::Directory,
1069        mime_type: None,
1070        file_type_label: None,
1071        size: 0,
1072        date: None,
1073        sha1: None,
1074        md5: None,
1075        sha256: None,
1076        sha1_git: None,
1077        programming_language: None,
1078        package_data: Vec::new(),
1079        license_expression: None,
1080        license_detections: Vec::new(),
1081        license_clues: Vec::new(),
1082        percentage_of_license_text: license_enabled.then_some(0.0),
1083        copyrights: Vec::new(),
1084        holders: Vec::new(),
1085        authors: Vec::new(),
1086        emails: Vec::new(),
1087        urls: Vec::new(),
1088        for_packages: Vec::new(),
1089        scan_errors: Vec::new(),
1090        license_policy: None,
1091        is_binary: collect_info.then_some(false),
1092        is_text: collect_info.then_some(false),
1093        is_archive: collect_info.then_some(false),
1094        is_media: collect_info.then_some(false),
1095        is_source: collect_info.then_some(false),
1096        is_script: collect_info.then_some(false),
1097        files_count: collect_info.then_some(0),
1098        dirs_count: collect_info.then_some(0),
1099        size_count: collect_info.then_some(0),
1100        source_count: None,
1101        is_legal: false,
1102        is_manifest: false,
1103        is_readme: false,
1104        is_top_level: false,
1105        is_key_file: false,
1106        is_community: false,
1107        is_generated: None,
1108        facets: vec![],
1109        tallies: None,
1110    }
1111}
1112
#[cfg(test)]
mod tests {
    use super::{
        compute_percentage_of_license_text, convert_detection_to_model,
        is_go_non_production_source, scan_cache_fingerprint,
    };
    use crate::license_detection::LicenseDetection as InternalLicenseDetection;
    use crate::license_detection::index::LicenseIndex;
    use crate::license_detection::index::dictionary::TokenDictionary;
    use crate::license_detection::models::{LicenseMatch, MatcherKind, RuleKind};
    use crate::license_detection::query::Query;
    use crate::scanner::LicenseScanOptions;
    use std::fs;
    use tempfile::tempdir;

    /// Rule URL of the MIT license text fixture.
    const MIT_RULE_URL: &str = "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE";
    /// Rule URL of the license-clue rule fixture.
    const CLUE_RULE_URL: &str = "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/license-clue_1.RULE";
    /// Rule URL of the FSF All Permissive license text fixture.
    const FSF_AP_RULE_URL: &str = "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/fsf-ap.LICENSE";

    /// Builds a one-line `mit` hash match with a stored matched text of `MIT`.
    fn make_internal_match(rule_url: &str) -> LicenseMatch {
        LicenseMatch {
            rid: 0,
            license_expression: String::from("mit"),
            license_expression_spdx: Some(String::from("MIT")),
            from_file: None,
            start_line: 1,
            end_line: 1,
            start_token: 0,
            end_token: 1,
            matcher: MatcherKind::Hash,
            score: 1.0,
            matched_length: 3,
            rule_length: 3,
            match_coverage: 100.0,
            rule_relevance: 100,
            rule_identifier: String::from("mit.LICENSE"),
            rule_url: rule_url.to_owned(),
            matched_text: Some(String::from("MIT")),
            referenced_filenames: None,
            rule_kind: RuleKind::Text,
            is_from_license: true,
            matched_token_positions: None,
            hilen: 3,
            rule_start_token: 0,
            qspan_positions: None,
            ispan_positions: None,
            hispan_positions: None,
            candidate_resemblance: 0.0,
            candidate_containment: 0.0,
        }
    }

    /// Builds an `mit` detection holding exactly one match from
    /// [`make_internal_match`].
    fn make_detection(rule_url: &str) -> InternalLicenseDetection {
        InternalLicenseDetection {
            license_expression: Some(String::from("mit")),
            license_expression_spdx: Some(String::from("MIT")),
            matches: vec![make_internal_match(rule_url)],
            detection_log: Vec::new(),
            identifier: Some(String::from("mit-test")),
            file_regions: Vec::new(),
        }
    }

    /// Builds a license index whose dictionary contains only `entries`.
    fn create_test_index(entries: &[(&str, u16)], len_legalese: usize) -> LicenseIndex {
        let mut index = LicenseIndex::new(TokenDictionary::new_with_legalese(entries));
        index.len_legalese = len_legalese;
        index
    }

    /// Converts `detection` with default options, an empty text and no query,
    /// requiring that a detection comes back and that no clues are produced.
    fn convert_with_defaults(
        detection: &InternalLicenseDetection,
    ) -> crate::models::LicenseDetection {
        let (converted, clues) =
            convert_detection_to_model(detection, LicenseScanOptions::default(), "", None);
        let converted = converted.expect("detection should convert");
        assert!(clues.is_empty());
        converted
    }

    /// Writes `contents` to `file_name` inside a fresh temporary directory.
    ///
    /// The directory guard is returned so the file outlives the call.
    fn write_go_fixture(
        file_name: &str,
        contents: &str,
    ) -> (tempfile::TempDir, std::path::PathBuf) {
        let temp_dir = tempdir().unwrap();
        let path = temp_dir.path().join(file_name);
        fs::write(&path, contents).unwrap();
        (temp_dir, path)
    }

    #[test]
    fn test_convert_detection_to_model_preserves_rule_url() {
        let converted = convert_with_defaults(&make_detection(MIT_RULE_URL));

        assert_eq!(
            converted.matches[0].rule_url.as_deref(),
            Some(
                "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE"
            )
        );
    }

    #[test]
    fn test_convert_detection_to_model_emits_null_for_empty_rule_url() {
        let converted = convert_with_defaults(&make_detection(""));

        assert_eq!(converted.matches[0].rule_url, None);
    }

    #[test]
    fn test_convert_detection_to_model_rounds_match_coverage() {
        let mut detection = make_detection("");
        {
            let first_match = &mut detection.matches[0];
            first_match.score = 81.82;
            first_match.match_coverage = 33.334;
        }

        let converted = convert_with_defaults(&detection);

        assert_eq!(converted.matches[0].score, 81.82);
        assert_eq!(converted.matches[0].match_coverage, Some(33.33));
    }

    #[test]
    fn test_convert_detection_to_model_routes_expressionless_detection_to_license_clues() {
        let mut detection = make_detection(CLUE_RULE_URL);
        detection.license_expression = None;
        detection.license_expression_spdx = None;
        detection.identifier = None;
        {
            let clue_match = &mut detection.matches[0];
            clue_match.license_expression = String::from("unknown-license-reference");
            clue_match.license_expression_spdx =
                Some(String::from("LicenseRef-scancode-unknown-license-reference"));
            clue_match.rule_identifier = String::from("license-clue_1.RULE");
            clue_match.rule_kind = RuleKind::Clue;
        }

        let options = LicenseScanOptions {
            include_text: true,
            min_score: 0,
            ..LicenseScanOptions::default()
        };
        let (converted, clues) = convert_detection_to_model(&detection, options, "clue text", None);

        assert!(converted.is_none());
        assert_eq!(clues.len(), 1);

        let clue = &clues[0];
        assert_eq!(clue.license_expression, "unknown-license-reference");
        assert_eq!(
            clue.license_expression_spdx,
            "LicenseRef-scancode-unknown-license-reference"
        );
        assert_eq!(clue.rule_identifier.as_deref(), Some("license-clue_1.RULE"));
        assert_eq!(clue.matched_text.as_deref(), Some("MIT"));
        assert_eq!(clue.matched_text_diagnostics, None);
    }

    #[test]
    fn test_convert_detection_to_model_includes_diagnostics_when_enabled() {
        let text = concat!(
            "Reproduction and distribution of this file, with or without modification, are\n",
            "permitted in any medium without royalties provided the copyright notice\n",
            "and this notice are preserved. This file is offered as-is, without any warranties.\n",
        );

        // Dictionary ids follow the position of each word in this list.
        let legalese_words = [
            "reproduction",
            "distribution",
            "file",
            "without",
            "modification",
            "permitted",
            "medium",
            "royalties",
            "provided",
            "copyright",
            "notice",
            "preserved",
            "offered",
            "warranties",
        ];
        let entries: Vec<(&str, u16)> = legalese_words
            .iter()
            .enumerate()
            .map(|(id, word)| (*word, id as u16))
            .collect();
        let index = create_test_index(&entries, 14);
        let query = Query::from_extracted_text(text, &index, false).expect("query should build");

        let mut detection = make_detection(FSF_AP_RULE_URL);
        detection.detection_log = vec![String::from("imperfect-match-coverage")];
        detection.identifier = Some(String::from("fsf_ap-test"));
        {
            let token_count = query.tokens.len();
            let fsf_match = &mut detection.matches[0];
            fsf_match.license_expression = String::from("fsf-ap");
            fsf_match.license_expression_spdx = Some(String::from("FSFAP"));
            fsf_match.rule_identifier = String::from("fsf-ap.LICENSE");
            fsf_match.matched_text = None;
            fsf_match.start_line = 1;
            fsf_match.end_line = 3;
            fsf_match.start_token = 0;
            fsf_match.end_token = token_count;
            // Leave a single hole at position 9 so the match is imperfect.
            fsf_match.qspan_positions = Some((0..token_count).filter(|&pos| pos != 9).collect());
        }

        let options = LicenseScanOptions {
            include_text: true,
            include_text_diagnostics: true,
            include_diagnostics: true,
            unknown_licenses: false,
            min_score: 0,
        };
        let (converted, clues) = convert_detection_to_model(&detection, options, text, Some(&query));
        let converted = converted.expect("detection should convert");

        assert!(clues.is_empty());
        assert_eq!(converted.detection_log, vec!["imperfect-match-coverage"]);

        let first_match = &converted.matches[0];
        assert_eq!(first_match.matched_text.as_deref(), Some(text.trim_end()));

        let diagnostics = first_match
            .matched_text_diagnostics
            .as_deref()
            .expect("diagnostics should be present");
        assert!(diagnostics.contains('['));
        assert!(diagnostics.contains(']'));
        assert_ne!(diagnostics, text.trim_end());
    }

    #[test]
    fn test_compute_percentage_of_license_text_counts_unknown_tokens() {
        let index = create_test_index(&[("alpha", 0), ("mit", 1)], 2);
        let query = Query::from_extracted_text("alpha MIT omega", &index, false)
            .expect("query should build");

        let mut detection = make_detection("");
        {
            let only_match = &mut detection.matches[0];
            only_match.qspan_positions = Some(vec![1]);
            only_match.start_token = 1;
            only_match.end_token = 2;
        }

        assert_eq!(
            compute_percentage_of_license_text(&query, &[detection]),
            33.33
        );
    }

    #[test]
    fn test_scan_cache_fingerprint_changes_with_license_score() {
        let text_options = crate::scanner::TextDetectionOptions::default();
        let fingerprint_for = |min_score| {
            scan_cache_fingerprint(
                &text_options,
                LicenseScanOptions {
                    min_score,
                    ..LicenseScanOptions::default()
                },
                true,
            )
        };

        let default_fingerprint = fingerprint_for(0);
        let filtered_fingerprint = fingerprint_for(70);

        assert_ne!(default_fingerprint, filtered_fingerprint);
    }

    #[test]
    fn test_is_go_non_production_source_for_test_filename() {
        let (_guard, path) = write_go_fixture("scanner_test.go", "package scanner\n");

        assert!(is_go_non_production_source(&path).unwrap());
    }

    #[test]
    fn test_is_go_non_production_source_for_build_tag() {
        let (_guard, path) =
            write_go_fixture("scanner.go", "//go:build test\n\npackage scanner\n");

        assert!(is_go_non_production_source(&path).unwrap());
    }

    #[test]
    fn test_is_go_non_production_source_for_regular_go_file() {
        let (_guard, path) = write_go_fixture("scanner.go", "package scanner\n");

        assert!(!is_go_non_production_source(&path).unwrap());
    }
}