provenant/scanner/
process.rs

1use crate::license_detection::LicenseDetectionEngine;
2use crate::parsers::compiled_binary::{
3    is_supported_compiled_binary_format, try_parse_compiled_bytes,
4};
5use crate::parsers::try_parse_file;
6use crate::parsers::windows_executable::try_parse_windows_executable_bytes;
7use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha1_git, calculate_sha256};
8use crate::utils::text::{
9    remove_verbatim_escape_sequences, should_remove_verbatim_escape_sequences,
10};
11use anyhow::Error;
12use rayon::prelude::*;
13use std::collections::HashSet;
14use std::fs::{self, File};
15use std::io::{Read, Write};
16use std::path::Path;
17use std::sync::Arc;
18use std::time::{Duration, Instant};
19
20use crate::copyright::{
21    self, AuthorDetection, CopyrightDetection, CopyrightDetectionOptions, HolderDetection,
22};
23use crate::finder::{self, DetectionConfig};
24use crate::license_detection::PositionSet;
25use crate::license_detection::models::LicenseMatch as InternalLicenseMatch;
26use crate::license_detection::query::Query;
27use crate::models::{
28    Author, Copyright, DatasourceId, FileInfo, FileInfoBuilder, FileType, Holder, LicenseDetection,
29    Match, OutputEmail, OutputURL,
30};
31use crate::parsers::utils::split_name_email;
32use crate::progress::ScanProgress;
33use crate::scanner::collect::CollectedPaths;
34use crate::scanner::{LicenseScanOptions, ProcessResult, TextDetectionOptions};
35use crate::utils::file::{
36    ExtractedTextKind, augment_license_detection_text, classify_file_info,
37    extract_text_for_detection_with_diagnostics, get_creation_date,
38};
39use crate::utils::generated::generated_code_hints_from_bytes;
40use tempfile::TempDir;
41
/// Begin/end marker pairs that delimit PEM-encoded certificate blocks.
///
/// Each tuple is `(begin_marker, end_marker)`.
const PEM_CERTIFICATE_HEADERS: &[(&str, &str)] = &[
    ("-----BEGIN CERTIFICATE-----", "-----END CERTIFICATE-----"),
    (
        "-----BEGIN TRUSTED CERTIFICATE-----",
        "-----END TRUSTED CERTIFICATE-----",
    ),
];
49
50pub fn process_collected(
51    collected: &CollectedPaths,
52    progress: Arc<ScanProgress>,
53    license_engine: Option<Arc<LicenseDetectionEngine>>,
54    license_options: LicenseScanOptions,
55    text_options: &TextDetectionOptions,
56) -> ProcessResult {
57    let mut all_files: Vec<FileInfo> = collected
58        .files
59        .par_iter()
60        .map(|(path, metadata)| {
61            let file_entry = process_file(
62                path,
63                metadata,
64                progress.as_ref(),
65                license_engine.clone(),
66                license_options,
67                text_options,
68            );
69            progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
70            file_entry
71        })
72        .collect();
73
74    for (path, metadata) in &collected.directories {
75        all_files.push(process_directory(
76            path,
77            metadata,
78            text_options.collect_info,
79            license_engine.is_some(),
80        ));
81    }
82
83    ProcessResult {
84        files: all_files,
85        excluded_count: collected.excluded_count,
86    }
87}
88
89pub fn process_collected_with_memory_limit(
90    collected: &CollectedPaths,
91    progress: Arc<ScanProgress>,
92    license_engine: Option<Arc<LicenseDetectionEngine>>,
93    license_options: LicenseScanOptions,
94    text_options: &TextDetectionOptions,
95    max_in_memory: i64,
96) -> ProcessResult {
97    if max_in_memory == 0 {
98        return process_collected(
99            collected,
100            progress,
101            license_engine,
102            license_options,
103            text_options,
104        );
105    }
106
107    let memory_limit = if max_in_memory < 0 {
108        0
109    } else {
110        max_in_memory as usize
111    };
112    let chunk_size = if max_in_memory < 0 {
113        256
114    } else {
115        memory_limit.max(1)
116    };
117
118    let mut retained_files = Vec::new();
119    let mut spill_store = None;
120
121    for chunk in collected.files.chunks(chunk_size) {
122        let processed_chunk: Vec<FileInfo> = chunk
123            .par_iter()
124            .map(|(path, metadata)| {
125                let file_entry = process_file(
126                    path,
127                    metadata,
128                    progress.as_ref(),
129                    license_engine.clone(),
130                    license_options,
131                    text_options,
132                );
133                progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
134                file_entry
135            })
136            .collect();
137
138        retain_or_spill_chunk(
139            processed_chunk,
140            &mut retained_files,
141            &mut spill_store,
142            memory_limit,
143        );
144    }
145
146    for (path, metadata) in &collected.directories {
147        let entry = process_directory(
148            path,
149            metadata,
150            text_options.collect_info,
151            license_engine.is_some(),
152        );
153        retain_or_spill_chunk(
154            vec![entry],
155            &mut retained_files,
156            &mut spill_store,
157            memory_limit,
158        );
159    }
160
161    if let Some(spill_store) = spill_store {
162        retained_files.extend(spill_store.load_all());
163    }
164
165    ProcessResult {
166        files: retained_files,
167        excluded_count: collected.excluded_count,
168    }
169}
170
171fn retain_or_spill_chunk(
172    chunk: Vec<FileInfo>,
173    retained_files: &mut Vec<FileInfo>,
174    spill_store: &mut Option<FileInfoSpillStore>,
175    memory_limit: usize,
176) {
177    if memory_limit == 0 {
178        spill_store
179            .get_or_insert_with(FileInfoSpillStore::new)
180            .spill(chunk);
181        return;
182    }
183
184    let remaining_capacity = memory_limit.saturating_sub(retained_files.len());
185    if remaining_capacity >= chunk.len() && spill_store.is_none() {
186        retained_files.extend(chunk);
187        return;
188    }
189
190    let mut chunk_iter = chunk.into_iter();
191    retained_files.extend(chunk_iter.by_ref().take(remaining_capacity));
192    let overflow: Vec<FileInfo> = chunk_iter.collect();
193    if !overflow.is_empty() {
194        spill_store
195            .get_or_insert_with(FileInfoSpillStore::new)
196            .spill(overflow);
197    }
198}
199
200struct FileInfoSpillStore {
201    temp_dir: TempDir,
202    batch_index: usize,
203}
204
205impl FileInfoSpillStore {
206    fn new() -> Self {
207        Self {
208            temp_dir: TempDir::new().expect("create spill dir"),
209            batch_index: 0,
210        }
211    }
212
213    fn spill(&mut self, files: Vec<FileInfo>) {
214        let path = self
215            .temp_dir
216            .path()
217            .join(format!("batch-{:06}.json.zst", self.batch_index));
218        self.batch_index += 1;
219
220        let payload = serde_json::to_vec(&files).expect("encode spilled file batch");
221        let file = File::create(path).expect("create spill batch file");
222        let mut encoder = zstd::Encoder::new(file, 3).expect("create spill encoder");
223        encoder
224            .write_all(&payload)
225            .expect("write spilled file batch");
226        encoder.finish().expect("finish spill encoder");
227    }
228
229    fn load_all(self) -> Vec<FileInfo> {
230        let mut paths: Vec<_> = fs::read_dir(self.temp_dir.path())
231            .expect("read spill dir")
232            .filter_map(Result::ok)
233            .map(|entry| entry.path())
234            .collect();
235        paths.sort();
236
237        let mut files = Vec::new();
238        for path in paths {
239            let file = File::open(path).expect("open spill batch");
240            let mut decoder = zstd::Decoder::new(file).expect("create spill decoder");
241            let mut payload = Vec::new();
242            decoder.read_to_end(&mut payload).expect("read spill batch");
243            let mut batch: Vec<FileInfo> =
244                serde_json::from_slice(&payload).expect("decode spilled file batch");
245            files.append(&mut batch);
246        }
247        files
248    }
249}
250
251fn process_file(
252    path: &Path,
253    metadata: &fs::Metadata,
254    progress: &ScanProgress,
255    license_engine: Option<Arc<LicenseDetectionEngine>>,
256    license_options: LicenseScanOptions,
257    text_options: &TextDetectionOptions,
258) -> FileInfo {
259    let mut scan_errors: Vec<String> = vec![];
260    let mut file_info_builder = FileInfoBuilder::default();
261    let license_enabled = license_engine.is_some();
262
263    let started = Instant::now();
264
265    let mut generated_flag = None;
266    let mut is_source_file = false;
267    match extract_information_from_content(
268        &mut file_info_builder,
269        &mut scan_errors,
270        path,
271        progress,
272        license_engine,
273        license_options,
274        text_options,
275    ) {
276        Ok((is_generated, sha256, is_source)) => {
277            generated_flag = is_generated;
278            is_source_file = is_source;
279            let _ = sha256;
280        }
281        Err(e) => scan_errors.push(e.to_string()),
282    };
283
284    maybe_record_processing_timeout(&mut scan_errors, started, text_options.timeout_seconds);
285
286    let mut file_info = file_info_builder
287        .name(path.file_name().unwrap().to_string_lossy().to_string())
288        .base_name(
289            path.file_stem()
290                .unwrap_or_default()
291                .to_string_lossy()
292                .to_string(),
293        )
294        .extension(
295            path.extension()
296                .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
297        )
298        .path(path.to_string_lossy().to_string())
299        .file_type(FileType::File)
300        .size(metadata.len())
301        .date(
302            text_options
303                .collect_info
304                .then(|| get_creation_date(metadata))
305                .flatten(),
306        )
307        .scan_errors(scan_errors)
308        .build()
309        .expect("FileInformationBuild not completely initialized");
310
311    if text_options.collect_info {
312        file_info.is_source = Some(is_source_file);
313    }
314
315    if file_info.programming_language.as_deref() == Some("Go")
316        && is_go_non_production_source(path).unwrap_or(false)
317    {
318        file_info.is_source = Some(false);
319    }
320
321    if text_options.detect_generated {
322        file_info.is_generated = Some(generated_flag.unwrap_or(false));
323    }
324
325    if file_info.percentage_of_license_text.is_none() && license_enabled {
326        file_info.percentage_of_license_text = Some(0.0);
327    }
328
329    file_info
330}
331
332fn extract_information_from_content(
333    file_info_builder: &mut FileInfoBuilder,
334    scan_errors: &mut Vec<String>,
335    path: &Path,
336    progress: &ScanProgress,
337    license_engine: Option<Arc<LicenseDetectionEngine>>,
338    license_options: LicenseScanOptions,
339    text_options: &TextDetectionOptions,
340) -> Result<(Option<bool>, String, bool), Error> {
341    let started = Instant::now();
342    let buffer = fs::read(path)?;
343    let license_enabled = license_engine.is_some();
344
345    if is_timeout_exceeded(started, text_options.timeout_seconds) {
346        return Err(Error::msg(format!(
347            "Timeout while reading file content (> {:.2}s)",
348            text_options.timeout_seconds
349        )));
350    }
351
352    let sha256 = calculate_sha256(&buffer);
353    let is_generated = text_options
354        .detect_generated
355        .then(|| !generated_code_hints_from_bytes(&buffer).is_empty());
356    let classification = classify_file_info(path, &buffer);
357
358    if text_options.collect_info {
359        file_info_builder
360            .sha1(Some(calculate_sha1(&buffer)))
361            .md5(Some(calculate_md5(&buffer)))
362            .sha256(Some(sha256.clone()))
363            .programming_language(classification.programming_language.clone())
364            .mime_type(Some(classification.mime_type.clone()))
365            .file_type_label(Some(classification.file_type.clone()))
366            .sha1_git(Some(calculate_sha1_git(&buffer)))
367            .is_binary(Some(classification.is_binary))
368            .is_text(Some(classification.is_text))
369            .is_archive(Some(classification.is_archive))
370            .is_media(Some(classification.is_media))
371            .is_source(Some(classification.is_source))
372            .is_script(Some(classification.is_script))
373            .files_count(Some(0))
374            .dirs_count(Some(0))
375            .size_count(Some(0));
376    }
377
378    if should_skip_text_detection(path, &buffer) {
379        return Ok((is_generated, sha256, classification.is_source));
380    }
381
382    // Package parsing and text-based detection (copyright, license) are independent.
383    // Python ScanCode runs all enabled plugins on every file, so we do the same.
384    if text_options.detect_packages {
385        let started = Instant::now();
386        let parse_result = try_parse_file(path)
387            .or_else(|| {
388                text_options
389                    .detect_application_packages
390                    .then(|| try_parse_windows_executable_bytes(path, &buffer))
391                    .flatten()
392            })
393            .or_else(|| {
394                text_options
395                    .detect_packages_in_compiled
396                    .then(|| {
397                        (classification.is_binary && is_supported_compiled_binary_format(&buffer))
398                            .then(|| try_parse_compiled_bytes(&buffer))
399                            .flatten()
400                    })
401                    .flatten()
402            });
403
404        if let Some(parse_result) = parse_result {
405            let packages = parse_result
406                .packages
407                .into_iter()
408                .filter(|package| {
409                    let is_compiled_package = package
410                        .datasource_id
411                        .as_ref()
412                        .is_some_and(is_compiled_datasource);
413                    let is_system_package = package
414                        .datasource_id
415                        .as_ref()
416                        .is_some_and(is_system_datasource);
417                    if is_compiled_package {
418                        text_options.detect_packages_in_compiled
419                    } else if is_system_package {
420                        text_options.detect_system_packages
421                    } else {
422                        text_options.detect_application_packages
423                    }
424                })
425                .collect();
426            file_info_builder.package_data(packages);
427            scan_errors.extend(parse_result.scan_errors);
428        }
429        progress.record_detail_timing("scan:packages", started.elapsed().as_secs_f64());
430    }
431
432    if is_timeout_exceeded(started, text_options.timeout_seconds) {
433        return Err(Error::msg(format!(
434            "Timeout while extracting package/text metadata (> {:.2}s)",
435            text_options.timeout_seconds
436        )));
437    }
438
439    let (text_content, text_kind, text_scan_error) =
440        extract_text_for_detection_with_diagnostics(path, &buffer);
441    if let Some(text_scan_error) = text_scan_error {
442        scan_errors.push(text_scan_error);
443    }
444    let from_binary_strings = matches!(text_kind, ExtractedTextKind::BinaryStrings);
445
446    if is_timeout_exceeded(started, text_options.timeout_seconds) {
447        return Err(Error::msg(format!(
448            "Timeout while extracting text content (> {:.2}s)",
449            text_options.timeout_seconds
450        )));
451    }
452
453    if text_content.is_empty() {
454        return Ok((is_generated, sha256, classification.is_source));
455    }
456
457    if text_options.detect_copyrights {
458        extract_copyright_information(
459            file_info_builder,
460            path,
461            &text_content,
462            text_options.timeout_seconds,
463            from_binary_strings,
464        );
465    }
466    extract_email_url_information(
467        file_info_builder,
468        &text_content,
469        text_options,
470        from_binary_strings,
471    );
472
473    if is_timeout_exceeded(started, text_options.timeout_seconds) {
474        return Err(Error::msg(format!(
475            "Timeout before license scan (> {:.2}s)",
476            text_options.timeout_seconds
477        )));
478    }
479    // Handle source map files specially
480    let text_content_for_license_detection = if crate::utils::sourcemap::is_sourcemap(path) {
481        if let Some(sourcemap_content) =
482            crate::utils::sourcemap::extract_sourcemap_content(&text_content)
483        {
484            sourcemap_content
485        } else {
486            text_content
487        }
488    } else if should_remove_verbatim_escape_sequences(path, classification.is_source) {
489        remove_verbatim_escape_sequences(&text_content)
490    } else {
491        text_content
492    };
493    let text_content_for_license_detection =
494        augment_license_detection_text(path, &text_content_for_license_detection);
495    let text_content_for_license_detection = text_content_for_license_detection.into_owned();
496
497    if license_enabled {
498        let started = Instant::now();
499        extract_license_information(
500            file_info_builder,
501            scan_errors,
502            path,
503            text_content_for_license_detection.clone(),
504            license_engine,
505            license_options,
506            from_binary_strings,
507        )?;
508        progress.record_detail_timing("scan:licenses", started.elapsed().as_secs_f64());
509    } else {
510        extract_license_information(
511            file_info_builder,
512            scan_errors,
513            path,
514            text_content_for_license_detection,
515            license_engine,
516            license_options,
517            from_binary_strings,
518        )?;
519    }
520
521    Ok((is_generated, sha256, classification.is_source))
522}
523
/// Returns `true` when more than `timeout_seconds` have elapsed since `started`.
///
/// A timeout that is not finite (NaN, infinity) or not strictly positive is
/// treated as "no timeout" and never reports as exceeded.
fn is_timeout_exceeded(started: Instant, timeout_seconds: f64) -> bool {
    if !timeout_seconds.is_finite() || timeout_seconds <= 0.0 {
        return false;
    }
    started.elapsed().as_secs_f64() > timeout_seconds
}
529
530fn maybe_record_processing_timeout(
531    scan_errors: &mut Vec<String>,
532    started: Instant,
533    timeout_seconds: f64,
534) {
535    if is_timeout_exceeded(started, timeout_seconds)
536        && !scan_errors.iter().any(|error| is_timeout_scan_error(error))
537    {
538        scan_errors.push(format!(
539            "Processing interrupted due to timeout after {:.2} seconds",
540            timeout_seconds
541        ));
542    }
543}
544
/// Returns `true` when `error` contains one of the timeout message fragments
/// used by the scan error messages in this file.
fn is_timeout_scan_error(error: &str) -> bool {
    const TIMEOUT_MARKERS: [&str; 3] = [
        "Timeout while ",
        "Timeout before ",
        "Processing interrupted due to timeout",
    ];

    TIMEOUT_MARKERS.iter().any(|marker| error.contains(*marker))
}
550
551fn is_system_datasource(datasource_id: &DatasourceId) -> bool {
552    matches!(
553        datasource_id,
554        DatasourceId::AlpineInstalledDb
555            | DatasourceId::DebianDistrolessInstalledDb
556            | DatasourceId::DebianInstalledFilesList
557            | DatasourceId::DebianInstalledMd5Sums
558            | DatasourceId::DebianInstalledStatusDb
559            | DatasourceId::FreebsdCompactManifest
560            | DatasourceId::RpmInstalledDatabaseBdb
561            | DatasourceId::RpmInstalledDatabaseNdb
562            | DatasourceId::RpmInstalledDatabaseSqlite
563            | DatasourceId::RpmYumdb
564    )
565}
566
567fn is_compiled_datasource(datasource_id: &DatasourceId) -> bool {
568    matches!(
569        datasource_id,
570        DatasourceId::GoBinary | DatasourceId::RustBinary
571    )
572}
573
574fn extract_copyright_information(
575    file_info_builder: &mut FileInfoBuilder,
576    path: &Path,
577    text_content: &str,
578    timeout_seconds: f64,
579    from_binary_strings: bool,
580) {
581    // CREDITS files get special handling (Linux kernel style).
582    if copyright::is_credits_file(path) {
583        let author_detections = copyright::detect_credits_authors(text_content);
584        if !author_detections.is_empty() {
585            file_info_builder.authors(
586                author_detections
587                    .into_iter()
588                    .map(|a| Author {
589                        author: a.author,
590                        start_line: a.start_line,
591                        end_line: a.end_line,
592                    })
593                    .collect(),
594            );
595            return;
596        }
597    }
598
599    let copyright_options = CopyrightDetectionOptions {
600        max_runtime: if timeout_seconds.is_finite() && timeout_seconds > 0.0 {
601            Some(Duration::from_secs_f64(timeout_seconds))
602        } else {
603            None
604        },
605        ..CopyrightDetectionOptions::default()
606    };
607
608    let (copyrights, holders, authors) =
609        copyright::detect_copyrights_with_options(text_content, &copyright_options);
610    let (copyrights, holders, authors) = if from_binary_strings {
611        prune_binary_string_detections(text_content, copyrights, holders, authors)
612    } else {
613        (copyrights, holders, authors)
614    };
615
616    file_info_builder.copyrights(
617        copyrights
618            .into_iter()
619            .map(|c| Copyright {
620                copyright: c.copyright,
621                start_line: c.start_line,
622                end_line: c.end_line,
623            })
624            .collect::<Vec<Copyright>>(),
625    );
626    file_info_builder.holders(
627        holders
628            .into_iter()
629            .map(|h| Holder {
630                holder: h.holder,
631                start_line: h.start_line,
632                end_line: h.end_line,
633            })
634            .collect::<Vec<Holder>>(),
635    );
636    file_info_builder.authors(
637        authors
638            .into_iter()
639            .map(|a| Author {
640                author: a.author,
641                start_line: a.start_line,
642                end_line: a.end_line,
643            })
644            .collect::<Vec<Author>>(),
645    );
646}
647
648fn prune_binary_string_detections(
649    text_content: &str,
650    copyrights: Vec<CopyrightDetection>,
651    holders: Vec<HolderDetection>,
652    authors: Vec<AuthorDetection>,
653) -> (
654    Vec<CopyrightDetection>,
655    Vec<HolderDetection>,
656    Vec<AuthorDetection>,
657) {
658    let kept_copyrights: Vec<CopyrightDetection> = copyrights
659        .into_iter()
660        .filter(|c| is_binary_string_copyright_candidate(&c.copyright))
661        .collect();
662
663    let kept_holders: Vec<HolderDetection> = holders
664        .into_iter()
665        .filter(|holder| {
666            kept_copyrights.iter().any(|copyright| {
667                ranges_overlap(
668                    holder.start_line,
669                    holder.end_line,
670                    copyright.start_line,
671                    copyright.end_line,
672                )
673            })
674        })
675        .collect();
676
677    let kept_authors = authors
678        .into_iter()
679        .filter(|author| is_binary_string_author_candidate(&author.author))
680        .chain(extract_binary_string_author_supplements(text_content))
681        .filter({
682            let mut seen = HashSet::new();
683            move |author| seen.insert(author.author.clone())
684        })
685        .collect();
686
687    (kept_copyrights, kept_holders, kept_authors)
688}
689
/// Returns `true` when the inclusive ranges `a_start..=a_end` and
/// `b_start..=b_end` share at least one value.
fn ranges_overlap(a_start: usize, a_end: usize, b_start: usize, b_end: usize) -> bool {
    let a_entirely_before_b = a_end < b_start;
    let b_entirely_before_a = b_end < a_start;
    !(a_entirely_before_b || b_entirely_before_a)
}
693
694fn is_binary_string_copyright_candidate(text: &str) -> bool {
695    if contains_year(text) {
696        return true;
697    }
698
699    let trimmed = text.trim();
700    let lower = trimmed.to_ascii_lowercase();
701    let tail = if let Some(tail) = lower.strip_prefix("copyright") {
702        tail.trim()
703    } else {
704        lower.trim()
705    };
706    let original_tail = if lower.starts_with("copyright") {
707        trimmed["copyright".len()..].trim()
708    } else {
709        trimmed
710    };
711
712    if tail.is_empty() || !has_sufficient_alphabetic_content(tail) || has_excessive_at_noise(tail) {
713        return false;
714    }
715
716    let alpha_tokens: Vec<&str> = tail
717        .split_whitespace()
718        .filter(|token| token.chars().any(|c| c.is_alphabetic()))
719        .collect();
720
721    if alpha_tokens.len() <= 1 {
722        return has_explicit_copyright_marker(text)
723            && alpha_tokens.iter().any(|token| {
724                is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric()))
725            });
726    }
727
728    if !has_explicit_copyright_marker(text) {
729        return false;
730    }
731
732    has_binary_name_like_shape(original_tail)
733}
734
735fn extract_binary_string_author_supplements(text_content: &str) -> Vec<AuthorDetection> {
736    let mut authors = Vec::new();
737
738    for (line_index, line) in text_content.lines().enumerate() {
739        if let Some(author) = extract_named_author_from_binary_line(line) {
740            authors.push(AuthorDetection {
741                author,
742                start_line: line_index + 1,
743                end_line: line_index + 1,
744            });
745        }
746    }
747
748    authors
749}
750
751fn extract_named_author_from_binary_line(line: &str) -> Option<String> {
752    let line = line.trim();
753    if line.is_empty() {
754        return None;
755    }
756
757    let emails = finder::find_emails(
758        line,
759        &DetectionConfig {
760            max_emails: 4,
761            max_urls: 0,
762            unique: false,
763        },
764    );
765    let email = emails.first()?.email.as_str();
766    if !is_binary_string_email_candidate(email) {
767        return None;
768    }
769
770    let lower_line = line.to_ascii_lowercase();
771    let email_start = lower_line.find(email)?;
772    let raw_prefix = &line[..email_start];
773    let has_author_marker = contains_binary_author_marker(raw_prefix);
774    let prefix = take_suffix_after_last_author_marker(raw_prefix)?;
775    let prefix = prefix
776        .trim_start_matches(['*', '-', ':', ';', ',', '.', ' '])
777        .trim_end_matches(['<', '(', '[', ' ', ':', '-'])
778        .trim();
779
780    let (name, _) = split_name_email(prefix);
781    let name = name.or_else(|| {
782        let trimmed = prefix.trim_matches(|c: char| c == '<' || c == '(' || c == '[' || c == ' ');
783        (!trimmed.is_empty()).then(|| trimmed.to_string())
784    });
785
786    let Some(name) = name.map(|name| name.trim().to_string()) else {
787        if has_author_marker {
788            return Some(email.to_string());
789        }
790        return None;
791    };
792
793    if name.is_empty() && has_author_marker {
794        return Some(email.to_string());
795    }
796
797    if !has_binary_name_like_shape(&name) {
798        return None;
799    }
800
801    if line.contains(&format!("<{email}>")) {
802        Some(format!("{name} <{email}>"))
803    } else if line.contains(&format!("({email})")) {
804        Some(format!("{name} ({email})"))
805    } else {
806        Some(format!("{name} {email}"))
807    }
808}
809
/// Returns the trimmed part of `text` that follows the last occurrence of
/// `marker`, or `None` when `marker` does not occur.
///
/// The search runs on an ASCII-lowercased copy of `text`, so a lowercase
/// `marker` matches regardless of the ASCII case used in `text`. ASCII
/// lowercasing does not move byte offsets, which makes the match position
/// valid for slicing the original `text`.
fn take_suffix_after_last_ascii_marker<'a>(text: &'a str, marker: &str) -> Option<&'a str> {
    text.to_ascii_lowercase()
        .rfind(marker)
        .map(|marker_start| {
            let suffix_start = marker_start + marker.len();
            text[suffix_start..].trim()
        })
}
815
816fn take_suffix_after_last_author_marker(text: &str) -> Option<&str> {
817    const MARKERS: &[&str] = &[
818        " patch author: ",
819        " patch author ",
820        " written by ",
821        " contributed by ",
822        " original work done by ",
823        " work done by ",
824        " thanks to ",
825        " review by ",
826        " by ",
827        " from ",
828    ];
829
830    MARKERS
831        .iter()
832        .filter_map(|marker| take_suffix_after_last_ascii_marker(text, marker))
833        .next()
834}
835
836fn contains_binary_author_marker(text: &str) -> bool {
837    take_suffix_after_last_author_marker(text).is_some()
838}
839
840fn has_binary_name_like_shape(text: &str) -> bool {
841    let trimmed = text.trim();
842    if trimmed.is_empty() || trimmed.contains(" - ") || trimmed.chars().any(|c| c.is_ascii_digit())
843    {
844        return false;
845    }
846
847    let tokens: Vec<&str> = trimmed
848        .split(|c: char| !c.is_ascii_alphabetic() && c != '.' && c != '\'')
849        .filter(|segment| segment.chars().any(|c| c.is_ascii_alphabetic()))
850        .collect();
851    if tokens.is_empty() {
852        return false;
853    }
854
855    let uppercase_like = tokens
856        .iter()
857        .filter(|token| {
858            let token = token.trim_matches('.');
859            token
860                .chars()
861                .find(|c| c.is_ascii_alphabetic())
862                .is_some_and(|c| c.is_ascii_uppercase())
863        })
864        .count();
865
866    uppercase_like >= 2 && uppercase_like * 2 >= tokens.len()
867        || tokens
868            .iter()
869            .any(|token| is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric())))
870}
871
/// Returns `true` when `text` contains at least one ASCII alphanumeric
/// character and at least half of its ASCII alphanumerics are letters.
fn has_sufficient_alphabetic_content(text: &str) -> bool {
    // Count letters and alphanumerics in a single pass.
    let (alpha_count, alnum_count) =
        text.chars()
            .fold((0usize, 0usize), |(alpha, alnum), c| {
                (
                    alpha + usize::from(c.is_ascii_alphabetic()),
                    alnum + usize::from(c.is_ascii_alphanumeric()),
                )
            });

    alnum_count != 0 && alpha_count * 2 >= alnum_count
}
881
/// Returns `true` when `text` contains three or more `@` characters.
fn has_excessive_at_noise(text: &str) -> bool {
    const NOISE_THRESHOLD: usize = 3;
    text.matches('@').count() >= NOISE_THRESHOLD
}
885
/// Returns `true` when `text` contains `(c)`, `©` or `copr`, compared without
/// regard to ASCII case.
fn has_explicit_copyright_marker(text: &str) -> bool {
    const MARKERS: [&str; 3] = ["(c)", "©", "copr"];

    let lower = text.to_ascii_lowercase();
    MARKERS.iter().any(|marker| lower.contains(*marker))
}
890
/// Returns `true` when `text` contains four consecutive ASCII digits whose
/// first digit is `1` or `2` and whose second digit is `9` or `0`
/// (e.g. `1999`, `2024`).
fn contains_year(text: &str) -> bool {
    text.as_bytes().windows(4).any(|window| {
        matches!(
            window,
            [b'1' | b'2', b'9' | b'0', third, fourth]
                if third.is_ascii_digit() && fourth.is_ascii_digit()
        )
    })
}
899
/// Returns `true` when `token` equals, ignoring ASCII case, one of the
/// organisation-style words listed below.
fn is_company_like_suffix(token: &str) -> bool {
    const COMPANY_SUFFIXES: [&str; 14] = [
        "inc",
        "corp",
        "corporation",
        "co",
        "company",
        "ltd",
        "llc",
        "gmbh",
        "foundation",
        "project",
        "systems",
        "software",
        "technologies",
        "technology",
    ];

    COMPANY_SUFFIXES
        .iter()
        .any(|suffix| token.eq_ignore_ascii_case(suffix))
}
919
920fn extract_email_url_information(
921    file_info_builder: &mut FileInfoBuilder,
922    text_content: &str,
923    text_options: &TextDetectionOptions,
924    from_binary_strings: bool,
925) {
926    if !text_options.detect_emails && !text_options.detect_urls {
927        return;
928    }
929
930    if text_options.detect_emails {
931        let config = DetectionConfig {
932            max_emails: text_options.max_emails,
933            max_urls: text_options.max_urls,
934            unique: from_binary_strings,
935        };
936        let emails = finder::find_emails(text_content, &config)
937            .into_iter()
938            .filter(|d| !from_binary_strings || is_binary_string_email_candidate(&d.email))
939            .map(|d| OutputEmail {
940                email: d.email,
941                start_line: d.start_line,
942                end_line: d.end_line,
943            })
944            .collect::<Vec<_>>();
945        file_info_builder.emails(emails);
946    }
947
948    if text_options.detect_urls {
949        let config = DetectionConfig {
950            max_emails: text_options.max_emails,
951            max_urls: text_options.max_urls,
952            unique: true,
953        };
954        let urls = finder::find_urls(text_content, &config)
955            .into_iter()
956            .filter(|d| !from_binary_strings || is_binary_string_url_candidate(&d.url))
957            .map(|d| OutputURL {
958                url: d.url,
959                start_line: d.start_line,
960                end_line: d.end_line,
961            })
962            .collect::<Vec<_>>();
963        file_info_builder.urls(urls);
964    }
965}
966
967fn is_binary_string_email_candidate(email: &str) -> bool {
968    let Some((local, domain)) = email.rsplit_once('@') else {
969        return false;
970    };
971
972    if !has_strong_binary_local_part(local) {
973        return false;
974    }
975
976    has_strong_binary_host_shape(domain)
977}
978
979fn is_binary_string_url_candidate(url: &str) -> bool {
980    let parsed = url::Url::parse(url).ok();
981    let Some(parsed) = parsed else {
982        return false;
983    };
984    let Some(host) = parsed.host_str() else {
985        return false;
986    };
987
988    has_strong_binary_host_shape(host) && has_meaningful_binary_url_context(&parsed)
989}
990
991fn is_binary_string_author_candidate(author: &str) -> bool {
992    let trimmed = author.trim();
993    if trimmed.is_empty()
994        || !has_sufficient_alphabetic_content(trimmed)
995        || has_excessive_at_noise(trimmed)
996    {
997        return false;
998    }
999
1000    if trimmed.contains('@') {
1001        let emails = finder::find_emails(
1002            trimmed,
1003            &DetectionConfig {
1004                max_emails: 4,
1005                max_urls: 0,
1006                unique: true,
1007            },
1008        );
1009        if emails.len() > 1 {
1010            return false;
1011        }
1012
1013        if let Some(extracted) = extract_named_author_from_binary_line(trimmed) {
1014            return !extracted.is_empty();
1015        }
1016
1017        let Some(email) = emails.first().map(|d| d.email.as_str()) else {
1018            return false;
1019        };
1020        if !is_binary_string_email_candidate(email) {
1021            return false;
1022        }
1023
1024        let (name, _) = split_name_email(trimmed);
1025        return name.as_deref().is_some_and(has_binary_name_like_shape);
1026    }
1027
1028    has_binary_name_like_shape(trimmed)
1029}
1030
1031fn has_meaningful_binary_url_context(parsed: &url::Url) -> bool {
1032    if parsed.path() != "/"
1033        && parsed
1034            .path()
1035            .split('/')
1036            .any(|segment| segment.chars().any(|c| c.is_ascii_alphabetic()) && segment.len() >= 2)
1037    {
1038        return true;
1039    }
1040
1041    if parsed.query().is_some() || parsed.fragment().is_some() {
1042        return true;
1043    }
1044
1045    let Some(host) = parsed.host_str() else {
1046        return false;
1047    };
1048
1049    let labels: Vec<&str> = host.split('.').collect();
1050    if labels.len() > 2 {
1051        return labels[..labels.len() - 1].iter().any(|label| {
1052            label.len() >= 3 && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 3
1053        });
1054    }
1055
1056    if matches!(labels.first(), Some(&"www")) {
1057        return true;
1058    }
1059
1060    if labels.len() == 2 {
1061        let domain = labels[0];
1062        let tld = labels[1];
1063        if domain.len() >= 8 && matches!(tld, "org" | "edu" | "gov" | "mil" | "io" | "dev") {
1064            return true;
1065        }
1066    }
1067
1068    labels
1069        .iter()
1070        .take(labels.len().saturating_sub(1))
1071        .any(|label| {
1072            label.contains('-') && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 4
1073        })
1074}
1075
/// Returns `true` when `local` contains a run of at least three consecutive
/// ASCII letters.
fn has_strong_binary_local_part(local: &str) -> bool {
    let mut run_length = 0usize;
    for byte in local.bytes() {
        if byte.is_ascii_alphabetic() {
            run_length += 1;
            if run_length >= 3 {
                return true;
            }
        } else {
            // Any other byte, including each byte of a non-ASCII character,
            // ends the current run.
            run_length = 0;
        }
    }
    false
}
1081
/// Returns `true` when `host` looks like a real host name.
///
/// The host is split on `.` and must have at least two labels. A leading
/// `www` or `ftp` label is ignored, after which at least two labels must
/// remain. One of the labels before the last must then be at least three
/// bytes long and contain at least three ASCII letters.
fn has_strong_binary_host_shape(host: &str) -> bool {
    let labels: Vec<&str> = host.split('.').collect();

    let relevant = match labels.as_slice() {
        [] | [_] => return false,
        ["www" | "ftp", rest @ ..] => rest,
        all => all,
    };

    match relevant.split_last() {
        Some((_last, leading)) if !leading.is_empty() => leading.iter().any(|label| {
            label.len() >= 3 && label.bytes().filter(u8::is_ascii_alphabetic).count() >= 3
        }),
        _ => false,
    }
}
1102
1103fn extract_license_information(
1104    file_info_builder: &mut FileInfoBuilder,
1105    scan_errors: &mut Vec<String>,
1106    path: &Path,
1107    text_content: String,
1108    license_engine: Option<Arc<LicenseDetectionEngine>>,
1109    license_options: LicenseScanOptions,
1110    from_binary_strings: bool,
1111) -> Result<(), Error> {
1112    let Some(engine) = license_engine else {
1113        return Ok(());
1114    };
1115
1116    let detection_result = if license_options.min_score == 0 {
1117        engine.detect_with_kind_and_source(
1118            &text_content,
1119            license_options.unknown_licenses,
1120            from_binary_strings,
1121            &path.to_string_lossy(),
1122        )
1123    } else {
1124        engine.detect_with_kind_and_source_with_score(
1125            &text_content,
1126            license_options.unknown_licenses,
1127            from_binary_strings,
1128            &path.to_string_lossy(),
1129            license_options.min_score as f32,
1130        )
1131    };
1132
1133    match detection_result {
1134        Ok(detections) => {
1135            let query =
1136                Query::from_extracted_text(&text_content, engine.index(), from_binary_strings).ok();
1137            let mut model_detections = Vec::new();
1138            let mut model_clues = Vec::new();
1139
1140            for detection in &detections {
1141                let (public_detection, clue_matches) = convert_detection_to_model(
1142                    detection,
1143                    license_options,
1144                    &text_content,
1145                    query.as_ref(),
1146                );
1147
1148                if let Some(public_detection) = public_detection {
1149                    model_detections.push(public_detection);
1150                }
1151
1152                model_clues.extend(clue_matches);
1153            }
1154
1155            if !model_detections.is_empty() {
1156                let expressions: Vec<String> = model_detections
1157                    .iter()
1158                    .filter(|d| !d.license_expression_spdx.is_empty())
1159                    .map(|d| d.license_expression_spdx.clone())
1160                    .collect();
1161
1162                if !expressions.is_empty() {
1163                    let combined = crate::utils::spdx::combine_license_expressions(expressions);
1164                    if let Some(expr) = combined {
1165                        file_info_builder.license_expression(Some(expr));
1166                    }
1167                }
1168            }
1169
1170            file_info_builder.license_detections(model_detections);
1171            file_info_builder.license_clues(model_clues);
1172            file_info_builder.percentage_of_license_text(
1173                query
1174                    .as_ref()
1175                    .map(|query| compute_percentage_of_license_text(query, &detections)),
1176            );
1177        }
1178        Err(e) => {
1179            scan_errors.push(format!("License detection failed: {}", e));
1180        }
1181    }
1182
1183    Ok(())
1184}
1185
1186fn convert_detection_to_model(
1187    detection: &crate::license_detection::LicenseDetection,
1188    license_options: LicenseScanOptions,
1189    text_content: &str,
1190    query: Option<&Query<'_>>,
1191) -> (Option<LicenseDetection>, Vec<Match>) {
1192    let matches: Vec<Match> = detection
1193        .matches
1194        .iter()
1195        .map(|m| convert_match_to_model(m, license_options, text_content, query))
1196        .collect();
1197
1198    if let Some(license_expression) = detection.license_expression.clone() {
1199        (
1200            Some(LicenseDetection {
1201                license_expression,
1202                license_expression_spdx: detection
1203                    .license_expression_spdx
1204                    .clone()
1205                    .unwrap_or_default(),
1206                matches,
1207                detection_log: if license_options.include_diagnostics {
1208                    detection.detection_log.clone()
1209                } else {
1210                    Vec::new()
1211                },
1212                identifier: detection.identifier.clone(),
1213            }),
1214            Vec::new(),
1215        )
1216    } else {
1217        (None, matches)
1218    }
1219}
1220
1221fn convert_match_to_model(
1222    m: &crate::license_detection::models::LicenseMatch,
1223    license_options: LicenseScanOptions,
1224    text_content: &str,
1225    query: Option<&Query<'_>>,
1226) -> Match {
1227    let output_metric = |value: f32| ((value as f64) * 100.0).round() / 100.0;
1228    let rule_url = if m.rule_url.is_empty() {
1229        None
1230    } else {
1231        Some(m.rule_url.clone())
1232    };
1233    let matched_text = if license_options.include_text {
1234        m.matched_text.clone().or_else(|| {
1235            Some(crate::license_detection::query::matched_text_from_text(
1236                text_content,
1237                m.start_line,
1238                m.end_line,
1239            ))
1240        })
1241    } else {
1242        None
1243    };
1244    let matched_text_diagnostics = if license_options.include_text_diagnostics {
1245        query.map(|query| matched_text_diagnostics_from_match(query, m))
1246    } else {
1247        None
1248    };
1249    Match {
1250        license_expression: m.license_expression.clone(),
1251        license_expression_spdx: m.license_expression_spdx.clone().unwrap_or_default(),
1252        from_file: m.from_file.clone(),
1253        start_line: m.start_line,
1254        end_line: m.end_line,
1255        matcher: Some(m.matcher.to_string()),
1256        score: output_metric(m.score),
1257        matched_length: Some(m.matched_length),
1258        match_coverage: Some(output_metric(m.coverage())),
1259        rule_relevance: Some(m.rule_relevance as usize),
1260        rule_identifier: Some(m.rule_identifier.clone()),
1261        rule_url,
1262        matched_text,
1263        referenced_filenames: m.referenced_filenames.clone(),
1264        matched_text_diagnostics,
1265    }
1266}
1267
1268fn compute_percentage_of_license_text(
1269    query: &Query<'_>,
1270    detections: &[crate::license_detection::LicenseDetection],
1271) -> f64 {
1272    let matched_positions: std::collections::HashSet<usize> = detections
1273        .iter()
1274        .flat_map(|detection| detection.matches.iter())
1275        .flat_map(|m| m.query_span().iter())
1276        .collect();
1277
1278    let query_tokens_length = query.tokens.len() + query.unknowns_by_pos.values().sum::<usize>();
1279    if query_tokens_length == 0 {
1280        return 0.0;
1281    }
1282
1283    let percentage = (matched_positions.len() as f64 / query_tokens_length as f64) * 100.0;
1284    (percentage * 100.0).round() / 100.0
1285}
1286
1287fn matched_text_diagnostics_from_match(
1288    query: &Query<'_>,
1289    license_match: &InternalLicenseMatch,
1290) -> String {
1291    let matched_positions: PositionSet = license_match.query_span().iter().collect();
1292    let Some(start_pos) = matched_positions.iter().min() else {
1293        return crate::license_detection::query::matched_text_from_text(
1294            &query.text,
1295            license_match.start_line,
1296            license_match.end_line,
1297        );
1298    };
1299    let Some(end_pos) = matched_positions.iter().max() else {
1300        return crate::license_detection::query::matched_text_from_text(
1301            &query.text,
1302            license_match.start_line,
1303            license_match.end_line,
1304        );
1305    };
1306
1307    crate::license_detection::query::matched_text_diagnostics_from_text(
1308        &query.text,
1309        query,
1310        &matched_positions,
1311        start_pos,
1312        end_pos,
1313        license_match.start_line,
1314        license_match.end_line,
1315    )
1316}
1317
1318fn should_skip_text_detection(path: &Path, buffer: &[u8]) -> bool {
1319    is_pem_certificate_file(path, buffer)
1320}
1321
/// Reports whether `path` is a Go source file that is not production code.
///
/// Returns `Ok(false)` for anything without a `.go` extension and `Ok(true)`
/// for `*_test.go` files, both without touching the file system. Otherwise
/// the first ten lines of the file are inspected for a `//go:build` or
/// `// +build` constraint that has `test` as a whitespace-separated token.
///
/// Only the inspected header lines are read and decoded, so a large file is
/// not loaded in full and invalid UTF-8 further down does not cause an error.
///
/// # Errors
///
/// Returns the underlying I/O error when the file cannot be opened or one of
/// the inspected lines cannot be read as UTF-8.
fn is_go_non_production_source(path: &Path) -> std::io::Result<bool> {
    const HEADER_LINES: usize = 10;

    if path.extension().and_then(|ext| ext.to_str()) != Some("go") {
        return Ok(false);
    }

    let is_test_file = path
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| name.ends_with("_test.go"));
    if is_test_file {
        return Ok(true);
    }

    let reader = std::io::BufReader::new(fs::File::open(path)?);
    for line in std::io::BufRead::lines(reader).take(HEADER_LINES) {
        let line = line?;
        let trimmed = line.trim();
        let is_build_constraint =
            trimmed.starts_with("//go:build") || trimmed.starts_with("// +build");
        if is_build_constraint && trimmed.split_whitespace().any(|token| token == "test") {
            return Ok(true);
        }
    }

    Ok(false)
}
1342
1343fn is_pem_certificate_file(_path: &Path, buffer: &[u8]) -> bool {
1344    let prefix_len = buffer.len().min(8192);
1345    let prefix = String::from_utf8_lossy(&buffer[..prefix_len]);
1346    let trimmed_lines: Vec<&str> = prefix
1347        .lines()
1348        .map(str::trim)
1349        .filter(|line| !line.is_empty())
1350        .take(64)
1351        .collect();
1352
1353    let Some(first_line) = trimmed_lines.first().copied() else {
1354        return false;
1355    };
1356
1357    PEM_CERTIFICATE_HEADERS
1358        .iter()
1359        .any(|(begin, end)| first_line == *begin && trimmed_lines.iter().any(|line| line == end))
1360}
1361
1362fn process_directory(
1363    path: &Path,
1364    _metadata: &fs::Metadata,
1365    collect_info: bool,
1366    license_enabled: bool,
1367) -> FileInfo {
1368    let name = path
1369        .file_name()
1370        .unwrap_or_default()
1371        .to_string_lossy()
1372        .to_string();
1373    let base_name = name.clone(); // For directories, base_name is the same as name
1374
1375    FileInfo {
1376        name,
1377        base_name,
1378        extension: "".to_string(),
1379        path: path.to_string_lossy().to_string(),
1380        file_type: FileType::Directory,
1381        mime_type: None,
1382        file_type_label: None,
1383        size: 0,
1384        date: None,
1385        sha1: None,
1386        md5: None,
1387        sha256: None,
1388        sha1_git: None,
1389        programming_language: None,
1390        package_data: Vec::new(),
1391        license_expression: None,
1392        license_detections: Vec::new(),
1393        license_clues: Vec::new(),
1394        percentage_of_license_text: license_enabled.then_some(0.0),
1395        copyrights: Vec::new(),
1396        holders: Vec::new(),
1397        authors: Vec::new(),
1398        emails: Vec::new(),
1399        urls: Vec::new(),
1400        for_packages: Vec::new(),
1401        scan_errors: Vec::new(),
1402        license_policy: None,
1403        is_binary: collect_info.then_some(false),
1404        is_text: collect_info.then_some(false),
1405        is_archive: collect_info.then_some(false),
1406        is_media: collect_info.then_some(false),
1407        is_source: collect_info.then_some(false),
1408        is_script: collect_info.then_some(false),
1409        files_count: collect_info.then_some(0),
1410        dirs_count: collect_info.then_some(0),
1411        size_count: collect_info.then_some(0),
1412        source_count: None,
1413        is_legal: false,
1414        is_manifest: false,
1415        is_readme: false,
1416        is_top_level: false,
1417        is_key_file: false,
1418        is_community: false,
1419        is_generated: None,
1420        facets: vec![],
1421        tallies: None,
1422    }
1423}
1424
1425#[cfg(test)]
1426mod tests {
1427    use super::{
1428        compute_percentage_of_license_text, convert_detection_to_model,
1429        extract_email_url_information, extract_named_author_from_binary_line,
1430        is_binary_string_author_candidate, is_binary_string_copyright_candidate,
1431        is_binary_string_email_candidate, is_binary_string_url_candidate,
1432        is_go_non_production_source, process_file,
1433    };
1434    use crate::license_detection::LicenseDetection as InternalLicenseDetection;
1435    use crate::license_detection::index::LicenseIndex;
1436    use crate::license_detection::index::dictionary::TokenDictionary;
1437    use crate::license_detection::models::position_span::PositionSpan;
1438    use crate::license_detection::models::{LicenseMatch, MatchCoordinates, MatcherKind, RuleKind};
1439    use crate::license_detection::query::Query;
1440    use crate::models::{FileInfoBuilder, FileType};
1441    use crate::progress::{ProgressMode, ScanProgress};
1442    use crate::scanner::scan_options_fingerprint;
1443    use crate::scanner::{LicenseScanOptions, TextDetectionOptions};
1444    use std::fs;
1445    use std::time::{Duration, Instant};
1446    use tempfile::tempdir;
1447
1448    use super::maybe_record_processing_timeout;
1449
1450    fn make_internal_match(rule_url: &str) -> LicenseMatch {
1451        LicenseMatch {
1452            rid: 0,
1453            license_expression: "mit".to_string(),
1454            license_expression_spdx: Some("MIT".to_string()),
1455            from_file: None,
1456            start_line: 1,
1457            end_line: 1,
1458            start_token: 0,
1459            end_token: 1,
1460            matcher: MatcherKind::Hash,
1461            score: 1.0,
1462            matched_length: 3,
1463            rule_length: 3,
1464            match_coverage: 100.0,
1465            rule_relevance: 100,
1466            rule_identifier: "mit.LICENSE".to_string(),
1467            rule_url: rule_url.to_string(),
1468            matched_text: Some("MIT".to_string()),
1469            referenced_filenames: None,
1470            rule_kind: RuleKind::Text,
1471            is_from_license: true,
1472            rule_start_token: 0,
1473            coordinates: MatchCoordinates::query_region(PositionSpan::empty()),
1474            candidate_resemblance: 0.0,
1475            candidate_containment: 0.0,
1476        }
1477    }
1478
1479    fn make_detection(rule_url: &str) -> InternalLicenseDetection {
1480        InternalLicenseDetection {
1481            license_expression: Some("mit".to_string()),
1482            license_expression_spdx: Some("MIT".to_string()),
1483            matches: vec![make_internal_match(rule_url)],
1484            detection_log: vec![],
1485            identifier: Some("mit-test".to_string()),
1486            file_regions: Vec::new(),
1487        }
1488    }
1489
1490    fn create_test_index(entries: &[(&str, u16)], len_legalese: usize) -> LicenseIndex {
1491        let dictionary = TokenDictionary::new_with_legalese(entries);
1492        let mut index = LicenseIndex::new(dictionary);
1493        index.len_legalese = len_legalese;
1494        index
1495    }
1496
1497    #[test]
1498    fn test_convert_detection_to_model_preserves_rule_url() {
1499        let detection = make_detection(
1500            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE",
1501        );
1502
1503        let (converted, clues) =
1504            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
1505        let converted = converted.expect("detection should convert");
1506
1507        assert_eq!(
1508            converted.matches[0].rule_url.as_deref(),
1509            Some(
1510                "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE"
1511            )
1512        );
1513        assert!(clues.is_empty());
1514    }
1515
1516    #[test]
1517    fn test_convert_detection_to_model_emits_null_for_empty_rule_url() {
1518        let detection = make_detection("");
1519
1520        let (converted, clues) =
1521            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
1522        let converted = converted.expect("detection should convert");
1523
1524        assert_eq!(converted.matches[0].rule_url, None);
1525        assert!(clues.is_empty());
1526    }
1527
1528    #[test]
1529    fn test_convert_detection_to_model_rounds_match_coverage() {
1530        let mut detection = make_detection("");
1531        detection.matches[0].score = 81.82;
1532        detection.matches[0].match_coverage = 33.334;
1533
1534        let (converted, clues) =
1535            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
1536        let converted = converted.expect("detection should convert");
1537
1538        assert_eq!(converted.matches[0].score, 81.82);
1539        assert_eq!(converted.matches[0].match_coverage, Some(33.33));
1540        assert!(clues.is_empty());
1541    }
1542
1543    #[test]
1544    fn test_convert_detection_to_model_routes_expressionless_detection_to_license_clues() {
1545        let mut detection = make_detection(
1546            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/license-clue_1.RULE",
1547        );
1548        detection.license_expression = None;
1549        detection.license_expression_spdx = None;
1550        detection.identifier = None;
1551        detection.matches[0].license_expression = "unknown-license-reference".to_string();
1552        detection.matches[0].license_expression_spdx =
1553            Some("LicenseRef-scancode-unknown-license-reference".to_string());
1554        detection.matches[0].rule_identifier = "license-clue_1.RULE".to_string();
1555        detection.matches[0].rule_kind = RuleKind::Clue;
1556
1557        let (converted, clues) = convert_detection_to_model(
1558            &detection,
1559            LicenseScanOptions {
1560                include_text: true,
1561                min_score: 0,
1562                ..LicenseScanOptions::default()
1563            },
1564            "clue text",
1565            None,
1566        );
1567
1568        assert!(converted.is_none());
1569        assert_eq!(clues.len(), 1);
1570        assert_eq!(clues[0].license_expression, "unknown-license-reference");
1571        assert_eq!(
1572            clues[0].license_expression_spdx,
1573            "LicenseRef-scancode-unknown-license-reference"
1574        );
1575        assert_eq!(
1576            clues[0].rule_identifier.as_deref(),
1577            Some("license-clue_1.RULE")
1578        );
1579        assert_eq!(clues[0].matched_text.as_deref(), Some("MIT"));
1580        assert_eq!(clues[0].matched_text_diagnostics, None);
1581    }
1582
1583    #[test]
1584    fn test_process_file_suppresses_non_actionable_pdf_extraction_failure() {
1585        let dir = tempdir().expect("tempdir");
1586        let path = dir.path().join("broken.pdf");
1587        fs::write(&path, b"%PDF-1.7\nthis is not a valid pdf object graph\n")
1588            .expect("write malformed pdf");
1589        let metadata = fs::metadata(&path).expect("metadata");
1590        let progress = ScanProgress::new(ProgressMode::Quiet);
1591
1592        let file_info = process_file(
1593            &path,
1594            &metadata,
1595            &progress,
1596            None,
1597            LicenseScanOptions::default(),
1598            &TextDetectionOptions::default(),
1599        );
1600
1601        assert!(file_info.scan_errors.is_empty());
1602    }
1603
1604    #[test]
1605    fn test_processing_timeout_is_not_duplicated_after_stage_specific_timeout() {
1606        let started = Instant::now() - Duration::from_secs(2);
1607        let mut scan_errors = vec!["Timeout before license scan (> 1.00s)".to_string()];
1608
1609        maybe_record_processing_timeout(&mut scan_errors, started, 1.0);
1610
1611        assert_eq!(scan_errors, vec!["Timeout before license scan (> 1.00s)"]);
1612    }
1613
1614    #[test]
1615    fn test_processing_timeout_is_recorded_when_no_timeout_error_exists() {
1616        let started = Instant::now() - Duration::from_secs(2);
1617        let mut scan_errors = Vec::new();
1618
1619        maybe_record_processing_timeout(&mut scan_errors, started, 1.0);
1620
1621        assert_eq!(
1622            scan_errors,
1623            vec!["Processing interrupted due to timeout after 1.00 seconds"]
1624        );
1625    }
1626
1627    #[test]
1628    fn test_convert_detection_to_model_includes_diagnostics_when_enabled() {
1629        let text = concat!(
1630            "Reproduction and distribution of this file, with or without modification, are\n",
1631            "permitted in any medium without royalties provided the copyright notice\n",
1632            "and this notice are preserved. This file is offered as-is, without any warranties.\n",
1633        );
1634        let index = create_test_index(
1635            &[
1636                ("reproduction", 0),
1637                ("distribution", 1),
1638                ("file", 2),
1639                ("without", 3),
1640                ("modification", 4),
1641                ("permitted", 5),
1642                ("medium", 6),
1643                ("royalties", 7),
1644                ("provided", 8),
1645                ("copyright", 9),
1646                ("notice", 10),
1647                ("preserved", 11),
1648                ("offered", 12),
1649                ("warranties", 13),
1650            ],
1651            14,
1652        );
1653        let query = Query::from_extracted_text(text, &index, false).expect("query should build");
1654        let mut detection = make_detection(
1655            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/fsf-ap.LICENSE",
1656        );
1657        detection.detection_log = vec!["imperfect-match-coverage".to_string()];
1658        detection.matches[0].license_expression = "fsf-ap".to_string();
1659        detection.matches[0].license_expression_spdx = Some("FSFAP".to_string());
1660        detection.matches[0].rule_identifier = "fsf-ap.LICENSE".to_string();
1661        detection.matches[0].matched_text = None;
1662        detection.matches[0].start_line = 1;
1663        detection.matches[0].end_line = 3;
1664        detection.matches[0].start_token = 0;
1665        detection.matches[0].end_token = query.tokens.len();
1666        detection.matches[0].coordinates =
1667            MatchCoordinates::query_region(PositionSpan::from_positions(
1668                query
1669                    .tokens
1670                    .iter()
1671                    .enumerate()
1672                    .filter_map(|(idx, _)| (idx != 9).then_some(idx))
1673                    .collect::<Vec<_>>(),
1674            ));
1675        detection.identifier = Some("fsf_ap-test".to_string());
1676
1677        let (converted, clues) = convert_detection_to_model(
1678            &detection,
1679            LicenseScanOptions {
1680                include_text: true,
1681                include_text_diagnostics: true,
1682                include_diagnostics: true,
1683                unknown_licenses: false,
1684                min_score: 0,
1685            },
1686            text,
1687            Some(&query),
1688        );
1689        let converted = converted.expect("detection should convert");
1690
1691        assert!(clues.is_empty());
1692        assert_eq!(converted.detection_log, vec!["imperfect-match-coverage"]);
1693        assert_eq!(
1694            converted.matches[0].matched_text.as_deref(),
1695            Some(text.trim_end())
1696        );
1697        let diagnostics = converted.matches[0]
1698            .matched_text_diagnostics
1699            .as_deref()
1700            .expect("diagnostics should be present");
1701        assert!(diagnostics.contains('['));
1702        assert!(diagnostics.contains(']'));
1703        assert_ne!(diagnostics, text.trim_end());
1704    }
1705
1706    #[test]
1707    fn test_extract_email_url_information_skips_binary_string_text() {
1708        let mut builder = FileInfoBuilder::default();
1709        let options = TextDetectionOptions {
1710            collect_info: false,
1711            detect_packages: false,
1712            detect_application_packages: false,
1713            detect_system_packages: false,
1714            detect_packages_in_compiled: false,
1715            detect_copyrights: false,
1716            detect_generated: false,
1717            detect_emails: true,
1718            detect_urls: true,
1719            max_emails: 50,
1720            max_urls: 50,
1721            timeout_seconds: 120.0,
1722        };
1723
1724        extract_email_url_information(
1725            &mut builder,
1726            "contact 6h@fo.lwft and visit http://gmail.com/",
1727            &options,
1728            true,
1729        );
1730
1731        let file = builder
1732            .name("binary.bin".to_string())
1733            .base_name("binary".to_string())
1734            .extension(".bin".to_string())
1735            .path("binary.bin".to_string())
1736            .file_type(FileType::File)
1737            .size(1)
1738            .build()
1739            .expect("builder should produce file info");
1740
1741        assert!(file.emails.is_empty(), "emails: {:?}", file.emails);
1742        assert!(file.urls.is_empty(), "urls: {:?}", file.urls);
1743    }
1744
1745    #[test]
1746    fn test_extract_email_url_information_keeps_good_binary_contacts() {
1747        let mut builder = FileInfoBuilder::default();
1748        let options = TextDetectionOptions {
1749            collect_info: false,
1750            detect_packages: false,
1751            detect_application_packages: false,
1752            detect_system_packages: false,
1753            detect_packages_in_compiled: false,
1754            detect_copyrights: false,
1755            detect_generated: false,
1756            detect_emails: true,
1757            detect_urls: true,
1758            max_emails: 50,
1759            max_urls: 50,
1760            timeout_seconds: 120.0,
1761        };
1762
1763        extract_email_url_information(
1764            &mut builder,
1765            "report bugs to bug-coreutils@gnu.org and see https://www.gnu.org/software/coreutils/",
1766            &options,
1767            true,
1768        );
1769
1770        let file = builder
1771            .name("binary.bin".to_string())
1772            .base_name("binary".to_string())
1773            .extension(".bin".to_string())
1774            .path("binary.bin".to_string())
1775            .file_type(FileType::File)
1776            .size(1)
1777            .build()
1778            .expect("builder should produce file info");
1779
1780        assert_eq!(file.emails.len(), 1, "emails: {:?}", file.emails);
1781        assert_eq!(file.emails[0].email, "bug-coreutils@gnu.org");
1782        assert_eq!(file.urls.len(), 1, "urls: {:?}", file.urls);
1783        assert_eq!(file.urls[0].url, "https://www.gnu.org/software/coreutils/");
1784    }
1785
1786    #[test]
1787    fn test_extract_email_url_information_deduplicates_binary_emails_before_cap() {
1788        let mut builder = FileInfoBuilder::default();
1789        let options = TextDetectionOptions {
1790            collect_info: false,
1791            detect_packages: false,
1792            detect_application_packages: false,
1793            detect_system_packages: false,
1794            detect_packages_in_compiled: false,
1795            detect_copyrights: false,
1796            detect_generated: false,
1797            detect_emails: true,
1798            detect_urls: false,
1799            max_emails: 2,
1800            max_urls: 50,
1801            timeout_seconds: 120.0,
1802        };
1803
1804        extract_email_url_information(
1805            &mut builder,
1806            "first jakub@redhat.com second jakub@redhat.com third contyk@redhat.com",
1807            &options,
1808            true,
1809        );
1810
1811        let file = builder
1812            .name("binary.bin".to_string())
1813            .base_name("binary".to_string())
1814            .extension(".bin".to_string())
1815            .path("binary.bin".to_string())
1816            .file_type(FileType::File)
1817            .size(1)
1818            .build()
1819            .expect("builder should produce file info");
1820
1821        assert_eq!(file.emails.len(), 2, "emails: {:?}", file.emails);
1822        assert_eq!(file.emails[0].email, "jakub@redhat.com");
1823        assert_eq!(file.emails[1].email, "contyk@redhat.com");
1824    }
1825
1826    #[test]
1827    fn test_binary_string_copyright_candidate_rejects_gibberish_holder_text() {
1828        let gibberish = "(c) S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9) M0@9s J'@y DH@9Ih@y";
1829        assert!(!is_binary_string_copyright_candidate(gibberish));
1830    }
1831
1832    #[test]
1833    fn test_binary_string_copyright_candidate_keeps_real_notice() {
1834        let notice = "Copyright nexB and others (c) 2012";
1835        assert!(is_binary_string_copyright_candidate(notice));
1836    }
1837
1838    #[test]
1839    fn test_binary_string_copyright_candidate_rejects_changelog_phrase() {
1840        assert!(!is_binary_string_copyright_candidate(
1841            "Copyright - split out libs"
1842        ));
1843    }
1844
1845    #[test]
1846    fn test_binary_string_email_candidate_rejects_gibberish() {
1847        assert!(!is_binary_string_email_candidate("6h@fo.lwft"));
1848    }
1849
1850    #[test]
1851    fn test_binary_string_email_candidate_keeps_gnu_bug_address() {
1852        assert!(is_binary_string_email_candidate("bug-coreutils@gnu.org"));
1853    }
1854
1855    #[test]
1856    fn test_binary_string_url_candidate_rejects_short_fake_host() {
1857        assert!(!is_binary_string_url_candidate("http://ftp.so/"));
1858    }
1859
1860    #[test]
1861    fn test_binary_string_url_candidate_keeps_gnu_help_url() {
1862        assert!(is_binary_string_url_candidate(
1863            "https://www.gnu.org/software/coreutils/"
1864        ));
1865    }
1866
1867    #[test]
1868    fn test_binary_string_url_candidate_rejects_bare_root_domain() {
1869        assert!(!is_binary_string_url_candidate("http://gmail.com/"));
1870    }
1871
1872    #[test]
1873    fn test_binary_string_url_candidate_keeps_project_subdomain_root() {
1874        assert!(is_binary_string_url_candidate("http://gcc.gnu.org"));
1875    }
1876
1877    #[test]
1878    fn test_binary_string_url_candidate_keeps_long_org_root_domain() {
1879        assert!(is_binary_string_url_candidate("https://publicsuffix.org/"));
1880    }
1881
1882    #[test]
1883    fn test_binary_string_url_candidate_keeps_short_project_path() {
1884        assert!(is_binary_string_url_candidate("http://tukaani.org/xz/"));
1885    }
1886
1887    #[test]
1888    fn test_binary_string_author_candidate_keeps_named_author_with_email() {
1889        assert!(is_binary_string_author_candidate(
1890            "Andreas Schneider <asn@redhat.com>"
1891        ));
1892    }
1893
1894    #[test]
1895    fn test_binary_string_author_candidate_rejects_gibberish() {
1896        assert!(!is_binary_string_author_candidate(
1897            "S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9"
1898        ));
1899    }
1900
1901    #[test]
1902    fn test_binary_string_author_candidate_rejects_changelog_phrase() {
1903        assert!(!is_binary_string_author_candidate(
1904            "Developers can enable them. - revert news user back to"
1905        ));
1906    }
1907
1908    #[test]
1909    fn test_extract_named_author_from_binary_line_recovers_by_prefix() {
1910        assert_eq!(
1911            extract_named_author_from_binary_line("Patch by Andreas Schneider <asn@redhat.com>"),
1912            Some("Andreas Schneider <asn@redhat.com>".to_string())
1913        );
1914    }
1915
1916    #[test]
1917    fn test_extract_named_author_from_binary_line_recovers_parenthesized_email() {
1918        assert_eq!(
1919            extract_named_author_from_binary_line(
1920                "same for both OpenSSL and NSS by Rob Crittenden (rcritten@redhat.com)"
1921            ),
1922            Some("Rob Crittenden (rcritten@redhat.com)".to_string())
1923        );
1924    }
1925
1926    #[test]
1927    fn test_extract_named_author_from_binary_line_rejects_plain_changelog_packager_line() {
1928        assert_eq!(
1929            extract_named_author_from_binary_line(
1930                "Rob Crittenden <rcritten@redhat.com> - 3.11.7-9"
1931            ),
1932            None
1933        );
1934    }
1935
1936    #[test]
1937    fn test_extract_named_author_from_binary_line_keeps_email_only_review_author() {
1938        assert_eq!(
1939            extract_named_author_from_binary_line(
1940                "Changes as per initial review by panemade@gmail.com"
1941            ),
1942            Some("panemade@gmail.com".to_string())
1943        );
1944    }
1945
1946    #[test]
1947    fn test_binary_string_author_candidate_rejects_multiple_emails_on_one_line() {
1948        assert!(!is_binary_string_author_candidate(
1949            "Rob Crittenden (rcritten@redhat.com) jakub@redhat.com"
1950        ));
1951    }
1952
1953    #[test]
1954    fn test_compute_percentage_of_license_text_counts_unknown_tokens() {
1955        let index = create_test_index(&[("alpha", 0), ("mit", 1)], 2);
1956        let text = "alpha MIT omega";
1957        let query = Query::from_extracted_text(text, &index, false).expect("query should build");
1958        let mut detection = make_detection("");
1959        detection.matches[0].coordinates =
1960            MatchCoordinates::query_region(PositionSpan::from_positions(vec![1]));
1961        detection.matches[0].start_token = 1;
1962        detection.matches[0].end_token = 2;
1963
1964        let percentage = compute_percentage_of_license_text(&query, &[detection]);
1965
1966        assert_eq!(percentage, 33.33);
1967    }
1968
1969    #[test]
1970    fn test_scan_options_fingerprint_changes_with_license_score() {
1971        let text_options = crate::scanner::TextDetectionOptions::default();
1972        let default_fingerprint = scan_options_fingerprint(
1973            &text_options,
1974            LicenseScanOptions {
1975                min_score: 0,
1976                ..LicenseScanOptions::default()
1977            },
1978            None,
1979        );
1980        let filtered_fingerprint = scan_options_fingerprint(
1981            &text_options,
1982            LicenseScanOptions {
1983                min_score: 70,
1984                ..LicenseScanOptions::default()
1985            },
1986            None,
1987        );
1988
1989        assert_ne!(default_fingerprint, filtered_fingerprint);
1990    }
1991
1992    #[test]
1993    fn test_is_go_non_production_source_for_test_filename() {
1994        let temp_dir = tempdir().unwrap();
1995        let path = temp_dir.path().join("scanner_test.go");
1996        fs::write(&path, "package scanner\n").unwrap();
1997
1998        assert!(is_go_non_production_source(&path).unwrap());
1999    }
2000
2001    #[test]
2002    fn test_is_go_non_production_source_for_build_tag() {
2003        let temp_dir = tempdir().unwrap();
2004        let path = temp_dir.path().join("scanner.go");
2005        fs::write(&path, "//go:build test\n\npackage scanner\n").unwrap();
2006
2007        assert!(is_go_non_production_source(&path).unwrap());
2008    }
2009
2010    #[test]
2011    fn test_is_go_non_production_source_for_regular_go_file() {
2012        let temp_dir = tempdir().unwrap();
2013        let path = temp_dir.path().join("scanner.go");
2014        fs::write(&path, "package scanner\n").unwrap();
2015
2016        assert!(!is_go_non_production_source(&path).unwrap());
2017    }
2018}
provenant/scanner/process.rs

provenant/scanner/
process.rs