provenant/scanner/
process.rs

1use crate::license_detection::LicenseDetectionEngine;
2use crate::parsers::compiled_binary::{
3    is_supported_compiled_binary_format, try_parse_compiled_bytes,
4};
5use crate::parsers::try_parse_file;
6use crate::parsers::windows_executable::try_parse_windows_executable_bytes;
7use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha1_git, calculate_sha256};
8use crate::utils::text::{
9    remove_verbatim_escape_sequences, should_remove_verbatim_escape_sequences,
10};
11use anyhow::Error;
12use rayon::prelude::*;
13use std::collections::HashSet;
14use std::fs::{self, File};
15use std::io::{Read, Write};
16use std::path::Path;
17use std::sync::Arc;
18use std::time::{Duration, Instant};
19
20use crate::copyright::{
21    self, AuthorDetection, CopyrightDetection, CopyrightDetectionOptions, HolderDetection,
22};
23use crate::finder::{self, DetectionConfig};
24use crate::license_detection::PositionSet;
25use crate::license_detection::models::LicenseMatch as InternalLicenseMatch;
26use crate::license_detection::query::Query;
27use crate::models::{
28    Author, Copyright, DatasourceId, FileInfo, FileInfoBuilder, FileType, Holder, LicenseDetection,
29    LineNumber, Match, OutputEmail, OutputURL, Sha256Digest,
30};
31use crate::parsers::utils::split_name_email;
32use crate::progress::ScanProgress;
33use crate::scanner::collect::CollectedPaths;
34use crate::scanner::{LicenseScanOptions, ProcessResult, TextDetectionOptions};
35use crate::utils::file::{
36    ExtractedTextKind, augment_license_detection_text, classify_file_info,
37    extract_text_for_detection_with_diagnostics, get_creation_date,
38};
39use crate::utils::generated::generated_code_hints_from_bytes;
40use tempfile::TempDir;
41
/// Begin/end marker lines that delimit PEM-encoded certificate blocks.
///
/// Each entry is a `(begin_marker, end_marker)` pair; the plain and the "trusted"
/// certificate variants are listed.
const PEM_CERTIFICATE_HEADERS: &[(&str, &str)] = &[
    ("-----BEGIN CERTIFICATE-----", "-----END CERTIFICATE-----"),
    (
        "-----BEGIN TRUSTED CERTIFICATE-----",
        "-----END TRUSTED CERTIFICATE-----",
    ),
];
49
50pub fn process_collected(
51    collected: &CollectedPaths,
52    progress: Arc<ScanProgress>,
53    license_engine: Option<Arc<LicenseDetectionEngine>>,
54    license_options: LicenseScanOptions,
55    text_options: &TextDetectionOptions,
56) -> ProcessResult {
57    let mut all_files: Vec<FileInfo> = collected
58        .files
59        .par_iter()
60        .map(|(path, metadata)| {
61            let file_entry = process_file(
62                path,
63                metadata,
64                progress.as_ref(),
65                license_engine.clone(),
66                license_options,
67                text_options,
68            );
69            progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
70            file_entry
71        })
72        .collect();
73
74    for (path, metadata) in &collected.directories {
75        all_files.push(process_directory(
76            path,
77            metadata,
78            text_options.collect_info,
79            license_engine.is_some(),
80        ));
81    }
82
83    ProcessResult {
84        files: all_files,
85        excluded_count: collected.excluded_count,
86    }
87}
88
89pub fn process_collected_sequential(
90    collected: &CollectedPaths,
91    progress: Arc<ScanProgress>,
92    license_engine: Option<Arc<LicenseDetectionEngine>>,
93    license_options: LicenseScanOptions,
94    text_options: &TextDetectionOptions,
95) -> ProcessResult {
96    let mut all_files: Vec<FileInfo> =
97        Vec::with_capacity(collected.files.len() + collected.directories.len());
98
99    for (path, metadata) in &collected.files {
100        let file_entry = process_file(
101            path,
102            metadata,
103            progress.as_ref(),
104            license_engine.clone(),
105            license_options,
106            text_options,
107        );
108        progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
109        all_files.push(file_entry);
110    }
111
112    for (path, metadata) in &collected.directories {
113        all_files.push(process_directory(
114            path,
115            metadata,
116            text_options.collect_info,
117            license_engine.is_some(),
118        ));
119    }
120
121    ProcessResult {
122        files: all_files,
123        excluded_count: collected.excluded_count,
124    }
125}
126
127pub fn process_collected_with_memory_limit(
128    collected: &CollectedPaths,
129    progress: Arc<ScanProgress>,
130    license_engine: Option<Arc<LicenseDetectionEngine>>,
131    license_options: LicenseScanOptions,
132    text_options: &TextDetectionOptions,
133    max_in_memory: i64,
134) -> ProcessResult {
135    if max_in_memory == 0 {
136        return process_collected(
137            collected,
138            progress,
139            license_engine,
140            license_options,
141            text_options,
142        );
143    }
144
145    let memory_limit = if max_in_memory < 0 {
146        0
147    } else {
148        max_in_memory as usize
149    };
150    let chunk_size = if max_in_memory < 0 {
151        256
152    } else {
153        memory_limit.max(1)
154    };
155
156    let mut retained_files = Vec::new();
157    let mut spill_store = None;
158
159    for chunk in collected.files.chunks(chunk_size) {
160        let processed_chunk: Vec<FileInfo> = chunk
161            .par_iter()
162            .map(|(path, metadata)| {
163                let file_entry = process_file(
164                    path,
165                    metadata,
166                    progress.as_ref(),
167                    license_engine.clone(),
168                    license_options,
169                    text_options,
170                );
171                progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
172                file_entry
173            })
174            .collect();
175
176        retain_or_spill_chunk(
177            processed_chunk,
178            &mut retained_files,
179            &mut spill_store,
180            memory_limit,
181        );
182    }
183
184    for (path, metadata) in &collected.directories {
185        let entry = process_directory(
186            path,
187            metadata,
188            text_options.collect_info,
189            license_engine.is_some(),
190        );
191        retain_or_spill_chunk(
192            vec![entry],
193            &mut retained_files,
194            &mut spill_store,
195            memory_limit,
196        );
197    }
198
199    if let Some(spill_store) = spill_store {
200        retained_files.extend(spill_store.load_all());
201    }
202
203    ProcessResult {
204        files: retained_files,
205        excluded_count: collected.excluded_count,
206    }
207}
208
209pub fn process_collected_with_memory_limit_sequential(
210    collected: &CollectedPaths,
211    progress: Arc<ScanProgress>,
212    license_engine: Option<Arc<LicenseDetectionEngine>>,
213    license_options: LicenseScanOptions,
214    text_options: &TextDetectionOptions,
215    max_in_memory: i64,
216) -> ProcessResult {
217    if max_in_memory == 0 {
218        return process_collected_sequential(
219            collected,
220            progress,
221            license_engine,
222            license_options,
223            text_options,
224        );
225    }
226
227    let memory_limit = if max_in_memory < 0 {
228        0
229    } else {
230        max_in_memory as usize
231    };
232    let chunk_size = if max_in_memory < 0 {
233        256
234    } else {
235        memory_limit.max(1)
236    };
237
238    let mut retained_files = Vec::new();
239    let mut spill_store = None;
240
241    for chunk in collected.files.chunks(chunk_size) {
242        let mut processed_chunk: Vec<FileInfo> = Vec::with_capacity(chunk.len());
243        for (path, metadata) in chunk {
244            let file_entry = process_file(
245                path,
246                metadata,
247                progress.as_ref(),
248                license_engine.clone(),
249                license_options,
250                text_options,
251            );
252            progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
253            processed_chunk.push(file_entry);
254        }
255
256        retain_or_spill_chunk(
257            processed_chunk,
258            &mut retained_files,
259            &mut spill_store,
260            memory_limit,
261        );
262    }
263
264    for (path, metadata) in &collected.directories {
265        let entry = process_directory(
266            path,
267            metadata,
268            text_options.collect_info,
269            license_engine.is_some(),
270        );
271        retain_or_spill_chunk(
272            vec![entry],
273            &mut retained_files,
274            &mut spill_store,
275            memory_limit,
276        );
277    }
278
279    if let Some(spill_store) = spill_store {
280        retained_files.extend(spill_store.load_all());
281    }
282
283    ProcessResult {
284        files: retained_files,
285        excluded_count: collected.excluded_count,
286    }
287}
288
289fn retain_or_spill_chunk(
290    chunk: Vec<FileInfo>,
291    retained_files: &mut Vec<FileInfo>,
292    spill_store: &mut Option<FileInfoSpillStore>,
293    memory_limit: usize,
294) {
295    if memory_limit == 0 {
296        spill_store
297            .get_or_insert_with(FileInfoSpillStore::new)
298            .spill(chunk);
299        return;
300    }
301
302    let remaining_capacity = memory_limit.saturating_sub(retained_files.len());
303    if remaining_capacity >= chunk.len() && spill_store.is_none() {
304        retained_files.extend(chunk);
305        return;
306    }
307
308    let mut chunk_iter = chunk.into_iter();
309    retained_files.extend(chunk_iter.by_ref().take(remaining_capacity));
310    let overflow: Vec<FileInfo> = chunk_iter.collect();
311    if !overflow.is_empty() {
312        spill_store
313            .get_or_insert_with(FileInfoSpillStore::new)
314            .spill(overflow);
315    }
316}
317
318struct FileInfoSpillStore {
319    temp_dir: TempDir,
320    batch_index: usize,
321}
322
323impl FileInfoSpillStore {
324    fn new() -> Self {
325        Self {
326            temp_dir: TempDir::new().expect("create spill dir"),
327            batch_index: 0,
328        }
329    }
330
331    fn spill(&mut self, files: Vec<FileInfo>) {
332        let path = self
333            .temp_dir
334            .path()
335            .join(format!("batch-{:06}.json.zst", self.batch_index));
336        self.batch_index += 1;
337
338        let payload = serde_json::to_vec(&files).expect("encode spilled file batch");
339        let file = File::create(path).expect("create spill batch file");
340        let mut encoder = zstd::Encoder::new(file, 3).expect("create spill encoder");
341        encoder
342            .write_all(&payload)
343            .expect("write spilled file batch");
344        encoder.finish().expect("finish spill encoder");
345    }
346
347    fn load_all(self) -> Vec<FileInfo> {
348        let mut paths: Vec<_> = fs::read_dir(self.temp_dir.path())
349            .expect("read spill dir")
350            .filter_map(Result::ok)
351            .map(|entry| entry.path())
352            .collect();
353        paths.sort();
354
355        let mut files = Vec::new();
356        for path in paths {
357            let file = File::open(path).expect("open spill batch");
358            let mut decoder = zstd::Decoder::new(file).expect("create spill decoder");
359            let mut payload = Vec::new();
360            decoder.read_to_end(&mut payload).expect("read spill batch");
361            let mut batch: Vec<FileInfo> =
362                serde_json::from_slice(&payload).expect("decode spilled file batch");
363            files.append(&mut batch);
364        }
365        files
366    }
367}
368
369fn process_file(
370    path: &Path,
371    metadata: &fs::Metadata,
372    progress: &ScanProgress,
373    license_engine: Option<Arc<LicenseDetectionEngine>>,
374    license_options: LicenseScanOptions,
375    text_options: &TextDetectionOptions,
376) -> FileInfo {
377    let mut scan_errors: Vec<String> = vec![];
378    let mut file_info_builder = FileInfoBuilder::default();
379    let license_enabled = license_engine.is_some();
380
381    let started = Instant::now();
382
383    let mut generated_flag = None;
384    let mut is_source_file = false;
385    match extract_information_from_content(
386        &mut file_info_builder,
387        &mut scan_errors,
388        path,
389        progress,
390        license_engine,
391        license_options,
392        text_options,
393    ) {
394        Ok((is_generated, sha256, is_source)) => {
395            generated_flag = is_generated;
396            is_source_file = is_source;
397            let _ = sha256;
398        }
399        Err(e) => scan_errors.push(e.to_string()),
400    };
401
402    maybe_record_processing_timeout(&mut scan_errors, started, text_options.timeout_seconds);
403
404    let mut file_info = file_info_builder
405        .name(path.file_name().unwrap().to_string_lossy().to_string())
406        .base_name(
407            path.file_stem()
408                .unwrap_or_default()
409                .to_string_lossy()
410                .to_string(),
411        )
412        .extension(
413            path.extension()
414                .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
415        )
416        .path(path.to_string_lossy().to_string())
417        .file_type(FileType::File)
418        .size(metadata.len())
419        .date(
420            text_options
421                .collect_info
422                .then(|| get_creation_date(metadata))
423                .flatten(),
424        )
425        .scan_errors(scan_errors)
426        .build()
427        .expect("FileInformationBuild not completely initialized");
428
429    if text_options.collect_info {
430        file_info.is_source = Some(is_source_file);
431    }
432
433    if file_info.programming_language.as_deref() == Some("Go")
434        && is_go_non_production_source(path).unwrap_or(false)
435    {
436        file_info.is_source = Some(false);
437    }
438
439    if text_options.detect_generated {
440        file_info.is_generated = Some(generated_flag.unwrap_or(false));
441    }
442
443    if file_info.percentage_of_license_text.is_none() && license_enabled {
444        file_info.percentage_of_license_text = Some(0.0);
445    }
446
447    file_info
448}
449
450fn extract_information_from_content(
451    file_info_builder: &mut FileInfoBuilder,
452    scan_errors: &mut Vec<String>,
453    path: &Path,
454    progress: &ScanProgress,
455    license_engine: Option<Arc<LicenseDetectionEngine>>,
456    license_options: LicenseScanOptions,
457    text_options: &TextDetectionOptions,
458) -> Result<(Option<bool>, Sha256Digest, bool), Error> {
459    let started = Instant::now();
460    let buffer = fs::read(path)?;
461    let license_enabled = license_engine.is_some();
462
463    if is_timeout_exceeded(started, text_options.timeout_seconds) {
464        return Err(Error::msg(format!(
465            "Timeout while reading file content (> {:.2}s)",
466            text_options.timeout_seconds
467        )));
468    }
469
470    let sha256 = calculate_sha256(&buffer);
471    let is_generated = text_options
472        .detect_generated
473        .then(|| !generated_code_hints_from_bytes(&buffer).is_empty());
474    let classification = classify_file_info(path, &buffer);
475
476    if text_options.collect_info {
477        file_info_builder
478            .sha1(Some(calculate_sha1(&buffer)))
479            .md5(Some(calculate_md5(&buffer)))
480            .sha256(Some(sha256))
481            .programming_language(classification.programming_language.clone())
482            .mime_type(Some(classification.mime_type.clone()))
483            .file_type_label(Some(classification.file_type.clone()))
484            .sha1_git(Some(calculate_sha1_git(&buffer)))
485            .is_binary(Some(classification.is_binary))
486            .is_text(Some(classification.is_text))
487            .is_archive(Some(classification.is_archive))
488            .is_media(Some(classification.is_media))
489            .is_source(Some(classification.is_source))
490            .is_script(Some(classification.is_script))
491            .files_count(Some(0))
492            .dirs_count(Some(0))
493            .size_count(Some(0));
494    }
495
496    if should_skip_text_detection(path, &buffer) {
497        return Ok((is_generated, sha256, classification.is_source));
498    }
499
500    // Package parsing and text-based detection (copyright, license) are independent.
501    // Python ScanCode runs all enabled plugins on every file, so we do the same.
502    if text_options.detect_packages {
503        let started = Instant::now();
504        let parse_result = try_parse_file(path)
505            .or_else(|| {
506                text_options
507                    .detect_application_packages
508                    .then(|| try_parse_windows_executable_bytes(path, &buffer))
509                    .flatten()
510            })
511            .or_else(|| {
512                text_options
513                    .detect_packages_in_compiled
514                    .then(|| {
515                        (classification.is_binary && is_supported_compiled_binary_format(&buffer))
516                            .then(|| try_parse_compiled_bytes(&buffer))
517                            .flatten()
518                    })
519                    .flatten()
520            });
521
522        if let Some(parse_result) = parse_result {
523            let packages = parse_result
524                .packages
525                .into_iter()
526                .filter(|package| {
527                    let is_compiled_package = package
528                        .datasource_id
529                        .as_ref()
530                        .is_some_and(is_compiled_datasource);
531                    let is_system_package = package
532                        .datasource_id
533                        .as_ref()
534                        .is_some_and(is_system_datasource);
535                    if is_compiled_package {
536                        text_options.detect_packages_in_compiled
537                    } else if is_system_package {
538                        text_options.detect_system_packages
539                    } else {
540                        text_options.detect_application_packages
541                    }
542                })
543                .collect();
544            file_info_builder.package_data(packages);
545            scan_errors.extend(parse_result.scan_errors);
546        }
547        progress.record_detail_timing("scan:packages", started.elapsed().as_secs_f64());
548    }
549
550    if is_timeout_exceeded(started, text_options.timeout_seconds) {
551        return Err(Error::msg(format!(
552            "Timeout while extracting package/text metadata (> {:.2}s)",
553            text_options.timeout_seconds
554        )));
555    }
556
557    let (text_content, text_kind, text_scan_error) =
558        extract_text_for_detection_with_diagnostics(path, &buffer);
559    if let Some(text_scan_error) = text_scan_error {
560        scan_errors.push(text_scan_error);
561    }
562    let from_binary_strings = matches!(text_kind, ExtractedTextKind::BinaryStrings);
563
564    if is_timeout_exceeded(started, text_options.timeout_seconds) {
565        return Err(Error::msg(format!(
566            "Timeout while extracting text content (> {:.2}s)",
567            text_options.timeout_seconds
568        )));
569    }
570
571    if text_content.is_empty() {
572        return Ok((is_generated, sha256, classification.is_source));
573    }
574
575    if text_options.detect_copyrights {
576        extract_copyright_information(
577            file_info_builder,
578            path,
579            &text_content,
580            text_options.timeout_seconds,
581            from_binary_strings,
582        );
583    }
584    extract_email_url_information(
585        file_info_builder,
586        &text_content,
587        text_options,
588        from_binary_strings,
589    );
590
591    if is_timeout_exceeded(started, text_options.timeout_seconds) {
592        return Err(Error::msg(format!(
593            "Timeout before license scan (> {:.2}s)",
594            text_options.timeout_seconds
595        )));
596    }
597    // Handle source map files specially
598    let text_content_for_license_detection = if crate::utils::sourcemap::is_sourcemap(path) {
599        if let Some(sourcemap_content) =
600            crate::utils::sourcemap::extract_sourcemap_content(&text_content)
601        {
602            sourcemap_content
603        } else {
604            text_content
605        }
606    } else if should_remove_verbatim_escape_sequences(path, classification.is_source) {
607        remove_verbatim_escape_sequences(&text_content)
608    } else {
609        text_content
610    };
611    let text_content_for_license_detection =
612        augment_license_detection_text(path, &text_content_for_license_detection);
613    let text_content_for_license_detection = text_content_for_license_detection.into_owned();
614
615    if license_enabled {
616        let started = Instant::now();
617        extract_license_information(
618            file_info_builder,
619            scan_errors,
620            path,
621            text_content_for_license_detection.clone(),
622            license_engine,
623            license_options,
624            from_binary_strings,
625        )?;
626        progress.record_detail_timing("scan:licenses", started.elapsed().as_secs_f64());
627    } else {
628        extract_license_information(
629            file_info_builder,
630            scan_errors,
631            path,
632            text_content_for_license_detection,
633            license_engine,
634            license_options,
635            from_binary_strings,
636        )?;
637    }
638
639    Ok((is_generated, sha256, classification.is_source))
640}
641
/// Returns `true` when more than `timeout_seconds` have elapsed since `started`.
///
/// A timeout that is not finite (NaN, infinity) or not strictly positive disables the
/// check, so the function returns `false` for it.
fn is_timeout_exceeded(started: Instant, timeout_seconds: f64) -> bool {
    let timeout_disabled = !timeout_seconds.is_finite() || timeout_seconds <= 0.0;
    if timeout_disabled {
        return false;
    }

    let elapsed_seconds = started.elapsed().as_secs_f64();
    elapsed_seconds > timeout_seconds
}
647
648fn maybe_record_processing_timeout(
649    scan_errors: &mut Vec<String>,
650    started: Instant,
651    timeout_seconds: f64,
652) {
653    if is_timeout_exceeded(started, timeout_seconds)
654        && !scan_errors.iter().any(|error| is_timeout_scan_error(error))
655    {
656        scan_errors.push(format!(
657            "Processing interrupted due to timeout after {:.2} seconds",
658            timeout_seconds
659        ));
660    }
661}
662
/// Returns `true` when `error` contains one of the timeout message fragments.
///
/// Matching is case-sensitive substring matching.
fn is_timeout_scan_error(error: &str) -> bool {
    const TIMEOUT_FRAGMENTS: [&str; 3] = [
        "Timeout while ",
        "Timeout before ",
        "Processing interrupted due to timeout",
    ];

    TIMEOUT_FRAGMENTS
        .iter()
        .any(|&fragment| error.contains(fragment))
}
668
669fn is_system_datasource(datasource_id: &DatasourceId) -> bool {
670    matches!(
671        datasource_id,
672        DatasourceId::AlpineInstalledDb
673            | DatasourceId::DebianDistrolessInstalledDb
674            | DatasourceId::DebianInstalledFilesList
675            | DatasourceId::DebianInstalledMd5Sums
676            | DatasourceId::DebianInstalledStatusDb
677            | DatasourceId::FreebsdCompactManifest
678            | DatasourceId::RpmInstalledDatabaseBdb
679            | DatasourceId::RpmInstalledDatabaseNdb
680            | DatasourceId::RpmInstalledDatabaseSqlite
681            | DatasourceId::RpmYumdb
682    )
683}
684
685fn is_compiled_datasource(datasource_id: &DatasourceId) -> bool {
686    matches!(
687        datasource_id,
688        DatasourceId::GoBinary | DatasourceId::RustBinary
689    )
690}
691
692fn extract_copyright_information(
693    file_info_builder: &mut FileInfoBuilder,
694    path: &Path,
695    text_content: &str,
696    timeout_seconds: f64,
697    from_binary_strings: bool,
698) {
699    // CREDITS files get special handling (Linux kernel style).
700    if copyright::is_credits_file(path) {
701        let author_detections = copyright::detect_credits_authors(text_content);
702        if !author_detections.is_empty() {
703            file_info_builder.authors(
704                author_detections
705                    .into_iter()
706                    .map(|a| Author {
707                        author: a.author,
708                        start_line: a.start_line,
709                        end_line: a.end_line,
710                    })
711                    .collect(),
712            );
713            return;
714        }
715    }
716
717    let copyright_options = CopyrightDetectionOptions {
718        max_runtime: if timeout_seconds.is_finite() && timeout_seconds > 0.0 {
719            Some(Duration::from_secs_f64(timeout_seconds))
720        } else {
721            None
722        },
723        ..CopyrightDetectionOptions::default()
724    };
725
726    let (copyrights, holders, authors) =
727        copyright::detect_copyrights_with_options(text_content, &copyright_options);
728    let (copyrights, holders, authors) = if from_binary_strings {
729        prune_binary_string_detections(text_content, copyrights, holders, authors)
730    } else {
731        (copyrights, holders, authors)
732    };
733
734    file_info_builder.copyrights(
735        copyrights
736            .into_iter()
737            .map(|c| Copyright {
738                copyright: c.copyright,
739                start_line: c.start_line,
740                end_line: c.end_line,
741            })
742            .collect::<Vec<Copyright>>(),
743    );
744    file_info_builder.holders(
745        holders
746            .into_iter()
747            .map(|h| Holder {
748                holder: h.holder,
749                start_line: h.start_line,
750                end_line: h.end_line,
751            })
752            .collect::<Vec<Holder>>(),
753    );
754    file_info_builder.authors(
755        authors
756            .into_iter()
757            .map(|a| Author {
758                author: a.author,
759                start_line: a.start_line,
760                end_line: a.end_line,
761            })
762            .collect::<Vec<Author>>(),
763    );
764}
765
766fn prune_binary_string_detections(
767    text_content: &str,
768    copyrights: Vec<CopyrightDetection>,
769    holders: Vec<HolderDetection>,
770    authors: Vec<AuthorDetection>,
771) -> (
772    Vec<CopyrightDetection>,
773    Vec<HolderDetection>,
774    Vec<AuthorDetection>,
775) {
776    let kept_copyrights: Vec<CopyrightDetection> = copyrights
777        .into_iter()
778        .filter(|c| is_binary_string_copyright_candidate(&c.copyright))
779        .collect();
780
781    let kept_holders: Vec<HolderDetection> = holders
782        .into_iter()
783        .filter(|holder| {
784            kept_copyrights.iter().any(|copyright| {
785                ranges_overlap(
786                    holder.start_line,
787                    holder.end_line,
788                    copyright.start_line,
789                    copyright.end_line,
790                )
791            })
792        })
793        .collect();
794
795    let kept_authors = authors
796        .into_iter()
797        .filter(|author| is_binary_string_author_candidate(&author.author))
798        .chain(extract_binary_string_author_supplements(text_content))
799        .filter({
800            let mut seen = HashSet::new();
801            move |author| seen.insert(author.author.clone())
802        })
803        .collect();
804
805    (kept_copyrights, kept_holders, kept_authors)
806}
807
808fn ranges_overlap(
809    a_start: LineNumber,
810    a_end: LineNumber,
811    b_start: LineNumber,
812    b_end: LineNumber,
813) -> bool {
814    a_start <= b_end && b_start <= a_end
815}
816
817fn is_binary_string_copyright_candidate(text: &str) -> bool {
818    if contains_year(text) {
819        return true;
820    }
821
822    let trimmed = text.trim();
823    let lower = trimmed.to_ascii_lowercase();
824    let tail = if let Some(tail) = lower.strip_prefix("copyright") {
825        tail.trim()
826    } else {
827        lower.trim()
828    };
829    let original_tail = if lower.starts_with("copyright") {
830        trimmed["copyright".len()..].trim()
831    } else {
832        trimmed
833    };
834
835    if tail.is_empty() || !has_sufficient_alphabetic_content(tail) || has_excessive_at_noise(tail) {
836        return false;
837    }
838
839    let alpha_tokens: Vec<&str> = tail
840        .split_whitespace()
841        .filter(|token| token.chars().any(|c| c.is_alphabetic()))
842        .collect();
843
844    if alpha_tokens.len() <= 1 {
845        return has_explicit_copyright_marker(text)
846            && alpha_tokens.iter().any(|token| {
847                is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric()))
848            });
849    }
850
851    if !has_explicit_copyright_marker(text) {
852        return false;
853    }
854
855    has_binary_name_like_shape(original_tail)
856}
857
858fn extract_binary_string_author_supplements(text_content: &str) -> Vec<AuthorDetection> {
859    let mut authors = Vec::new();
860
861    for (line_index, line) in text_content.lines().enumerate() {
862        if let Some(author) = extract_named_author_from_binary_line(line) {
863            authors.push(AuthorDetection {
864                author,
865                start_line: LineNumber::from_0_indexed(line_index),
866                end_line: LineNumber::from_0_indexed(line_index),
867            });
868        }
869    }
870
871    authors
872}
873
874fn extract_named_author_from_binary_line(line: &str) -> Option<String> {
875    let line = line.trim();
876    if line.is_empty() {
877        return None;
878    }
879
880    let emails = finder::find_emails(
881        line,
882        &DetectionConfig {
883            max_emails: 4,
884            max_urls: 0,
885            unique: false,
886        },
887    );
888    let email = emails.first()?.email.as_str();
889    if !is_binary_string_email_candidate(email) {
890        return None;
891    }
892
893    let lower_line = line.to_ascii_lowercase();
894    let email_start = lower_line.find(email)?;
895    let raw_prefix = &line[..email_start];
896    let has_author_marker = contains_binary_author_marker(raw_prefix);
897    let prefix = take_suffix_after_last_author_marker(raw_prefix)?;
898    let prefix = prefix
899        .trim_start_matches(['*', '-', ':', ';', ',', '.', ' '])
900        .trim_end_matches(['<', '(', '[', ' ', ':', '-'])
901        .trim();
902
903    let (name, _) = split_name_email(prefix);
904    let name = name.or_else(|| {
905        let trimmed = prefix.trim_matches(|c: char| c == '<' || c == '(' || c == '[' || c == ' ');
906        (!trimmed.is_empty()).then(|| trimmed.to_string())
907    });
908
909    let Some(name) = name.map(|name| name.trim().to_string()) else {
910        if has_author_marker {
911            return Some(email.to_string());
912        }
913        return None;
914    };
915
916    if name.is_empty() && has_author_marker {
917        return Some(email.to_string());
918    }
919
920    if !has_binary_name_like_shape(&name) {
921        return None;
922    }
923
924    if line.contains(&format!("<{email}>")) {
925        Some(format!("{name} <{email}>"))
926    } else if line.contains(&format!("({email})")) {
927        Some(format!("{name} ({email})"))
928    } else {
929        Some(format!("{name} {email}"))
930    }
931}
932
/// Returns the trimmed text that follows the last occurrence of `marker` in `text`.
///
/// `text` is ASCII-lowercased before searching, so a lowercase `marker` matches
/// regardless of ASCII case. Returns `None` when the marker does not occur.
fn take_suffix_after_last_ascii_marker<'a>(text: &'a str, marker: &str) -> Option<&'a str> {
    // ASCII lowercasing does not move byte offsets, so the index applies to `text` too.
    text.to_ascii_lowercase()
        .rfind(marker)
        .map(|marker_start| text[marker_start + marker.len()..].trim())
}
938
939fn take_suffix_after_last_author_marker(text: &str) -> Option<&str> {
940    const MARKERS: &[&str] = &[
941        " patch author: ",
942        " patch author ",
943        " written by ",
944        " contributed by ",
945        " original work done by ",
946        " work done by ",
947        " thanks to ",
948        " review by ",
949        " by ",
950        " from ",
951    ];
952
953    MARKERS
954        .iter()
955        .filter_map(|marker| take_suffix_after_last_ascii_marker(text, marker))
956        .next()
957}
958
959fn contains_binary_author_marker(text: &str) -> bool {
960    take_suffix_after_last_author_marker(text).is_some()
961}
962
963fn has_binary_name_like_shape(text: &str) -> bool {
964    let trimmed = text.trim();
965    if trimmed.is_empty() || trimmed.contains(" - ") || trimmed.chars().any(|c| c.is_ascii_digit())
966    {
967        return false;
968    }
969
970    let tokens: Vec<&str> = trimmed
971        .split(|c: char| !c.is_ascii_alphabetic() && c != '.' && c != '\'')
972        .filter(|segment| segment.chars().any(|c| c.is_ascii_alphabetic()))
973        .collect();
974    if tokens.is_empty() {
975        return false;
976    }
977
978    let uppercase_like = tokens
979        .iter()
980        .filter(|token| {
981            let token = token.trim_matches('.');
982            token
983                .chars()
984                .find(|c| c.is_ascii_alphabetic())
985                .is_some_and(|c| c.is_ascii_uppercase())
986        })
987        .count();
988
989    uppercase_like >= 2 && uppercase_like * 2 >= tokens.len()
990        || tokens
991            .iter()
992            .any(|token| is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric())))
993}
994
/// Returns `true` when `text` has at least one ASCII alphanumeric character
/// and ASCII letters make up at least half of its ASCII alphanumerics.
fn has_sufficient_alphabetic_content(text: &str) -> bool {
    // Tally letters and digits in a single pass over the text.
    let (letters, digits) = text.chars().fold((0usize, 0usize), |(letters, digits), c| {
        if c.is_ascii_alphabetic() {
            (letters + 1, digits)
        } else if c.is_ascii_digit() {
            (letters, digits + 1)
        } else {
            (letters, digits)
        }
    });

    let alphanumerics = letters + digits;
    alphanumerics != 0 && letters * 2 >= alphanumerics
}
1004
/// Returns `true` when `text` contains three or more `@` characters.
fn has_excessive_at_noise(text: &str) -> bool {
    const NOISE_THRESHOLD: usize = 3;
    text.matches('@').count() >= NOISE_THRESHOLD
}
1008
/// Returns `true` when `text` contains `(c)`, `©`, or `copr`, ignoring ASCII case.
fn has_explicit_copyright_marker(text: &str) -> bool {
    const MARKERS: [&str; 3] = ["(c)", "©", "copr"];

    let folded = text.to_ascii_lowercase();
    MARKERS.iter().any(|marker| folded.contains(*marker))
}
1013
/// Returns `true` when `text` contains four consecutive ASCII digits that read
/// as a year in the 1900s or 2000s, i.e. `19xx` or `20xx`.
///
/// The match is purely positional: the digits may be part of a longer digit
/// run.
fn contains_year(text: &str) -> bool {
    text.as_bytes().windows(4).any(|window| {
        // The century digits must be checked as a pair; testing them
        // independently would also accept `10xx` and `29xx`, so sizes such as
        // "1024" would pass for a year.
        matches!((window[0], window[1]), (b'1', b'9') | (b'2', b'0'))
            && window[2].is_ascii_digit()
            && window[3].is_ascii_digit()
    })
}
1022
/// Reports whether `token` equals, ignoring ASCII case, one of the
/// organisation-style words listed below (for example `inc`, `ltd` or
/// `foundation`).
fn is_company_like_suffix(token: &str) -> bool {
    const COMPANY_WORDS: &[&str] = &[
        "inc",
        "corp",
        "corporation",
        "co",
        "company",
        "ltd",
        "llc",
        "gmbh",
        "foundation",
        "project",
        "systems",
        "software",
        "technologies",
        "technology",
    ];

    COMPANY_WORDS
        .iter()
        .any(|word| token.eq_ignore_ascii_case(word))
}
1042
1043fn extract_email_url_information(
1044    file_info_builder: &mut FileInfoBuilder,
1045    text_content: &str,
1046    text_options: &TextDetectionOptions,
1047    from_binary_strings: bool,
1048) {
1049    if !text_options.detect_emails && !text_options.detect_urls {
1050        return;
1051    }
1052
1053    if text_options.detect_emails {
1054        let config = DetectionConfig {
1055            max_emails: text_options.max_emails,
1056            max_urls: text_options.max_urls,
1057            unique: from_binary_strings,
1058        };
1059        let emails = finder::find_emails(text_content, &config)
1060            .into_iter()
1061            .filter(|d| !from_binary_strings || is_binary_string_email_candidate(&d.email))
1062            .map(|d| OutputEmail {
1063                email: d.email,
1064                start_line: d.start_line,
1065                end_line: d.end_line,
1066            })
1067            .collect::<Vec<_>>();
1068        file_info_builder.emails(emails);
1069    }
1070
1071    if text_options.detect_urls {
1072        let config = DetectionConfig {
1073            max_emails: text_options.max_emails,
1074            max_urls: text_options.max_urls,
1075            unique: true,
1076        };
1077        let urls = finder::find_urls(text_content, &config)
1078            .into_iter()
1079            .filter(|d| !from_binary_strings || is_binary_string_url_candidate(&d.url))
1080            .map(|d| OutputURL {
1081                url: d.url,
1082                start_line: d.start_line,
1083                end_line: d.end_line,
1084            })
1085            .collect::<Vec<_>>();
1086        file_info_builder.urls(urls);
1087    }
1088}
1089
1090fn is_binary_string_email_candidate(email: &str) -> bool {
1091    let Some((local, domain)) = email.rsplit_once('@') else {
1092        return false;
1093    };
1094
1095    if !has_strong_binary_local_part(local) {
1096        return false;
1097    }
1098
1099    has_strong_binary_host_shape(domain)
1100}
1101
1102fn is_binary_string_url_candidate(url: &str) -> bool {
1103    let parsed = url::Url::parse(url).ok();
1104    let Some(parsed) = parsed else {
1105        return false;
1106    };
1107    let Some(host) = parsed.host_str() else {
1108        return false;
1109    };
1110
1111    has_strong_binary_host_shape(host) && has_meaningful_binary_url_context(&parsed)
1112}
1113
1114fn is_binary_string_author_candidate(author: &str) -> bool {
1115    let trimmed = author.trim();
1116    if trimmed.is_empty()
1117        || !has_sufficient_alphabetic_content(trimmed)
1118        || has_excessive_at_noise(trimmed)
1119    {
1120        return false;
1121    }
1122
1123    if trimmed.contains('@') {
1124        let emails = finder::find_emails(
1125            trimmed,
1126            &DetectionConfig {
1127                max_emails: 4,
1128                max_urls: 0,
1129                unique: true,
1130            },
1131        );
1132        if emails.len() > 1 {
1133            return false;
1134        }
1135
1136        if let Some(extracted) = extract_named_author_from_binary_line(trimmed) {
1137            return !extracted.is_empty();
1138        }
1139
1140        let Some(email) = emails.first().map(|d| d.email.as_str()) else {
1141            return false;
1142        };
1143        if !is_binary_string_email_candidate(email) {
1144            return false;
1145        }
1146
1147        let (name, _) = split_name_email(trimmed);
1148        return name.as_deref().is_some_and(has_binary_name_like_shape);
1149    }
1150
1151    has_binary_name_like_shape(trimmed)
1152}
1153
1154fn has_meaningful_binary_url_context(parsed: &url::Url) -> bool {
1155    if parsed.path() != "/"
1156        && parsed
1157            .path()
1158            .split('/')
1159            .any(|segment| segment.chars().any(|c| c.is_ascii_alphabetic()) && segment.len() >= 2)
1160    {
1161        return true;
1162    }
1163
1164    if parsed.query().is_some() || parsed.fragment().is_some() {
1165        return true;
1166    }
1167
1168    let Some(host) = parsed.host_str() else {
1169        return false;
1170    };
1171
1172    let labels: Vec<&str> = host.split('.').collect();
1173    if labels.len() > 2 {
1174        return labels[..labels.len() - 1].iter().any(|label| {
1175            label.len() >= 3 && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 3
1176        });
1177    }
1178
1179    if matches!(labels.first(), Some(&"www")) {
1180        return true;
1181    }
1182
1183    if labels.len() == 2 {
1184        let domain = labels[0];
1185        let tld = labels[1];
1186        if domain.len() >= 8 && matches!(tld, "org" | "edu" | "gov" | "mil" | "io" | "dev") {
1187            return true;
1188        }
1189    }
1190
1191    labels
1192        .iter()
1193        .take(labels.len().saturating_sub(1))
1194        .any(|label| {
1195            label.contains('-') && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 4
1196        })
1197}
1198
/// Returns `true` when `local` contains a run of at least three consecutive
/// ASCII letters.
fn has_strong_binary_local_part(local: &str) -> bool {
    let mut run_length = 0usize;
    for c in local.chars() {
        if c.is_ascii_alphabetic() {
            run_length += 1;
            if run_length >= 3 {
                return true;
            }
        } else {
            // Any other character ends the current run of letters.
            run_length = 0;
        }
    }
    false
}
1204
/// Heuristic check that `host` is shaped like a real host name.
///
/// A leading `www` or `ftp` label is dropped and the final label is ignored.
/// The host qualifies when at least one of the remaining labels contains
/// three or more ASCII letters; a host with no label left after that
/// trimming is rejected.
fn has_strong_binary_host_shape(host: &str) -> bool {
    let labels: Vec<&str> = host.split('.').collect();

    let significant = match labels.as_slice() {
        ["www" | "ftp", rest @ ..] => rest,
        all => all,
    };

    // Everything except the final label is a candidate; an empty candidate
    // list (fewer than two significant labels) can never match.
    let Some((_last, candidates)) = significant.split_last() else {
        return false;
    };

    candidates
        .iter()
        .any(|label| label.bytes().filter(u8::is_ascii_alphabetic).count() >= 3)
}
1225
1226fn extract_license_information(
1227    file_info_builder: &mut FileInfoBuilder,
1228    scan_errors: &mut Vec<String>,
1229    path: &Path,
1230    text_content: String,
1231    license_engine: Option<Arc<LicenseDetectionEngine>>,
1232    license_options: LicenseScanOptions,
1233    from_binary_strings: bool,
1234) -> Result<(), Error> {
1235    let Some(engine) = license_engine else {
1236        return Ok(());
1237    };
1238
1239    let detection_result = if license_options.min_score == 0 {
1240        engine.detect_with_kind_and_source(
1241            &text_content,
1242            license_options.unknown_licenses,
1243            from_binary_strings,
1244            &path.to_string_lossy(),
1245        )
1246    } else {
1247        engine.detect_with_kind_and_source_with_score(
1248            &text_content,
1249            license_options.unknown_licenses,
1250            from_binary_strings,
1251            &path.to_string_lossy(),
1252            license_options.min_score as f32,
1253        )
1254    };
1255
1256    match detection_result {
1257        Ok(detections) => {
1258            let query =
1259                Query::from_extracted_text(&text_content, engine.index(), from_binary_strings).ok();
1260            let mut model_detections = Vec::new();
1261            let mut model_clues = Vec::new();
1262
1263            for detection in &detections {
1264                let (public_detection, clue_matches) = convert_detection_to_model(
1265                    detection,
1266                    license_options,
1267                    &text_content,
1268                    query.as_ref(),
1269                );
1270
1271                if let Some(public_detection) = public_detection {
1272                    model_detections.push(public_detection);
1273                }
1274
1275                model_clues.extend(clue_matches);
1276            }
1277
1278            if !model_detections.is_empty() {
1279                let expressions: Vec<String> = model_detections
1280                    .iter()
1281                    .filter(|d| !d.license_expression_spdx.is_empty())
1282                    .map(|d| d.license_expression_spdx.clone())
1283                    .collect();
1284
1285                if !expressions.is_empty() {
1286                    let combined = crate::utils::spdx::combine_license_expressions(expressions);
1287                    if let Some(expr) = combined {
1288                        file_info_builder.license_expression(Some(expr));
1289                    }
1290                }
1291            }
1292
1293            file_info_builder.license_detections(model_detections);
1294            file_info_builder.license_clues(model_clues);
1295            file_info_builder.percentage_of_license_text(
1296                query
1297                    .as_ref()
1298                    .map(|query| compute_percentage_of_license_text(query, &detections)),
1299            );
1300        }
1301        Err(e) => {
1302            scan_errors.push(format!("License detection failed: {}", e));
1303        }
1304    }
1305
1306    Ok(())
1307}
1308
1309fn convert_detection_to_model(
1310    detection: &crate::license_detection::LicenseDetection,
1311    license_options: LicenseScanOptions,
1312    text_content: &str,
1313    query: Option<&Query<'_>>,
1314) -> (Option<LicenseDetection>, Vec<Match>) {
1315    let matches: Vec<Match> = detection
1316        .matches
1317        .iter()
1318        .map(|m| convert_match_to_model(m, license_options, text_content, query))
1319        .collect();
1320
1321    if let Some(license_expression) = detection.license_expression.clone() {
1322        (
1323            Some(LicenseDetection {
1324                license_expression,
1325                license_expression_spdx: detection
1326                    .license_expression_spdx
1327                    .clone()
1328                    .unwrap_or_default(),
1329                matches,
1330                detection_log: if license_options.include_diagnostics {
1331                    detection.detection_log.clone()
1332                } else {
1333                    Vec::new()
1334                },
1335                identifier: detection.identifier.clone(),
1336            }),
1337            Vec::new(),
1338        )
1339    } else {
1340        (None, matches)
1341    }
1342}
1343
1344fn convert_match_to_model(
1345    m: &crate::license_detection::models::LicenseMatch,
1346    license_options: LicenseScanOptions,
1347    text_content: &str,
1348    query: Option<&Query<'_>>,
1349) -> Match {
1350    let rule_url = if m.rule_url.is_empty() {
1351        None
1352    } else {
1353        Some(m.rule_url.clone())
1354    };
1355    let matched_text = if license_options.include_text {
1356        m.matched_text.clone().or_else(|| {
1357            Some(crate::license_detection::query::matched_text_from_text(
1358                text_content,
1359                m.start_line.get(),
1360                m.end_line.get(),
1361            ))
1362        })
1363    } else {
1364        None
1365    };
1366    let matched_text_diagnostics = if license_options.include_text_diagnostics {
1367        query.map(|query| matched_text_diagnostics_from_match(query, m))
1368    } else {
1369        None
1370    };
1371    Match {
1372        license_expression: m.license_expression.clone(),
1373        license_expression_spdx: m.license_expression_spdx.clone().unwrap_or_default(),
1374        from_file: m.from_file.clone(),
1375        start_line: m.start_line,
1376        end_line: m.end_line,
1377        matcher: Some(m.matcher.to_string()),
1378        score: m.score,
1379        matched_length: Some(m.matched_length),
1380        match_coverage: Some(((m.coverage() as f64) * 100.0).round() / 100.0),
1381        rule_relevance: Some(m.rule_relevance),
1382        rule_identifier: Some(m.rule_identifier.clone()),
1383        rule_url,
1384        matched_text,
1385        referenced_filenames: m.referenced_filenames.clone(),
1386        matched_text_diagnostics,
1387    }
1388}
1389
1390fn compute_percentage_of_license_text(
1391    query: &Query<'_>,
1392    detections: &[crate::license_detection::LicenseDetection],
1393) -> f64 {
1394    let matched_positions: std::collections::HashSet<usize> = detections
1395        .iter()
1396        .flat_map(|detection| detection.matches.iter())
1397        .flat_map(|m| m.query_span().iter())
1398        .collect();
1399
1400    let query_tokens_length = query.tokens.len() + query.unknowns_by_pos.values().sum::<usize>();
1401    if query_tokens_length == 0 {
1402        return 0.0;
1403    }
1404
1405    let percentage = (matched_positions.len() as f64 / query_tokens_length as f64) * 100.0;
1406    (percentage * 100.0).round() / 100.0
1407}
1408
1409fn matched_text_diagnostics_from_match(
1410    query: &Query<'_>,
1411    license_match: &InternalLicenseMatch,
1412) -> String {
1413    let matched_positions: PositionSet = license_match.query_span().iter().collect();
1414    let Some(start_pos) = matched_positions.iter().min() else {
1415        return crate::license_detection::query::matched_text_from_text(
1416            &query.text,
1417            license_match.start_line.get(),
1418            license_match.end_line.get(),
1419        );
1420    };
1421    let Some(end_pos) = matched_positions.iter().max() else {
1422        return crate::license_detection::query::matched_text_from_text(
1423            &query.text,
1424            license_match.start_line.get(),
1425            license_match.end_line.get(),
1426        );
1427    };
1428
1429    crate::license_detection::query::matched_text_diagnostics_from_text(
1430        &query.text,
1431        query,
1432        &matched_positions,
1433        start_pos,
1434        end_pos,
1435        license_match.start_line.get(),
1436        license_match.end_line.get(),
1437    )
1438}
1439
1440fn should_skip_text_detection(path: &Path, buffer: &[u8]) -> bool {
1441    is_pem_certificate_file(path, buffer)
1442}
1443
/// Classifies a path as non-production (test) Go source.
///
/// Returns `Ok(false)` for any path whose extension is not `go`, and
/// `Ok(true)` for a file name ending in `_test.go`; neither case touches the
/// file system. Otherwise the file is read and the result is `true` when one
/// of its first ten lines is a `//go:build` or `// +build` line containing
/// `test` as a whitespace-separated token.
///
/// # Errors
///
/// Propagates the error from reading the file as UTF-8 text.
fn is_go_non_production_source(path: &Path) -> std::io::Result<bool> {
    let is_go_file = matches!(path.extension().and_then(|ext| ext.to_str()), Some("go"));
    if !is_go_file {
        return Ok(false);
    }

    if let Some(file_name) = path.file_name().and_then(|name| name.to_str()) {
        if file_name.ends_with("_test.go") {
            return Ok(true);
        }
    }

    let source = fs::read_to_string(path)?;
    for line in source.lines().take(10) {
        let line = line.trim();
        let is_build_constraint = line.starts_with("//go:build") || line.starts_with("// +build");
        if is_build_constraint && line.split_whitespace().any(|token| token == "test") {
            return Ok(true);
        }
    }

    Ok(false)
}
1464
1465fn is_pem_certificate_file(_path: &Path, buffer: &[u8]) -> bool {
1466    let prefix_len = buffer.len().min(8192);
1467    let prefix = String::from_utf8_lossy(&buffer[..prefix_len]);
1468    let trimmed_lines: Vec<&str> = prefix
1469        .lines()
1470        .map(str::trim)
1471        .filter(|line| !line.is_empty())
1472        .take(64)
1473        .collect();
1474
1475    let Some(first_line) = trimmed_lines.first().copied() else {
1476        return false;
1477    };
1478
1479    PEM_CERTIFICATE_HEADERS
1480        .iter()
1481        .any(|(begin, end)| first_line == *begin && trimmed_lines.iter().any(|line| line == end))
1482}
1483
1484fn process_directory(
1485    path: &Path,
1486    _metadata: &fs::Metadata,
1487    collect_info: bool,
1488    license_enabled: bool,
1489) -> FileInfo {
1490    let name = path
1491        .file_name()
1492        .unwrap_or_default()
1493        .to_string_lossy()
1494        .to_string();
1495    let base_name = name.clone(); // For directories, base_name is the same as name
1496
1497    FileInfo {
1498        name,
1499        base_name,
1500        extension: "".to_string(),
1501        path: path.to_string_lossy().to_string(),
1502        file_type: FileType::Directory,
1503        mime_type: None,
1504        file_type_label: None,
1505        size: 0,
1506        date: None,
1507        sha1: None,
1508        md5: None,
1509        sha256: None,
1510        sha1_git: None,
1511        programming_language: None,
1512        package_data: Vec::new(),
1513        license_expression: None,
1514        license_detections: Vec::new(),
1515        license_clues: Vec::new(),
1516        percentage_of_license_text: license_enabled.then_some(0.0),
1517        copyrights: Vec::new(),
1518        holders: Vec::new(),
1519        authors: Vec::new(),
1520        emails: Vec::new(),
1521        urls: Vec::new(),
1522        for_packages: Vec::new(),
1523        scan_errors: Vec::new(),
1524        license_policy: None,
1525        is_binary: collect_info.then_some(false),
1526        is_text: collect_info.then_some(false),
1527        is_archive: collect_info.then_some(false),
1528        is_media: collect_info.then_some(false),
1529        is_source: collect_info.then_some(false),
1530        is_script: collect_info.then_some(false),
1531        files_count: collect_info.then_some(0),
1532        dirs_count: collect_info.then_some(0),
1533        size_count: collect_info.then_some(0),
1534        source_count: None,
1535        is_legal: false,
1536        is_manifest: false,
1537        is_readme: false,
1538        is_top_level: false,
1539        is_key_file: false,
1540        is_community: false,
1541        is_generated: None,
1542        facets: vec![],
1543        tallies: None,
1544    }
1545}
1546
1547#[cfg(test)]
1548mod tests {
1549    use super::{
1550        compute_percentage_of_license_text, convert_detection_to_model,
1551        extract_email_url_information, extract_named_author_from_binary_line,
1552        is_binary_string_author_candidate, is_binary_string_copyright_candidate,
1553        is_binary_string_email_candidate, is_binary_string_url_candidate,
1554        is_go_non_production_source, process_file,
1555    };
1556    use crate::license_detection::LicenseDetection as InternalLicenseDetection;
1557    use crate::license_detection::index::LicenseIndex;
1558    use crate::license_detection::index::dictionary::TokenDictionary;
1559    use crate::license_detection::models::position_span::PositionSpan;
1560    use crate::license_detection::models::{LicenseMatch, MatchCoordinates, MatcherKind, RuleKind};
1561    use crate::license_detection::query::Query;
1562    use crate::models::{FileInfoBuilder, FileType, MatchScore};
1563    use crate::progress::{ProgressMode, ScanProgress};
1564    use crate::scanner::scan_options_fingerprint;
1565    use crate::scanner::{LicenseScanOptions, TextDetectionOptions};
1566    use std::fs;
1567    use std::time::{Duration, Instant};
1568    use tempfile::tempdir;
1569
1570    use super::maybe_record_processing_timeout;
1571
1572    use crate::models::LineNumber;
1573
1574    fn make_internal_match(rule_url: &str) -> LicenseMatch {
1575        LicenseMatch {
1576            rid: 0,
1577            license_expression: "mit".to_string(),
1578            license_expression_spdx: Some("MIT".to_string()),
1579            from_file: None,
1580            start_line: LineNumber::ONE,
1581            end_line: LineNumber::ONE,
1582            start_token: 0,
1583            end_token: 1,
1584            matcher: MatcherKind::Hash,
1585            score: MatchScore::from_percentage(1.0),
1586            matched_length: 3,
1587            rule_length: 3,
1588            match_coverage: 100.0,
1589            rule_relevance: 100,
1590            rule_identifier: "mit.LICENSE".to_string(),
1591            rule_url: rule_url.to_string(),
1592            matched_text: Some("MIT".to_string()),
1593            referenced_filenames: None,
1594            rule_kind: RuleKind::Text,
1595            is_from_license: true,
1596            rule_start_token: 0,
1597            coordinates: MatchCoordinates::query_region(PositionSpan::empty()),
1598            candidate_resemblance: 0.0,
1599            candidate_containment: 0.0,
1600        }
1601    }
1602
1603    fn make_detection(rule_url: &str) -> InternalLicenseDetection {
1604        InternalLicenseDetection {
1605            license_expression: Some("mit".to_string()),
1606            license_expression_spdx: Some("MIT".to_string()),
1607            matches: vec![make_internal_match(rule_url)],
1608            detection_log: vec![],
1609            identifier: Some("mit-test".to_string()),
1610            file_regions: Vec::new(),
1611        }
1612    }
1613
1614    fn create_test_index(entries: &[(&str, u16)], len_legalese: usize) -> LicenseIndex {
1615        let dictionary = TokenDictionary::new_with_legalese(entries);
1616        let mut index = LicenseIndex::new(dictionary);
1617        index.len_legalese = len_legalese;
1618        index
1619    }
1620
1621    #[test]
1622    fn test_convert_detection_to_model_preserves_rule_url() {
1623        let detection = make_detection(
1624            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE",
1625        );
1626
1627        let (converted, clues) =
1628            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
1629        let converted = converted.expect("detection should convert");
1630
1631        assert_eq!(
1632            converted.matches[0].rule_url.as_deref(),
1633            Some(
1634                "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE"
1635            )
1636        );
1637        assert!(clues.is_empty());
1638    }
1639
1640    #[test]
1641    fn test_convert_detection_to_model_emits_null_for_empty_rule_url() {
1642        let detection = make_detection("");
1643
1644        let (converted, clues) =
1645            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
1646        let converted = converted.expect("detection should convert");
1647
1648        assert_eq!(converted.matches[0].rule_url, None);
1649        assert!(clues.is_empty());
1650    }
1651
1652    #[test]
1653    fn test_convert_detection_to_model_rounds_match_coverage() {
1654        let mut detection = make_detection("");
1655        detection.matches[0].score = MatchScore::from_percentage(81.82);
1656        detection.matches[0].match_coverage = 33.334;
1657
1658        let (converted, clues) =
1659            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
1660        let converted = converted.expect("detection should convert");
1661
1662        assert_eq!(
1663            converted.matches[0].score,
1664            MatchScore::from_percentage(81.82)
1665        );
1666        assert_eq!(converted.matches[0].match_coverage, Some(33.33));
1667        assert!(clues.is_empty());
1668    }
1669
1670    #[test]
1671    fn test_convert_detection_to_model_routes_expressionless_detection_to_license_clues() {
1672        let mut detection = make_detection(
1673            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/license-clue_1.RULE",
1674        );
1675        detection.license_expression = None;
1676        detection.license_expression_spdx = None;
1677        detection.identifier = None;
1678        detection.matches[0].license_expression = "unknown-license-reference".to_string();
1679        detection.matches[0].license_expression_spdx =
1680            Some("LicenseRef-scancode-unknown-license-reference".to_string());
1681        detection.matches[0].rule_identifier = "license-clue_1.RULE".to_string();
1682        detection.matches[0].rule_kind = RuleKind::Clue;
1683
1684        let (converted, clues) = convert_detection_to_model(
1685            &detection,
1686            LicenseScanOptions {
1687                include_text: true,
1688                min_score: 0,
1689                ..LicenseScanOptions::default()
1690            },
1691            "clue text",
1692            None,
1693        );
1694
1695        assert!(converted.is_none());
1696        assert_eq!(clues.len(), 1);
1697        assert_eq!(clues[0].license_expression, "unknown-license-reference");
1698        assert_eq!(
1699            clues[0].license_expression_spdx,
1700            "LicenseRef-scancode-unknown-license-reference"
1701        );
1702        assert_eq!(
1703            clues[0].rule_identifier.as_deref(),
1704            Some("license-clue_1.RULE")
1705        );
1706        assert_eq!(clues[0].matched_text.as_deref(), Some("MIT"));
1707        assert_eq!(clues[0].matched_text_diagnostics, None);
1708    }
1709
1710    #[test]
1711    fn test_process_file_suppresses_non_actionable_pdf_extraction_failure() {
1712        let dir = tempdir().expect("tempdir");
1713        let path = dir.path().join("broken.pdf");
1714        fs::write(&path, b"%PDF-1.7\nthis is not a valid pdf object graph\n")
1715            .expect("write malformed pdf");
1716        let metadata = fs::metadata(&path).expect("metadata");
1717        let progress = ScanProgress::new(ProgressMode::Quiet);
1718
1719        let file_info = process_file(
1720            &path,
1721            &metadata,
1722            &progress,
1723            None,
1724            LicenseScanOptions::default(),
1725            &TextDetectionOptions::default(),
1726        );
1727
1728        assert!(file_info.scan_errors.is_empty());
1729    }
1730
1731    #[test]
1732    fn test_processing_timeout_is_not_duplicated_after_stage_specific_timeout() {
1733        let started = Instant::now() - Duration::from_secs(2);
1734        let mut scan_errors = vec!["Timeout before license scan (> 1.00s)".to_string()];
1735
1736        maybe_record_processing_timeout(&mut scan_errors, started, 1.0);
1737
1738        assert_eq!(scan_errors, vec!["Timeout before license scan (> 1.00s)"]);
1739    }
1740
1741    #[test]
1742    fn test_processing_timeout_is_recorded_when_no_timeout_error_exists() {
1743        let started = Instant::now() - Duration::from_secs(2);
1744        let mut scan_errors = Vec::new();
1745
1746        maybe_record_processing_timeout(&mut scan_errors, started, 1.0);
1747
1748        assert_eq!(
1749            scan_errors,
1750            vec!["Processing interrupted due to timeout after 1.00 seconds"]
1751        );
1752    }
1753
1754    #[test]
1755    fn test_convert_detection_to_model_includes_diagnostics_when_enabled() {
1756        let text = concat!(
1757            "Reproduction and distribution of this file, with or without modification, are\n",
1758            "permitted in any medium without royalties provided the copyright notice\n",
1759            "and this notice are preserved. This file is offered as-is, without any warranties.\n",
1760        );
1761        let index = create_test_index(
1762            &[
1763                ("reproduction", 0),
1764                ("distribution", 1),
1765                ("file", 2),
1766                ("without", 3),
1767                ("modification", 4),
1768                ("permitted", 5),
1769                ("medium", 6),
1770                ("royalties", 7),
1771                ("provided", 8),
1772                ("copyright", 9),
1773                ("notice", 10),
1774                ("preserved", 11),
1775                ("offered", 12),
1776                ("warranties", 13),
1777            ],
1778            14,
1779        );
1780        let query = Query::from_extracted_text(text, &index, false).expect("query should build");
1781        let mut detection = make_detection(
1782            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/fsf-ap.LICENSE",
1783        );
1784        detection.detection_log = vec!["imperfect-match-coverage".to_string()];
1785        detection.matches[0].license_expression = "fsf-ap".to_string();
1786        detection.matches[0].license_expression_spdx = Some("FSFAP".to_string());
1787        detection.matches[0].rule_identifier = "fsf-ap.LICENSE".to_string();
1788        detection.matches[0].matched_text = None;
1789        detection.matches[0].start_line = LineNumber::ONE;
1790        detection.matches[0].end_line = LineNumber::new(3).unwrap();
1791        detection.matches[0].start_token = 0;
1792        detection.matches[0].end_token = query.tokens.len();
1793        detection.matches[0].coordinates =
1794            MatchCoordinates::query_region(PositionSpan::from_positions(
1795                query
1796                    .tokens
1797                    .iter()
1798                    .enumerate()
1799                    .filter_map(|(idx, _)| (idx != 9).then_some(idx))
1800                    .collect::<Vec<_>>(),
1801            ));
1802        detection.identifier = Some("fsf_ap-test".to_string());
1803
1804        let (converted, clues) = convert_detection_to_model(
1805            &detection,
1806            LicenseScanOptions {
1807                include_text: true,
1808                include_text_diagnostics: true,
1809                include_diagnostics: true,
1810                unknown_licenses: false,
1811                min_score: 0,
1812            },
1813            text,
1814            Some(&query),
1815        );
1816        let converted = converted.expect("detection should convert");
1817
1818        assert!(clues.is_empty());
1819        assert_eq!(converted.detection_log, vec!["imperfect-match-coverage"]);
1820        assert_eq!(
1821            converted.matches[0].matched_text.as_deref(),
1822            Some(text.trim_end())
1823        );
1824        let diagnostics = converted.matches[0]
1825            .matched_text_diagnostics
1826            .as_deref()
1827            .expect("diagnostics should be present");
1828        assert!(diagnostics.contains('['));
1829        assert!(diagnostics.contains(']'));
1830        assert_ne!(diagnostics, text.trim_end());
1831    }
1832
1833    #[test]
1834    fn test_extract_email_url_information_skips_binary_string_text() {
1835        let mut builder = FileInfoBuilder::default();
1836        let options = TextDetectionOptions {
1837            collect_info: false,
1838            detect_packages: false,
1839            detect_application_packages: false,
1840            detect_system_packages: false,
1841            detect_packages_in_compiled: false,
1842            detect_copyrights: false,
1843            detect_generated: false,
1844            detect_emails: true,
1845            detect_urls: true,
1846            max_emails: 50,
1847            max_urls: 50,
1848            timeout_seconds: 120.0,
1849        };
1850
1851        extract_email_url_information(
1852            &mut builder,
1853            "contact 6h@fo.lwft and visit http://gmail.com/",
1854            &options,
1855            true,
1856        );
1857
1858        let file = builder
1859            .name("binary.bin".to_string())
1860            .base_name("binary".to_string())
1861            .extension(".bin".to_string())
1862            .path("binary.bin".to_string())
1863            .file_type(FileType::File)
1864            .size(1)
1865            .build()
1866            .expect("builder should produce file info");
1867
1868        assert!(file.emails.is_empty(), "emails: {:?}", file.emails);
1869        assert!(file.urls.is_empty(), "urls: {:?}", file.urls);
1870    }
1871
1872    #[test]
1873    fn test_extract_email_url_information_keeps_good_binary_contacts() {
1874        let mut builder = FileInfoBuilder::default();
1875        let options = TextDetectionOptions {
1876            collect_info: false,
1877            detect_packages: false,
1878            detect_application_packages: false,
1879            detect_system_packages: false,
1880            detect_packages_in_compiled: false,
1881            detect_copyrights: false,
1882            detect_generated: false,
1883            detect_emails: true,
1884            detect_urls: true,
1885            max_emails: 50,
1886            max_urls: 50,
1887            timeout_seconds: 120.0,
1888        };
1889
1890        extract_email_url_information(
1891            &mut builder,
1892            "report bugs to bug-coreutils@gnu.org and see https://www.gnu.org/software/coreutils/",
1893            &options,
1894            true,
1895        );
1896
1897        let file = builder
1898            .name("binary.bin".to_string())
1899            .base_name("binary".to_string())
1900            .extension(".bin".to_string())
1901            .path("binary.bin".to_string())
1902            .file_type(FileType::File)
1903            .size(1)
1904            .build()
1905            .expect("builder should produce file info");
1906
1907        assert_eq!(file.emails.len(), 1, "emails: {:?}", file.emails);
1908        assert_eq!(file.emails[0].email, "bug-coreutils@gnu.org");
1909        assert_eq!(file.urls.len(), 1, "urls: {:?}", file.urls);
1910        assert_eq!(file.urls[0].url, "https://www.gnu.org/software/coreutils/");
1911    }
1912
1913    #[test]
1914    fn test_extract_email_url_information_deduplicates_binary_emails_before_cap() {
1915        let mut builder = FileInfoBuilder::default();
1916        let options = TextDetectionOptions {
1917            collect_info: false,
1918            detect_packages: false,
1919            detect_application_packages: false,
1920            detect_system_packages: false,
1921            detect_packages_in_compiled: false,
1922            detect_copyrights: false,
1923            detect_generated: false,
1924            detect_emails: true,
1925            detect_urls: false,
1926            max_emails: 2,
1927            max_urls: 50,
1928            timeout_seconds: 120.0,
1929        };
1930
1931        extract_email_url_information(
1932            &mut builder,
1933            "first jakub@redhat.com second jakub@redhat.com third contyk@redhat.com",
1934            &options,
1935            true,
1936        );
1937
1938        let file = builder
1939            .name("binary.bin".to_string())
1940            .base_name("binary".to_string())
1941            .extension(".bin".to_string())
1942            .path("binary.bin".to_string())
1943            .file_type(FileType::File)
1944            .size(1)
1945            .build()
1946            .expect("builder should produce file info");
1947
1948        assert_eq!(file.emails.len(), 2, "emails: {:?}", file.emails);
1949        assert_eq!(file.emails[0].email, "jakub@redhat.com");
1950        assert_eq!(file.emails[1].email, "contyk@redhat.com");
1951    }
1952
1953    #[test]
1954    fn test_binary_string_copyright_candidate_rejects_gibberish_holder_text() {
1955        let gibberish = "(c) S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9) M0@9s J'@y DH@9Ih@y";
1956        assert!(!is_binary_string_copyright_candidate(gibberish));
1957    }
1958
1959    #[test]
1960    fn test_binary_string_copyright_candidate_keeps_real_notice() {
1961        let notice = "Copyright nexB and others (c) 2012";
1962        assert!(is_binary_string_copyright_candidate(notice));
1963    }
1964
1965    #[test]
1966    fn test_binary_string_copyright_candidate_rejects_changelog_phrase() {
1967        assert!(!is_binary_string_copyright_candidate(
1968            "Copyright - split out libs"
1969        ));
1970    }
1971
1972    #[test]
1973    fn test_binary_string_email_candidate_rejects_gibberish() {
1974        assert!(!is_binary_string_email_candidate("6h@fo.lwft"));
1975    }
1976
1977    #[test]
1978    fn test_binary_string_email_candidate_keeps_gnu_bug_address() {
1979        assert!(is_binary_string_email_candidate("bug-coreutils@gnu.org"));
1980    }
1981
1982    #[test]
1983    fn test_binary_string_url_candidate_rejects_short_fake_host() {
1984        assert!(!is_binary_string_url_candidate("http://ftp.so/"));
1985    }
1986
1987    #[test]
1988    fn test_binary_string_url_candidate_keeps_gnu_help_url() {
1989        assert!(is_binary_string_url_candidate(
1990            "https://www.gnu.org/software/coreutils/"
1991        ));
1992    }
1993
1994    #[test]
1995    fn test_binary_string_url_candidate_rejects_bare_root_domain() {
1996        assert!(!is_binary_string_url_candidate("http://gmail.com/"));
1997    }
1998
1999    #[test]
2000    fn test_binary_string_url_candidate_keeps_project_subdomain_root() {
2001        assert!(is_binary_string_url_candidate("http://gcc.gnu.org"));
2002    }
2003
2004    #[test]
2005    fn test_binary_string_url_candidate_keeps_long_org_root_domain() {
2006        assert!(is_binary_string_url_candidate("https://publicsuffix.org/"));
2007    }
2008
2009    #[test]
2010    fn test_binary_string_url_candidate_keeps_short_project_path() {
2011        assert!(is_binary_string_url_candidate("http://tukaani.org/xz/"));
2012    }
2013
2014    #[test]
2015    fn test_binary_string_author_candidate_keeps_named_author_with_email() {
2016        assert!(is_binary_string_author_candidate(
2017            "Andreas Schneider <asn@redhat.com>"
2018        ));
2019    }
2020
2021    #[test]
2022    fn test_binary_string_author_candidate_rejects_gibberish() {
2023        assert!(!is_binary_string_author_candidate(
2024            "S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9"
2025        ));
2026    }
2027
2028    #[test]
2029    fn test_binary_string_author_candidate_rejects_changelog_phrase() {
2030        assert!(!is_binary_string_author_candidate(
2031            "Developers can enable them. - revert news user back to"
2032        ));
2033    }
2034
2035    #[test]
2036    fn test_extract_named_author_from_binary_line_recovers_by_prefix() {
2037        assert_eq!(
2038            extract_named_author_from_binary_line("Patch by Andreas Schneider <asn@redhat.com>"),
2039            Some("Andreas Schneider <asn@redhat.com>".to_string())
2040        );
2041    }
2042
2043    #[test]
2044    fn test_extract_named_author_from_binary_line_recovers_parenthesized_email() {
2045        assert_eq!(
2046            extract_named_author_from_binary_line(
2047                "same for both OpenSSL and NSS by Rob Crittenden (rcritten@redhat.com)"
2048            ),
2049            Some("Rob Crittenden (rcritten@redhat.com)".to_string())
2050        );
2051    }
2052
2053    #[test]
2054    fn test_extract_named_author_from_binary_line_rejects_plain_changelog_packager_line() {
2055        assert_eq!(
2056            extract_named_author_from_binary_line(
2057                "Rob Crittenden <rcritten@redhat.com> - 3.11.7-9"
2058            ),
2059            None
2060        );
2061    }
2062
2063    #[test]
2064    fn test_extract_named_author_from_binary_line_keeps_email_only_review_author() {
2065        assert_eq!(
2066            extract_named_author_from_binary_line(
2067                "Changes as per initial review by panemade@gmail.com"
2068            ),
2069            Some("panemade@gmail.com".to_string())
2070        );
2071    }
2072
2073    #[test]
2074    fn test_binary_string_author_candidate_rejects_multiple_emails_on_one_line() {
2075        assert!(!is_binary_string_author_candidate(
2076            "Rob Crittenden (rcritten@redhat.com) jakub@redhat.com"
2077        ));
2078    }
2079
2080    #[test]
2081    fn test_compute_percentage_of_license_text_counts_unknown_tokens() {
2082        let index = create_test_index(&[("alpha", 0), ("mit", 1)], 2);
2083        let text = "alpha MIT omega";
2084        let query = Query::from_extracted_text(text, &index, false).expect("query should build");
2085        let mut detection = make_detection("");
2086        detection.matches[0].coordinates =
2087            MatchCoordinates::query_region(PositionSpan::from_positions(vec![1]));
2088        detection.matches[0].start_token = 1;
2089        detection.matches[0].end_token = 2;
2090
2091        let percentage = compute_percentage_of_license_text(&query, &[detection]);
2092
2093        assert_eq!(percentage, 33.33);
2094    }
2095
2096    #[test]
2097    fn test_scan_options_fingerprint_changes_with_license_score() {
2098        let text_options = crate::scanner::TextDetectionOptions::default();
2099        let default_fingerprint = scan_options_fingerprint(
2100            &text_options,
2101            LicenseScanOptions {
2102                min_score: 0,
2103                ..LicenseScanOptions::default()
2104            },
2105            None,
2106        );
2107        let filtered_fingerprint = scan_options_fingerprint(
2108            &text_options,
2109            LicenseScanOptions {
2110                min_score: 70,
2111                ..LicenseScanOptions::default()
2112            },
2113            None,
2114        );
2115
2116        assert_ne!(default_fingerprint, filtered_fingerprint);
2117    }
2118
2119    #[test]
2120    fn test_is_go_non_production_source_for_test_filename() {
2121        let temp_dir = tempdir().unwrap();
2122        let path = temp_dir.path().join("scanner_test.go");
2123        fs::write(&path, "package scanner\n").unwrap();
2124
2125        assert!(is_go_non_production_source(&path).unwrap());
2126    }
2127
2128    #[test]
2129    fn test_is_go_non_production_source_for_build_tag() {
2130        let temp_dir = tempdir().unwrap();
2131        let path = temp_dir.path().join("scanner.go");
2132        fs::write(&path, "//go:build test\n\npackage scanner\n").unwrap();
2133
2134        assert!(is_go_non_production_source(&path).unwrap());
2135    }
2136
2137    #[test]
2138    fn test_is_go_non_production_source_for_regular_go_file() {
2139        let temp_dir = tempdir().unwrap();
2140        let path = temp_dir.path().join("scanner.go");
2141        fs::write(&path, "package scanner\n").unwrap();
2142
2143        assert!(!is_go_non_production_source(&path).unwrap());
2144    }
2145}
provenant/scanner/process.rs

provenant/scanner/
process.rs