Skip to main content

provenant/scanner/
process.rs

1use crate::license_detection::LicenseDetectionEngine;
2use crate::parsers::compiled_binary::{
3    is_supported_compiled_binary_format, try_parse_compiled_bytes,
4};
5use crate::parsers::try_parse_file;
6use crate::parsers::windows_executable::try_parse_windows_executable_bytes;
7use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha1_git, calculate_sha256};
8use crate::utils::text::{
9    remove_verbatim_escape_sequences, should_remove_verbatim_escape_sequences,
10};
11use anyhow::Error;
12use rayon::prelude::*;
13use std::collections::HashSet;
14use std::fs::{self, File};
15use std::io::{Read, Write};
16use std::path::Path;
17use std::sync::Arc;
18use std::time::{Duration, Instant};
19
20use crate::copyright::{
21    self, AuthorDetection, CopyrightDetection, CopyrightDetectionOptions, HolderDetection,
22};
23use crate::finder::{self, DetectionConfig};
24use crate::license_detection::PositionSet;
25use crate::license_detection::models::LicenseMatch as InternalLicenseMatch;
26use crate::license_detection::query::Query;
27use crate::models::{
28    Author, Copyright, DatasourceId, FileInfo, FileInfoBuilder, FileType, Holder, LicenseDetection,
29    LineNumber, Match, OutputEmail, OutputURL, Sha256Digest,
30};
31use crate::parsers::utils::split_name_email;
32use crate::progress::ScanProgress;
33use crate::scanner::collect::CollectedPaths;
34use crate::scanner::{LicenseScanOptions, ProcessResult, TextDetectionOptions};
35use crate::utils::file::{
36    ExtractedTextKind, augment_license_detection_text, classify_file_info,
37    extract_text_for_detection_with_diagnostics, get_creation_date,
38};
39use crate::utils::generated::generated_code_hints_from_bytes;
40use tempfile::TempDir;
41
/// Begin/end marker pairs that delimit PEM-encoded certificate blocks.
///
/// Each entry is a `(begin_marker, end_marker)` tuple; the table lists the
/// plain `CERTIFICATE` form and the `TRUSTED CERTIFICATE` form.
const PEM_CERTIFICATE_HEADERS: &[(&str, &str)] = &[
    ("-----BEGIN CERTIFICATE-----", "-----END CERTIFICATE-----"),
    (
        "-----BEGIN TRUSTED CERTIFICATE-----",
        "-----END TRUSTED CERTIFICATE-----",
    ),
];
49
50pub fn process_collected(
51    collected: &CollectedPaths,
52    progress: Arc<ScanProgress>,
53    license_engine: Option<Arc<LicenseDetectionEngine>>,
54    license_options: LicenseScanOptions,
55    text_options: &TextDetectionOptions,
56) -> ProcessResult {
57    let mut all_files: Vec<FileInfo> = collected
58        .files
59        .par_iter()
60        .map(|(path, metadata)| {
61            let file_entry = process_file(
62                path,
63                metadata,
64                progress.as_ref(),
65                license_engine.clone(),
66                license_options,
67                text_options,
68            );
69            progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
70            file_entry
71        })
72        .collect();
73
74    for (path, metadata) in &collected.directories {
75        all_files.push(process_directory(
76            path,
77            metadata,
78            text_options.collect_info,
79            license_engine.is_some(),
80        ));
81    }
82
83    ProcessResult {
84        files: all_files,
85        excluded_count: collected.excluded_count,
86    }
87}
88
89pub fn process_collected_sequential(
90    collected: &CollectedPaths,
91    progress: Arc<ScanProgress>,
92    license_engine: Option<Arc<LicenseDetectionEngine>>,
93    license_options: LicenseScanOptions,
94    text_options: &TextDetectionOptions,
95) -> ProcessResult {
96    let mut all_files: Vec<FileInfo> =
97        Vec::with_capacity(collected.files.len() + collected.directories.len());
98
99    for (path, metadata) in &collected.files {
100        let file_entry = process_file(
101            path,
102            metadata,
103            progress.as_ref(),
104            license_engine.clone(),
105            license_options,
106            text_options,
107        );
108        progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
109        all_files.push(file_entry);
110    }
111
112    for (path, metadata) in &collected.directories {
113        all_files.push(process_directory(
114            path,
115            metadata,
116            text_options.collect_info,
117            license_engine.is_some(),
118        ));
119    }
120
121    ProcessResult {
122        files: all_files,
123        excluded_count: collected.excluded_count,
124    }
125}
126
127pub fn process_collected_with_memory_limit(
128    collected: &CollectedPaths,
129    progress: Arc<ScanProgress>,
130    license_engine: Option<Arc<LicenseDetectionEngine>>,
131    license_options: LicenseScanOptions,
132    text_options: &TextDetectionOptions,
133    max_in_memory: i64,
134) -> ProcessResult {
135    if max_in_memory == 0 {
136        return process_collected(
137            collected,
138            progress,
139            license_engine,
140            license_options,
141            text_options,
142        );
143    }
144
145    let memory_limit = if max_in_memory < 0 {
146        0
147    } else {
148        max_in_memory as usize
149    };
150    let chunk_size = if max_in_memory < 0 {
151        256
152    } else {
153        memory_limit.max(1)
154    };
155
156    let mut retained_files = Vec::new();
157    let mut spill_store = None;
158
159    for chunk in collected.files.chunks(chunk_size) {
160        let processed_chunk: Vec<FileInfo> = chunk
161            .par_iter()
162            .map(|(path, metadata)| {
163                let file_entry = process_file(
164                    path,
165                    metadata,
166                    progress.as_ref(),
167                    license_engine.clone(),
168                    license_options,
169                    text_options,
170                );
171                progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
172                file_entry
173            })
174            .collect();
175
176        retain_or_spill_chunk(
177            processed_chunk,
178            &mut retained_files,
179            &mut spill_store,
180            memory_limit,
181        );
182    }
183
184    for (path, metadata) in &collected.directories {
185        let entry = process_directory(
186            path,
187            metadata,
188            text_options.collect_info,
189            license_engine.is_some(),
190        );
191        retain_or_spill_chunk(
192            vec![entry],
193            &mut retained_files,
194            &mut spill_store,
195            memory_limit,
196        );
197    }
198
199    if let Some(spill_store) = spill_store {
200        retained_files.extend(spill_store.load_all());
201    }
202
203    ProcessResult {
204        files: retained_files,
205        excluded_count: collected.excluded_count,
206    }
207}
208
209pub fn process_collected_with_memory_limit_sequential(
210    collected: &CollectedPaths,
211    progress: Arc<ScanProgress>,
212    license_engine: Option<Arc<LicenseDetectionEngine>>,
213    license_options: LicenseScanOptions,
214    text_options: &TextDetectionOptions,
215    max_in_memory: i64,
216) -> ProcessResult {
217    if max_in_memory == 0 {
218        return process_collected_sequential(
219            collected,
220            progress,
221            license_engine,
222            license_options,
223            text_options,
224        );
225    }
226
227    let memory_limit = if max_in_memory < 0 {
228        0
229    } else {
230        max_in_memory as usize
231    };
232    let chunk_size = if max_in_memory < 0 {
233        256
234    } else {
235        memory_limit.max(1)
236    };
237
238    let mut retained_files = Vec::new();
239    let mut spill_store = None;
240
241    for chunk in collected.files.chunks(chunk_size) {
242        let mut processed_chunk: Vec<FileInfo> = Vec::with_capacity(chunk.len());
243        for (path, metadata) in chunk {
244            let file_entry = process_file(
245                path,
246                metadata,
247                progress.as_ref(),
248                license_engine.clone(),
249                license_options,
250                text_options,
251            );
252            progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
253            processed_chunk.push(file_entry);
254        }
255
256        retain_or_spill_chunk(
257            processed_chunk,
258            &mut retained_files,
259            &mut spill_store,
260            memory_limit,
261        );
262    }
263
264    for (path, metadata) in &collected.directories {
265        let entry = process_directory(
266            path,
267            metadata,
268            text_options.collect_info,
269            license_engine.is_some(),
270        );
271        retain_or_spill_chunk(
272            vec![entry],
273            &mut retained_files,
274            &mut spill_store,
275            memory_limit,
276        );
277    }
278
279    if let Some(spill_store) = spill_store {
280        retained_files.extend(spill_store.load_all());
281    }
282
283    ProcessResult {
284        files: retained_files,
285        excluded_count: collected.excluded_count,
286    }
287}
288
289fn retain_or_spill_chunk(
290    chunk: Vec<FileInfo>,
291    retained_files: &mut Vec<FileInfo>,
292    spill_store: &mut Option<FileInfoSpillStore>,
293    memory_limit: usize,
294) {
295    if memory_limit == 0 {
296        spill_store
297            .get_or_insert_with(FileInfoSpillStore::new)
298            .spill(chunk);
299        return;
300    }
301
302    let remaining_capacity = memory_limit.saturating_sub(retained_files.len());
303    if remaining_capacity >= chunk.len() && spill_store.is_none() {
304        retained_files.extend(chunk);
305        return;
306    }
307
308    let mut chunk_iter = chunk.into_iter();
309    retained_files.extend(chunk_iter.by_ref().take(remaining_capacity));
310    let overflow: Vec<FileInfo> = chunk_iter.collect();
311    if !overflow.is_empty() {
312        spill_store
313            .get_or_insert_with(FileInfoSpillStore::new)
314            .spill(overflow);
315    }
316}
317
318struct FileInfoSpillStore {
319    temp_dir: TempDir,
320    batch_index: usize,
321}
322
323impl FileInfoSpillStore {
324    fn new() -> Self {
325        Self {
326            temp_dir: TempDir::new().expect("create spill dir"),
327            batch_index: 0,
328        }
329    }
330
331    fn spill(&mut self, files: Vec<FileInfo>) {
332        let path = self
333            .temp_dir
334            .path()
335            .join(format!("batch-{:06}.json.zst", self.batch_index));
336        self.batch_index += 1;
337
338        let payload = serde_json::to_vec(&files).expect("encode spilled file batch");
339        let file = File::create(path).expect("create spill batch file");
340        let mut encoder = zstd::Encoder::new(file, 3).expect("create spill encoder");
341        encoder
342            .write_all(&payload)
343            .expect("write spilled file batch");
344        encoder.finish().expect("finish spill encoder");
345    }
346
347    fn load_all(self) -> Vec<FileInfo> {
348        let mut paths: Vec<_> = fs::read_dir(self.temp_dir.path())
349            .expect("read spill dir")
350            .filter_map(Result::ok)
351            .map(|entry| entry.path())
352            .collect();
353        paths.sort();
354
355        let mut files = Vec::new();
356        for path in paths {
357            let file = File::open(path).expect("open spill batch");
358            let mut decoder = zstd::Decoder::new(file).expect("create spill decoder");
359            let mut payload = Vec::new();
360            decoder.read_to_end(&mut payload).expect("read spill batch");
361            let mut batch: Vec<FileInfo> =
362                serde_json::from_slice(&payload).expect("decode spilled file batch");
363            files.append(&mut batch);
364        }
365        files
366    }
367}
368
369fn process_file(
370    path: &Path,
371    metadata: &fs::Metadata,
372    progress: &ScanProgress,
373    license_engine: Option<Arc<LicenseDetectionEngine>>,
374    license_options: LicenseScanOptions,
375    text_options: &TextDetectionOptions,
376) -> FileInfo {
377    let mut scan_errors: Vec<String> = vec![];
378    let mut file_info_builder = FileInfoBuilder::default();
379    let license_enabled = license_engine.is_some();
380
381    let started = Instant::now();
382
383    let mut generated_flag = None;
384    let mut is_source_file = false;
385    match extract_information_from_content(
386        &mut file_info_builder,
387        &mut scan_errors,
388        path,
389        progress,
390        license_engine,
391        license_options,
392        text_options,
393    ) {
394        Ok((is_generated, sha256, is_source)) => {
395            generated_flag = is_generated;
396            is_source_file = is_source;
397            let _ = sha256;
398        }
399        Err(e) => scan_errors.push(e.to_string()),
400    };
401
402    maybe_record_processing_timeout(&mut scan_errors, started, text_options.timeout_seconds);
403
404    let mut file_info = file_info_builder
405        .name(path.file_name().unwrap().to_string_lossy().to_string())
406        .base_name(
407            path.file_stem()
408                .unwrap_or_default()
409                .to_string_lossy()
410                .to_string(),
411        )
412        .extension(
413            path.extension()
414                .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
415        )
416        .path(path.to_string_lossy().to_string())
417        .file_type(FileType::File)
418        .size(metadata.len())
419        .date(
420            text_options
421                .collect_info
422                .then(|| get_creation_date(metadata))
423                .flatten(),
424        )
425        .scan_errors(scan_errors)
426        .build()
427        .expect("FileInformationBuild not completely initialized");
428
429    if text_options.collect_info {
430        file_info.is_source = Some(is_source_file);
431    }
432
433    if file_info.programming_language.as_deref() == Some("Go")
434        && is_go_non_production_source(path).unwrap_or(false)
435    {
436        file_info.is_source = Some(false);
437    }
438
439    if text_options.detect_generated {
440        file_info.is_generated = Some(generated_flag.unwrap_or(false));
441    }
442
443    if file_info.percentage_of_license_text.is_none() && license_enabled {
444        file_info.percentage_of_license_text = Some(0.0);
445    }
446
447    file_info
448}
449
450fn extract_information_from_content(
451    file_info_builder: &mut FileInfoBuilder,
452    scan_errors: &mut Vec<String>,
453    path: &Path,
454    progress: &ScanProgress,
455    license_engine: Option<Arc<LicenseDetectionEngine>>,
456    license_options: LicenseScanOptions,
457    text_options: &TextDetectionOptions,
458) -> Result<(Option<bool>, Sha256Digest, bool), Error> {
459    let started = Instant::now();
460    let filesystem_path = absolute_filesystem_path(path);
461    let buffer = fs::read(&filesystem_path)?;
462    let license_enabled = license_engine.is_some();
463
464    if is_timeout_exceeded(started, text_options.timeout_seconds) {
465        return Err(Error::msg(format!(
466            "Timeout while reading file content (> {:.2}s)",
467            text_options.timeout_seconds
468        )));
469    }
470
471    let sha256 = calculate_sha256(&buffer);
472    let is_generated = text_options
473        .detect_generated
474        .then(|| !generated_code_hints_from_bytes(&buffer).is_empty());
475    let classification = classify_file_info(&filesystem_path, &buffer);
476
477    if text_options.collect_info {
478        file_info_builder
479            .sha1(Some(calculate_sha1(&buffer)))
480            .md5(Some(calculate_md5(&buffer)))
481            .sha256(Some(sha256))
482            .programming_language(classification.programming_language.clone())
483            .mime_type(Some(classification.mime_type.clone()))
484            .file_type_label(Some(classification.file_type.clone()))
485            .sha1_git(Some(calculate_sha1_git(&buffer)))
486            .is_binary(Some(classification.is_binary))
487            .is_text(Some(classification.is_text))
488            .is_archive(Some(classification.is_archive))
489            .is_media(Some(classification.is_media))
490            .is_source(Some(classification.is_source))
491            .is_script(Some(classification.is_script))
492            .files_count(Some(0))
493            .dirs_count(Some(0))
494            .size_count(Some(0));
495    }
496
497    if should_skip_text_detection(&filesystem_path, &buffer) {
498        return Ok((is_generated, sha256, classification.is_source));
499    }
500
501    // Package parsing and text-based detection (copyright, license) are independent.
502    // Python ScanCode runs all enabled plugins on every file, so we do the same.
503    if text_options.detect_packages {
504        let started = Instant::now();
505        let parse_result = try_parse_file(&filesystem_path)
506            .or_else(|| {
507                text_options
508                    .detect_application_packages
509                    .then(|| try_parse_windows_executable_bytes(&filesystem_path, &buffer))
510                    .flatten()
511            })
512            .or_else(|| {
513                text_options
514                    .detect_packages_in_compiled
515                    .then(|| {
516                        (classification.is_binary && is_supported_compiled_binary_format(&buffer))
517                            .then(|| try_parse_compiled_bytes(&buffer))
518                            .flatten()
519                    })
520                    .flatten()
521            });
522
523        if let Some(parse_result) = parse_result {
524            let packages = parse_result
525                .packages
526                .into_iter()
527                .filter(|package| {
528                    let is_compiled_package = package
529                        .datasource_id
530                        .as_ref()
531                        .is_some_and(is_compiled_datasource);
532                    let is_system_package = package
533                        .datasource_id
534                        .as_ref()
535                        .is_some_and(is_system_datasource);
536                    if is_compiled_package {
537                        text_options.detect_packages_in_compiled
538                    } else if is_system_package {
539                        text_options.detect_system_packages
540                    } else {
541                        text_options.detect_application_packages
542                    }
543                })
544                .collect();
545            file_info_builder.package_data(packages);
546            scan_errors.extend(parse_result.scan_errors);
547        }
548        progress.record_detail_timing("scan:packages", started.elapsed().as_secs_f64());
549    }
550
551    if is_timeout_exceeded(started, text_options.timeout_seconds) {
552        return Err(Error::msg(format!(
553            "Timeout while extracting package/text metadata (> {:.2}s)",
554            text_options.timeout_seconds
555        )));
556    }
557
558    let (text_content, text_kind, text_scan_error) =
559        extract_text_for_detection_with_diagnostics(&filesystem_path, &buffer);
560    if let Some(text_scan_error) = text_scan_error {
561        scan_errors.push(text_scan_error);
562    }
563    let from_binary_strings = matches!(text_kind, ExtractedTextKind::BinaryStrings);
564
565    if is_timeout_exceeded(started, text_options.timeout_seconds) {
566        return Err(Error::msg(format!(
567            "Timeout while extracting text content (> {:.2}s)",
568            text_options.timeout_seconds
569        )));
570    }
571
572    if text_content.is_empty() {
573        return Ok((is_generated, sha256, classification.is_source));
574    }
575
576    if text_options.detect_copyrights {
577        extract_copyright_information(
578            file_info_builder,
579            path,
580            &text_content,
581            text_options.timeout_seconds,
582            from_binary_strings,
583        );
584    }
585    extract_email_url_information(
586        file_info_builder,
587        &text_content,
588        text_options,
589        from_binary_strings,
590    );
591
592    if is_timeout_exceeded(started, text_options.timeout_seconds) {
593        return Err(Error::msg(format!(
594            "Timeout before license scan (> {:.2}s)",
595            text_options.timeout_seconds
596        )));
597    }
598    // Handle source map files specially
599    let text_content_for_license_detection = if crate::utils::sourcemap::is_sourcemap(path) {
600        if let Some(sourcemap_content) =
601            crate::utils::sourcemap::extract_sourcemap_content(&text_content)
602        {
603            sourcemap_content
604        } else {
605            text_content
606        }
607    } else if should_remove_verbatim_escape_sequences(path, classification.is_source) {
608        remove_verbatim_escape_sequences(&text_content)
609    } else {
610        text_content
611    };
612    let text_content_for_license_detection =
613        augment_license_detection_text(path, &text_content_for_license_detection);
614    let text_content_for_license_detection = text_content_for_license_detection.into_owned();
615
616    if license_enabled {
617        let started = Instant::now();
618        extract_license_information(
619            file_info_builder,
620            scan_errors,
621            &filesystem_path,
622            text_content_for_license_detection.clone(),
623            license_engine,
624            license_options,
625            from_binary_strings,
626        )?;
627        progress.record_detail_timing("scan:licenses", started.elapsed().as_secs_f64());
628    } else {
629        extract_license_information(
630            file_info_builder,
631            scan_errors,
632            &filesystem_path,
633            text_content_for_license_detection,
634            license_engine,
635            license_options,
636            from_binary_strings,
637        )?;
638    }
639
640    Ok((is_generated, sha256, classification.is_source))
641}
642
/// Resolves `path` against the current working directory.
///
/// Absolute paths are returned unchanged. Relative paths are joined onto the
/// current directory; if the current directory cannot be determined the path
/// is returned as given.
fn absolute_filesystem_path(path: &Path) -> std::path::PathBuf {
    if !path.is_absolute() {
        if let Ok(working_dir) = std::env::current_dir() {
            return working_dir.join(path);
        }
    }

    path.to_path_buf()
}
652
/// Returns `true` when more than `timeout_seconds` have elapsed since `started`.
///
/// A timeout that is not finite or not strictly positive disables the check,
/// so the function returns `false` for it.
fn is_timeout_exceeded(started: Instant, timeout_seconds: f64) -> bool {
    let timeout_disabled = !timeout_seconds.is_finite() || timeout_seconds <= 0.0;
    if timeout_disabled {
        return false;
    }

    started.elapsed().as_secs_f64() > timeout_seconds
}
658
659fn maybe_record_processing_timeout(
660    scan_errors: &mut Vec<String>,
661    started: Instant,
662    timeout_seconds: f64,
663) {
664    if is_timeout_exceeded(started, timeout_seconds)
665        && !scan_errors.iter().any(|error| is_timeout_scan_error(error))
666    {
667        scan_errors.push(format!(
668            "Processing interrupted due to timeout after {:.2} seconds",
669            timeout_seconds
670        ));
671    }
672}
673
/// Returns `true` when `error` contains one of the timeout message fragments
/// used by this module (matching is case-sensitive).
fn is_timeout_scan_error(error: &str) -> bool {
    const TIMEOUT_FRAGMENTS: [&str; 3] = [
        "Timeout while ",
        "Timeout before ",
        "Processing interrupted due to timeout",
    ];

    TIMEOUT_FRAGMENTS
        .iter()
        .any(|fragment| error.contains(*fragment))
}
679
680fn is_system_datasource(datasource_id: &DatasourceId) -> bool {
681    matches!(
682        datasource_id,
683        DatasourceId::AlpineInstalledDb
684            | DatasourceId::DebianDistrolessInstalledDb
685            | DatasourceId::DebianInstalledFilesList
686            | DatasourceId::DebianInstalledMd5Sums
687            | DatasourceId::DebianInstalledStatusDb
688            | DatasourceId::FreebsdCompactManifest
689            | DatasourceId::RpmInstalledDatabaseBdb
690            | DatasourceId::RpmInstalledDatabaseNdb
691            | DatasourceId::RpmInstalledDatabaseSqlite
692            | DatasourceId::RpmYumdb
693    )
694}
695
696fn is_compiled_datasource(datasource_id: &DatasourceId) -> bool {
697    matches!(
698        datasource_id,
699        DatasourceId::GoBinary | DatasourceId::RustBinary
700    )
701}
702
703fn extract_copyright_information(
704    file_info_builder: &mut FileInfoBuilder,
705    path: &Path,
706    text_content: &str,
707    timeout_seconds: f64,
708    from_binary_strings: bool,
709) {
710    // CREDITS files get special handling (Linux kernel style).
711    if copyright::is_credits_file(path) {
712        let author_detections = copyright::detect_credits_authors(text_content);
713        if !author_detections.is_empty() {
714            file_info_builder.authors(
715                author_detections
716                    .into_iter()
717                    .map(|a| Author {
718                        author: a.author,
719                        start_line: a.start_line,
720                        end_line: a.end_line,
721                    })
722                    .collect(),
723            );
724            return;
725        }
726    }
727
728    let copyright_options = CopyrightDetectionOptions {
729        max_runtime: if timeout_seconds.is_finite() && timeout_seconds > 0.0 {
730            Some(Duration::from_secs_f64(timeout_seconds))
731        } else {
732            None
733        },
734        ..CopyrightDetectionOptions::default()
735    };
736
737    let (copyrights, holders, authors) =
738        copyright::detect_copyrights_with_options(text_content, &copyright_options);
739    let (copyrights, holders, authors) = if from_binary_strings {
740        prune_binary_string_detections(text_content, copyrights, holders, authors)
741    } else {
742        (copyrights, holders, authors)
743    };
744
745    file_info_builder.copyrights(
746        copyrights
747            .into_iter()
748            .map(|c| Copyright {
749                copyright: c.copyright,
750                start_line: c.start_line,
751                end_line: c.end_line,
752            })
753            .collect::<Vec<Copyright>>(),
754    );
755    file_info_builder.holders(
756        holders
757            .into_iter()
758            .map(|h| Holder {
759                holder: h.holder,
760                start_line: h.start_line,
761                end_line: h.end_line,
762            })
763            .collect::<Vec<Holder>>(),
764    );
765    file_info_builder.authors(
766        authors
767            .into_iter()
768            .map(|a| Author {
769                author: a.author,
770                start_line: a.start_line,
771                end_line: a.end_line,
772            })
773            .collect::<Vec<Author>>(),
774    );
775}
776
777fn prune_binary_string_detections(
778    text_content: &str,
779    copyrights: Vec<CopyrightDetection>,
780    holders: Vec<HolderDetection>,
781    authors: Vec<AuthorDetection>,
782) -> (
783    Vec<CopyrightDetection>,
784    Vec<HolderDetection>,
785    Vec<AuthorDetection>,
786) {
787    let kept_copyrights: Vec<CopyrightDetection> = copyrights
788        .into_iter()
789        .filter(|c| is_binary_string_copyright_candidate(&c.copyright))
790        .collect();
791
792    let kept_holders: Vec<HolderDetection> = holders
793        .into_iter()
794        .filter(|holder| {
795            kept_copyrights.iter().any(|copyright| {
796                ranges_overlap(
797                    holder.start_line,
798                    holder.end_line,
799                    copyright.start_line,
800                    copyright.end_line,
801                )
802            })
803        })
804        .collect();
805
806    let kept_authors = authors
807        .into_iter()
808        .filter(|author| is_binary_string_author_candidate(&author.author))
809        .chain(extract_binary_string_author_supplements(text_content))
810        .filter({
811            let mut seen = HashSet::new();
812            move |author| seen.insert(author.author.clone())
813        })
814        .collect();
815
816    (kept_copyrights, kept_holders, kept_authors)
817}
818
819fn ranges_overlap(
820    a_start: LineNumber,
821    a_end: LineNumber,
822    b_start: LineNumber,
823    b_end: LineNumber,
824) -> bool {
825    a_start <= b_end && b_start <= a_end
826}
827
828fn is_binary_string_copyright_candidate(text: &str) -> bool {
829    if contains_year(text) {
830        return true;
831    }
832
833    let trimmed = text.trim();
834    let lower = trimmed.to_ascii_lowercase();
835    let tail = if let Some(tail) = lower.strip_prefix("copyright") {
836        tail.trim()
837    } else {
838        lower.trim()
839    };
840    let original_tail = if lower.starts_with("copyright") {
841        trimmed["copyright".len()..].trim()
842    } else {
843        trimmed
844    };
845
846    if tail.is_empty() || !has_sufficient_alphabetic_content(tail) || has_excessive_at_noise(tail) {
847        return false;
848    }
849
850    let alpha_tokens: Vec<&str> = tail
851        .split_whitespace()
852        .filter(|token| token.chars().any(|c| c.is_alphabetic()))
853        .collect();
854
855    if alpha_tokens.len() <= 1 {
856        return has_explicit_copyright_marker(text)
857            && alpha_tokens.iter().any(|token| {
858                is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric()))
859            });
860    }
861
862    if !has_explicit_copyright_marker(text) {
863        return false;
864    }
865
866    has_binary_name_like_shape(original_tail)
867}
868
869fn extract_binary_string_author_supplements(text_content: &str) -> Vec<AuthorDetection> {
870    let mut authors = Vec::new();
871
872    for (line_index, line) in text_content.lines().enumerate() {
873        if let Some(author) = extract_named_author_from_binary_line(line) {
874            authors.push(AuthorDetection {
875                author,
876                start_line: LineNumber::from_0_indexed(line_index),
877                end_line: LineNumber::from_0_indexed(line_index),
878            });
879        }
880    }
881
882    authors
883}
884
885fn extract_named_author_from_binary_line(line: &str) -> Option<String> {
886    let line = line.trim();
887    if line.is_empty() {
888        return None;
889    }
890
891    let emails = finder::find_emails(
892        line,
893        &DetectionConfig {
894            max_emails: 4,
895            max_urls: 0,
896            unique: false,
897        },
898    );
899    let email = emails.first()?.email.as_str();
900    if !is_binary_string_email_candidate(email) {
901        return None;
902    }
903
904    let lower_line = line.to_ascii_lowercase();
905    let email_start = lower_line.find(email)?;
906    let raw_prefix = &line[..email_start];
907    let has_author_marker = contains_binary_author_marker(raw_prefix);
908    let prefix = take_suffix_after_last_author_marker(raw_prefix)?;
909    let prefix = prefix
910        .trim_start_matches(['*', '-', ':', ';', ',', '.', ' '])
911        .trim_end_matches(['<', '(', '[', ' ', ':', '-'])
912        .trim();
913
914    let (name, _) = split_name_email(prefix);
915    let name = name.or_else(|| {
916        let trimmed = prefix.trim_matches(|c: char| c == '<' || c == '(' || c == '[' || c == ' ');
917        (!trimmed.is_empty()).then(|| trimmed.to_string())
918    });
919
920    let Some(name) = name.map(|name| name.trim().to_string()) else {
921        if has_author_marker {
922            return Some(email.to_string());
923        }
924        return None;
925    };
926
927    if name.is_empty() && has_author_marker {
928        return Some(email.to_string());
929    }
930
931    if !has_binary_name_like_shape(&name) {
932        return None;
933    }
934
935    if line.contains(&format!("<{email}>")) {
936        Some(format!("{name} <{email}>"))
937    } else if line.contains(&format!("({email})")) {
938        Some(format!("{name} ({email})"))
939    } else {
940        Some(format!("{name} {email}"))
941    }
942}
943
/// Returns the trimmed text following the last occurrence of `marker`,
/// matching ASCII case-insensitively on the `text` side.
///
/// Only `text` is lowercased before searching, so `marker` must already be
/// lowercase to match. Returns `None` when the marker does not occur.
fn take_suffix_after_last_ascii_marker<'a>(text: &'a str, marker: &str) -> Option<&'a str> {
    // ASCII lowercasing preserves byte offsets, so the position found in the
    // lowered copy is valid for slicing the original text.
    text.to_ascii_lowercase()
        .rfind(marker)
        .map(|start| text[start + marker.len()..].trim())
}
949
950fn take_suffix_after_last_author_marker(text: &str) -> Option<&str> {
951    const MARKERS: &[&str] = &[
952        " patch author: ",
953        " patch author ",
954        " written by ",
955        " contributed by ",
956        " original work done by ",
957        " work done by ",
958        " thanks to ",
959        " review by ",
960        " by ",
961        " from ",
962    ];
963
964    MARKERS
965        .iter()
966        .filter_map(|marker| take_suffix_after_last_ascii_marker(text, marker))
967        .next()
968}
969
970fn contains_binary_author_marker(text: &str) -> bool {
971    take_suffix_after_last_author_marker(text).is_some()
972}
973
974fn has_binary_name_like_shape(text: &str) -> bool {
975    let trimmed = text.trim();
976    if trimmed.is_empty() || trimmed.contains(" - ") || trimmed.chars().any(|c| c.is_ascii_digit())
977    {
978        return false;
979    }
980
981    let tokens: Vec<&str> = trimmed
982        .split(|c: char| !c.is_ascii_alphabetic() && c != '.' && c != '\'')
983        .filter(|segment| segment.chars().any(|c| c.is_ascii_alphabetic()))
984        .collect();
985    if tokens.is_empty() {
986        return false;
987    }
988
989    let uppercase_like = tokens
990        .iter()
991        .filter(|token| {
992            let token = token.trim_matches('.');
993            token
994                .chars()
995                .find(|c| c.is_ascii_alphabetic())
996                .is_some_and(|c| c.is_ascii_uppercase())
997        })
998        .count();
999
1000    uppercase_like >= 2 && uppercase_like * 2 >= tokens.len()
1001        || tokens
1002            .iter()
1003            .any(|token| is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric())))
1004}
1005
/// Returns `true` when `text` contains at least one ASCII alphanumeric
/// character and ASCII letters make up at least half of those characters.
fn has_sufficient_alphabetic_content(text: &str) -> bool {
    // Tally letters and alphanumerics in one pass.
    let (letters, alphanumerics) = text.chars().fold((0usize, 0usize), |(alpha, alnum), c| {
        if c.is_ascii_alphabetic() {
            (alpha + 1, alnum + 1)
        } else if c.is_ascii_digit() {
            (alpha, alnum + 1)
        } else {
            (alpha, alnum)
        }
    });

    alphanumerics != 0 && letters * 2 >= alphanumerics
}
1015
/// Returns `true` when `text` contains three or more `@` characters.
fn has_excessive_at_noise(text: &str) -> bool {
    const NOISE_THRESHOLD: usize = 3;
    text.matches('@').count() >= NOISE_THRESHOLD
}
1019
/// Returns `true` when `text` contains `(c)`, `©` or `copr`, ignoring ASCII
/// case.
fn has_explicit_copyright_marker(text: &str) -> bool {
    const MARKERS: [&str; 3] = ["(c)", "©", "copr"];

    let folded = text.to_ascii_lowercase();
    MARKERS.iter().any(|marker| folded.contains(*marker))
}
1024
/// Returns `true` when `text` contains four consecutive ASCII digits that
/// form a year in the range 1900-2099.
///
/// The century prefix is matched as a pair (`19` or `20`). Checking the first
/// and second digits independently would also accept `10xx` and `29xx`.
/// The four digits may sit inside a longer run of digits.
fn contains_year(text: &str) -> bool {
    text.as_bytes().windows(4).any(|window| {
        matches!(
            window,
            [b'1', b'9', decade, unit] | [b'2', b'0', decade, unit]
                if decade.is_ascii_digit() && unit.is_ascii_digit()
        )
    })
}
1033
/// Returns `true` when `token` equals, ignoring ASCII case, one of the
/// organisation words listed below.
fn is_company_like_suffix(token: &str) -> bool {
    const COMPANY_WORDS: [&str; 14] = [
        "inc",
        "corp",
        "corporation",
        "co",
        "company",
        "ltd",
        "llc",
        "gmbh",
        "foundation",
        "project",
        "systems",
        "software",
        "technologies",
        "technology",
    ];

    COMPANY_WORDS
        .iter()
        .any(|word| token.eq_ignore_ascii_case(word))
}
1053
1054fn extract_email_url_information(
1055    file_info_builder: &mut FileInfoBuilder,
1056    text_content: &str,
1057    text_options: &TextDetectionOptions,
1058    from_binary_strings: bool,
1059) {
1060    if !text_options.detect_emails && !text_options.detect_urls {
1061        return;
1062    }
1063
1064    if text_options.detect_emails {
1065        let config = DetectionConfig {
1066            max_emails: text_options.max_emails,
1067            max_urls: text_options.max_urls,
1068            unique: from_binary_strings,
1069        };
1070        let emails = finder::find_emails(text_content, &config)
1071            .into_iter()
1072            .filter(|d| !from_binary_strings || is_binary_string_email_candidate(&d.email))
1073            .map(|d| OutputEmail {
1074                email: d.email,
1075                start_line: d.start_line,
1076                end_line: d.end_line,
1077            })
1078            .collect::<Vec<_>>();
1079        file_info_builder.emails(emails);
1080    }
1081
1082    if text_options.detect_urls {
1083        let config = DetectionConfig {
1084            max_emails: text_options.max_emails,
1085            max_urls: text_options.max_urls,
1086            unique: true,
1087        };
1088        let urls = finder::find_urls(text_content, &config)
1089            .into_iter()
1090            .filter(|d| !from_binary_strings || is_binary_string_url_candidate(&d.url))
1091            .map(|d| OutputURL {
1092                url: d.url,
1093                start_line: d.start_line,
1094                end_line: d.end_line,
1095            })
1096            .collect::<Vec<_>>();
1097        file_info_builder.urls(urls);
1098    }
1099}
1100
1101fn is_binary_string_email_candidate(email: &str) -> bool {
1102    let Some((local, domain)) = email.rsplit_once('@') else {
1103        return false;
1104    };
1105
1106    if !has_strong_binary_local_part(local) {
1107        return false;
1108    }
1109
1110    has_strong_binary_host_shape(domain)
1111}
1112
1113fn is_binary_string_url_candidate(url: &str) -> bool {
1114    let parsed = url::Url::parse(url).ok();
1115    let Some(parsed) = parsed else {
1116        return false;
1117    };
1118    let Some(host) = parsed.host_str() else {
1119        return false;
1120    };
1121
1122    has_strong_binary_host_shape(host) && has_meaningful_binary_url_context(&parsed)
1123}
1124
1125fn is_binary_string_author_candidate(author: &str) -> bool {
1126    let trimmed = author.trim();
1127    if trimmed.is_empty()
1128        || !has_sufficient_alphabetic_content(trimmed)
1129        || has_excessive_at_noise(trimmed)
1130    {
1131        return false;
1132    }
1133
1134    if trimmed.contains('@') {
1135        let emails = finder::find_emails(
1136            trimmed,
1137            &DetectionConfig {
1138                max_emails: 4,
1139                max_urls: 0,
1140                unique: true,
1141            },
1142        );
1143        if emails.len() > 1 {
1144            return false;
1145        }
1146
1147        if let Some(extracted) = extract_named_author_from_binary_line(trimmed) {
1148            return !extracted.is_empty();
1149        }
1150
1151        let Some(email) = emails.first().map(|d| d.email.as_str()) else {
1152            return false;
1153        };
1154        if !is_binary_string_email_candidate(email) {
1155            return false;
1156        }
1157
1158        let (name, _) = split_name_email(trimmed);
1159        return name.as_deref().is_some_and(has_binary_name_like_shape);
1160    }
1161
1162    has_binary_name_like_shape(trimmed)
1163}
1164
1165fn has_meaningful_binary_url_context(parsed: &url::Url) -> bool {
1166    if parsed.path() != "/"
1167        && parsed
1168            .path()
1169            .split('/')
1170            .any(|segment| segment.chars().any(|c| c.is_ascii_alphabetic()) && segment.len() >= 2)
1171    {
1172        return true;
1173    }
1174
1175    if parsed.query().is_some() || parsed.fragment().is_some() {
1176        return true;
1177    }
1178
1179    let Some(host) = parsed.host_str() else {
1180        return false;
1181    };
1182
1183    let labels: Vec<&str> = host.split('.').collect();
1184    if labels.len() > 2 {
1185        return labels[..labels.len() - 1].iter().any(|label| {
1186            label.len() >= 3 && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 3
1187        });
1188    }
1189
1190    if matches!(labels.first(), Some(&"www")) {
1191        return true;
1192    }
1193
1194    if labels.len() == 2 {
1195        let domain = labels[0];
1196        let tld = labels[1];
1197        if domain.len() >= 8 && matches!(tld, "org" | "edu" | "gov" | "mil" | "io" | "dev") {
1198            return true;
1199        }
1200    }
1201
1202    labels
1203        .iter()
1204        .take(labels.len().saturating_sub(1))
1205        .any(|label| {
1206            label.contains('-') && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 4
1207        })
1208}
1209
/// Returns `true` when `local` contains a run of at least three consecutive
/// ASCII letters.
fn has_strong_binary_local_part(local: &str) -> bool {
    let mut run = 0usize;
    for c in local.chars() {
        if c.is_ascii_alphabetic() {
            run += 1;
            if run >= 3 {
                return true;
            }
        } else {
            run = 0;
        }
    }
    false
}
1215
/// Returns `true` when `host` has a label, other than its last one, that
/// contains at least three ASCII letters.
///
/// A leading `www` or `ftp` label is ignored. Hosts with fewer than two
/// labels, before or after that stripping, are rejected.
fn has_strong_binary_host_shape(host: &str) -> bool {
    let labels: Vec<&str> = host.split('.').collect();
    let significant = match labels.as_slice() {
        ["www" | "ftp", rest @ ..] => rest,
        all => all,
    };

    // Drop the final label; an empty remainder means fewer than two labels.
    let Some((_last, candidates)) = significant.split_last() else {
        return false;
    };

    candidates.iter().any(|label| {
        label.len() >= 3 && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 3
    })
}
1236
1237fn extract_license_information(
1238    file_info_builder: &mut FileInfoBuilder,
1239    scan_errors: &mut Vec<String>,
1240    path: &Path,
1241    text_content: String,
1242    license_engine: Option<Arc<LicenseDetectionEngine>>,
1243    license_options: LicenseScanOptions,
1244    from_binary_strings: bool,
1245) -> Result<(), Error> {
1246    let Some(engine) = license_engine else {
1247        return Ok(());
1248    };
1249
1250    let detection_result = if license_options.min_score == 0 {
1251        engine.detect_with_kind_and_source(
1252            &text_content,
1253            license_options.unknown_licenses,
1254            from_binary_strings,
1255            &path.to_string_lossy(),
1256        )
1257    } else {
1258        engine.detect_with_kind_and_source_with_score(
1259            &text_content,
1260            license_options.unknown_licenses,
1261            from_binary_strings,
1262            &path.to_string_lossy(),
1263            license_options.min_score as f32,
1264        )
1265    };
1266
1267    match detection_result {
1268        Ok(detections) => {
1269            let query =
1270                Query::from_extracted_text(&text_content, engine.index(), from_binary_strings).ok();
1271            let mut model_detections = Vec::new();
1272            let mut model_clues = Vec::new();
1273
1274            for detection in &detections {
1275                let (public_detection, clue_matches) = convert_detection_to_model(
1276                    detection,
1277                    license_options,
1278                    &text_content,
1279                    query.as_ref(),
1280                );
1281
1282                if let Some(public_detection) = public_detection {
1283                    model_detections.push(public_detection);
1284                }
1285
1286                model_clues.extend(clue_matches);
1287            }
1288
1289            if !model_detections.is_empty() {
1290                let expressions: Vec<String> = model_detections
1291                    .iter()
1292                    .filter(|d| !d.license_expression_spdx.is_empty())
1293                    .map(|d| d.license_expression_spdx.clone())
1294                    .collect();
1295
1296                if !expressions.is_empty() {
1297                    let combined = crate::utils::spdx::combine_license_expressions(expressions);
1298                    if let Some(expr) = combined {
1299                        file_info_builder.license_expression(Some(expr));
1300                    }
1301                }
1302            }
1303
1304            file_info_builder.license_detections(model_detections);
1305            file_info_builder.license_clues(model_clues);
1306            file_info_builder.percentage_of_license_text(
1307                query
1308                    .as_ref()
1309                    .map(|query| compute_percentage_of_license_text(query, &detections)),
1310            );
1311        }
1312        Err(e) => {
1313            scan_errors.push(format!("License detection failed: {}", e));
1314        }
1315    }
1316
1317    Ok(())
1318}
1319
1320fn convert_detection_to_model(
1321    detection: &crate::license_detection::LicenseDetection,
1322    license_options: LicenseScanOptions,
1323    text_content: &str,
1324    query: Option<&Query<'_>>,
1325) -> (Option<LicenseDetection>, Vec<Match>) {
1326    let matches: Vec<Match> = detection
1327        .matches
1328        .iter()
1329        .map(|m| convert_match_to_model(m, license_options, text_content, query))
1330        .collect();
1331
1332    if let Some(license_expression) = detection.license_expression.clone() {
1333        (
1334            Some(LicenseDetection {
1335                license_expression,
1336                license_expression_spdx: detection
1337                    .license_expression_spdx
1338                    .clone()
1339                    .unwrap_or_default(),
1340                matches,
1341                detection_log: if license_options.include_diagnostics {
1342                    detection.detection_log.clone()
1343                } else {
1344                    Vec::new()
1345                },
1346                identifier: detection.identifier.clone(),
1347            }),
1348            Vec::new(),
1349        )
1350    } else {
1351        (None, matches)
1352    }
1353}
1354
1355fn convert_match_to_model(
1356    m: &crate::license_detection::models::LicenseMatch,
1357    license_options: LicenseScanOptions,
1358    text_content: &str,
1359    query: Option<&Query<'_>>,
1360) -> Match {
1361    let rule_url = if m.rule_url.is_empty() {
1362        None
1363    } else {
1364        Some(m.rule_url.clone())
1365    };
1366    let matched_text = if license_options.include_text {
1367        m.matched_text.clone().or_else(|| {
1368            Some(crate::license_detection::query::matched_text_from_text(
1369                text_content,
1370                m.start_line.get(),
1371                m.end_line.get(),
1372            ))
1373        })
1374    } else {
1375        None
1376    };
1377    let matched_text_diagnostics = if license_options.include_text_diagnostics {
1378        query.map(|query| matched_text_diagnostics_from_match(query, m))
1379    } else {
1380        None
1381    };
1382    Match {
1383        license_expression: m.license_expression.clone(),
1384        license_expression_spdx: m.license_expression_spdx.clone().unwrap_or_default(),
1385        from_file: m.from_file.clone(),
1386        start_line: m.start_line,
1387        end_line: m.end_line,
1388        matcher: Some(m.matcher.to_string()),
1389        score: m.score,
1390        matched_length: Some(m.matched_length),
1391        match_coverage: Some(((m.coverage() as f64) * 100.0).round() / 100.0),
1392        rule_relevance: Some(m.rule_relevance),
1393        rule_identifier: Some(m.rule_identifier.clone()),
1394        rule_url,
1395        matched_text,
1396        referenced_filenames: m.referenced_filenames.clone(),
1397        matched_text_diagnostics,
1398    }
1399}
1400
1401fn compute_percentage_of_license_text(
1402    query: &Query<'_>,
1403    detections: &[crate::license_detection::LicenseDetection],
1404) -> f64 {
1405    let matched_positions: std::collections::HashSet<usize> = detections
1406        .iter()
1407        .flat_map(|detection| detection.matches.iter())
1408        .flat_map(|m| m.query_span().iter())
1409        .collect();
1410
1411    let query_tokens_length = query.tokens.len() + query.unknowns_by_pos.values().sum::<usize>();
1412    if query_tokens_length == 0 {
1413        return 0.0;
1414    }
1415
1416    let percentage = (matched_positions.len() as f64 / query_tokens_length as f64) * 100.0;
1417    (percentage * 100.0).round() / 100.0
1418}
1419
1420fn matched_text_diagnostics_from_match(
1421    query: &Query<'_>,
1422    license_match: &InternalLicenseMatch,
1423) -> String {
1424    let matched_positions: PositionSet = license_match.query_span().iter().collect();
1425    let Some(start_pos) = matched_positions.iter().min() else {
1426        return crate::license_detection::query::matched_text_from_text(
1427            &query.text,
1428            license_match.start_line.get(),
1429            license_match.end_line.get(),
1430        );
1431    };
1432    let Some(end_pos) = matched_positions.iter().max() else {
1433        return crate::license_detection::query::matched_text_from_text(
1434            &query.text,
1435            license_match.start_line.get(),
1436            license_match.end_line.get(),
1437        );
1438    };
1439
1440    crate::license_detection::query::matched_text_diagnostics_from_text(
1441        &query.text,
1442        query,
1443        &matched_positions,
1444        start_pos,
1445        end_pos,
1446        license_match.start_line.get(),
1447        license_match.end_line.get(),
1448    )
1449}
1450
1451fn should_skip_text_detection(path: &Path, buffer: &[u8]) -> bool {
1452    is_pem_certificate_file(path, buffer)
1453}
1454
/// Reports whether `path` is a Go source file that is not production code.
///
/// Returns `Ok(false)` for anything without a `.go` extension and `Ok(true)`
/// for `*_test.go` files, both without touching the filesystem. Otherwise the
/// first ten lines are scanned for a `//go:build` or `// +build` line that
/// has `test` as a whitespace-separated token.
///
/// Only the leading lines are read, through a buffered reader. Large files
/// are not loaded whole, and invalid UTF-8 past the inspected lines does not
/// cause an error.
///
/// # Errors
///
/// Returns the I/O error when the file cannot be opened, or when one of the
/// inspected lines cannot be read or is not valid UTF-8.
fn is_go_non_production_source(path: &Path) -> std::io::Result<bool> {
    const MAX_HEADER_LINES: usize = 10;

    if path.extension().and_then(|ext| ext.to_str()) != Some("go") {
        return Ok(false);
    }

    let is_test_file = path
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| name.ends_with("_test.go"));
    if is_test_file {
        return Ok(true);
    }

    let reader = std::io::BufReader::new(File::open(path)?);
    for line in std::io::BufRead::lines(reader).take(MAX_HEADER_LINES) {
        let line = line?;
        let trimmed = line.trim();
        let is_build_constraint =
            trimmed.starts_with("//go:build") || trimmed.starts_with("// +build");
        if is_build_constraint && trimmed.split_whitespace().any(|token| token == "test") {
            return Ok(true);
        }
    }

    Ok(false)
}
1475
1476fn is_pem_certificate_file(_path: &Path, buffer: &[u8]) -> bool {
1477    let prefix_len = buffer.len().min(8192);
1478    let prefix = String::from_utf8_lossy(&buffer[..prefix_len]);
1479    let trimmed_lines: Vec<&str> = prefix
1480        .lines()
1481        .map(str::trim)
1482        .filter(|line| !line.is_empty())
1483        .take(64)
1484        .collect();
1485
1486    let Some(first_line) = trimmed_lines.first().copied() else {
1487        return false;
1488    };
1489
1490    PEM_CERTIFICATE_HEADERS
1491        .iter()
1492        .any(|(begin, end)| first_line == *begin && trimmed_lines.iter().any(|line| line == end))
1493}
1494
1495fn process_directory(
1496    path: &Path,
1497    _metadata: &fs::Metadata,
1498    collect_info: bool,
1499    license_enabled: bool,
1500) -> FileInfo {
1501    let name = path
1502        .file_name()
1503        .unwrap_or_default()
1504        .to_string_lossy()
1505        .to_string();
1506    let base_name = name.clone(); // For directories, base_name is the same as name
1507
1508    FileInfo {
1509        name,
1510        base_name,
1511        extension: "".to_string(),
1512        path: path.to_string_lossy().to_string(),
1513        file_type: FileType::Directory,
1514        mime_type: None,
1515        file_type_label: None,
1516        size: 0,
1517        date: None,
1518        sha1: None,
1519        md5: None,
1520        sha256: None,
1521        sha1_git: None,
1522        programming_language: None,
1523        package_data: Vec::new(),
1524        license_expression: None,
1525        license_detections: Vec::new(),
1526        license_clues: Vec::new(),
1527        percentage_of_license_text: license_enabled.then_some(0.0),
1528        copyrights: Vec::new(),
1529        holders: Vec::new(),
1530        authors: Vec::new(),
1531        emails: Vec::new(),
1532        urls: Vec::new(),
1533        for_packages: Vec::new(),
1534        scan_errors: Vec::new(),
1535        license_policy: None,
1536        is_binary: collect_info.then_some(false),
1537        is_text: collect_info.then_some(false),
1538        is_archive: collect_info.then_some(false),
1539        is_media: collect_info.then_some(false),
1540        is_source: collect_info.then_some(false),
1541        is_script: collect_info.then_some(false),
1542        files_count: collect_info.then_some(0),
1543        dirs_count: collect_info.then_some(0),
1544        size_count: collect_info.then_some(0),
1545        source_count: None,
1546        is_legal: false,
1547        is_manifest: false,
1548        is_readme: false,
1549        is_top_level: false,
1550        is_key_file: false,
1551        is_community: false,
1552        is_generated: None,
1553        facets: vec![],
1554        tallies: None,
1555    }
1556}
1557
1558#[cfg(test)]
1559mod tests {
1560    use super::{
1561        compute_percentage_of_license_text, convert_detection_to_model,
1562        extract_email_url_information, extract_named_author_from_binary_line,
1563        is_binary_string_author_candidate, is_binary_string_copyright_candidate,
1564        is_binary_string_email_candidate, is_binary_string_url_candidate,
1565        is_go_non_production_source, process_file,
1566    };
1567    use crate::license_detection::LicenseDetection as InternalLicenseDetection;
1568    use crate::license_detection::index::LicenseIndex;
1569    use crate::license_detection::index::dictionary::TokenDictionary;
1570    use crate::license_detection::models::position_span::PositionSpan;
1571    use crate::license_detection::models::{LicenseMatch, MatchCoordinates, MatcherKind, RuleKind};
1572    use crate::license_detection::query::Query;
1573    use crate::models::{FileInfoBuilder, FileType, MatchScore};
1574    use crate::progress::{ProgressMode, ScanProgress};
1575    use crate::scanner::scan_options_fingerprint;
1576    use crate::scanner::{LicenseScanOptions, TextDetectionOptions};
1577    use std::fs;
1578    use std::time::{Duration, Instant};
1579    use tempfile::tempdir;
1580
1581    use super::maybe_record_processing_timeout;
1582
1583    use crate::models::LineNumber;
1584
1585    fn make_internal_match(rule_url: &str) -> LicenseMatch {
1586        LicenseMatch {
1587            rid: 0,
1588            license_expression: "mit".to_string(),
1589            license_expression_spdx: Some("MIT".to_string()),
1590            from_file: None,
1591            start_line: LineNumber::ONE,
1592            end_line: LineNumber::ONE,
1593            start_token: 0,
1594            end_token: 1,
1595            matcher: MatcherKind::Hash,
1596            score: MatchScore::from_percentage(1.0),
1597            matched_length: 3,
1598            rule_length: 3,
1599            match_coverage: 100.0,
1600            rule_relevance: 100,
1601            rule_identifier: "mit.LICENSE".to_string(),
1602            rule_url: rule_url.to_string(),
1603            matched_text: Some("MIT".to_string()),
1604            referenced_filenames: None,
1605            rule_kind: RuleKind::Text,
1606            is_from_license: true,
1607            rule_start_token: 0,
1608            coordinates: MatchCoordinates::query_region(PositionSpan::empty()),
1609            candidate_resemblance: 0.0,
1610            candidate_containment: 0.0,
1611        }
1612    }
1613
1614    fn make_detection(rule_url: &str) -> InternalLicenseDetection {
1615        InternalLicenseDetection {
1616            license_expression: Some("mit".to_string()),
1617            license_expression_spdx: Some("MIT".to_string()),
1618            matches: vec![make_internal_match(rule_url)],
1619            detection_log: vec![],
1620            identifier: Some("mit-test".to_string()),
1621            file_regions: Vec::new(),
1622        }
1623    }
1624
1625    fn create_test_index(entries: &[(&str, u16)], len_legalese: usize) -> LicenseIndex {
1626        let dictionary = TokenDictionary::new_with_legalese(entries);
1627        let mut index = LicenseIndex::new(dictionary);
1628        index.len_legalese = len_legalese;
1629        index
1630    }
1631
1632    #[test]
1633    fn test_convert_detection_to_model_preserves_rule_url() {
1634        let detection = make_detection(
1635            "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE",
1636        );
1637
1638        let (converted, clues) =
1639            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
1640        let converted = converted.expect("detection should convert");
1641
1642        assert_eq!(
1643            converted.matches[0].rule_url.as_deref(),
1644            Some(
1645                "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE"
1646            )
1647        );
1648        assert!(clues.is_empty());
1649    }
1650
1651    #[test]
1652    fn test_convert_detection_to_model_emits_null_for_empty_rule_url() {
1653        let detection = make_detection("");
1654
1655        let (converted, clues) =
1656            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
1657        let converted = converted.expect("detection should convert");
1658
1659        assert_eq!(converted.matches[0].rule_url, None);
1660        assert!(clues.is_empty());
1661    }
1662
1663    #[test]
1664    fn test_convert_detection_to_model_rounds_match_coverage() {
1665        let mut detection = make_detection("");
1666        detection.matches[0].score = MatchScore::from_percentage(81.82);
1667        detection.matches[0].match_coverage = 33.334;
1668
1669        let (converted, clues) =
1670            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
1671        let converted = converted.expect("detection should convert");
1672
1673        assert_eq!(
1674            converted.matches[0].score,
1675            MatchScore::from_percentage(81.82)
1676        );
1677        assert_eq!(converted.matches[0].match_coverage, Some(33.33));
1678        assert!(clues.is_empty());
1679    }
1680
1681    #[test]
1682    fn test_convert_detection_to_model_routes_expressionless_detection_to_license_clues() {
1683        let mut detection = make_detection(
1684            "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/rules/license-clue_1.RULE",
1685        );
1686        detection.license_expression = None;
1687        detection.license_expression_spdx = None;
1688        detection.identifier = None;
1689        detection.matches[0].license_expression = "unknown-license-reference".to_string();
1690        detection.matches[0].license_expression_spdx =
1691            Some("LicenseRef-scancode-unknown-license-reference".to_string());
1692        detection.matches[0].rule_identifier = "license-clue_1.RULE".to_string();
1693        detection.matches[0].rule_kind = RuleKind::Clue;
1694
1695        let (converted, clues) = convert_detection_to_model(
1696            &detection,
1697            LicenseScanOptions {
1698                include_text: true,
1699                min_score: 0,
1700                ..LicenseScanOptions::default()
1701            },
1702            "clue text",
1703            None,
1704        );
1705
1706        assert!(converted.is_none());
1707        assert_eq!(clues.len(), 1);
1708        assert_eq!(clues[0].license_expression, "unknown-license-reference");
1709        assert_eq!(
1710            clues[0].license_expression_spdx,
1711            "LicenseRef-scancode-unknown-license-reference"
1712        );
1713        assert_eq!(
1714            clues[0].rule_identifier.as_deref(),
1715            Some("license-clue_1.RULE")
1716        );
1717        assert_eq!(clues[0].matched_text.as_deref(), Some("MIT"));
1718        assert_eq!(clues[0].matched_text_diagnostics, None);
1719    }
1720
1721    #[test]
1722    fn test_process_file_suppresses_non_actionable_pdf_extraction_failure() {
1723        let dir = tempdir().expect("tempdir");
1724        let path = dir.path().join("broken.pdf");
1725        fs::write(&path, b"%PDF-1.7\nthis is not a valid pdf object graph\n")
1726            .expect("write malformed pdf");
1727        let metadata = fs::metadata(&path).expect("metadata");
1728        let progress = ScanProgress::new(ProgressMode::Quiet);
1729
1730        let file_info = process_file(
1731            &path,
1732            &metadata,
1733            &progress,
1734            None,
1735            LicenseScanOptions::default(),
1736            &TextDetectionOptions::default(),
1737        );
1738
1739        assert!(file_info.scan_errors.is_empty());
1740    }
1741
1742    #[test]
1743    fn test_processing_timeout_is_not_duplicated_after_stage_specific_timeout() {
1744        let started = Instant::now() - Duration::from_secs(2);
1745        let mut scan_errors = vec!["Timeout before license scan (> 1.00s)".to_string()];
1746
1747        maybe_record_processing_timeout(&mut scan_errors, started, 1.0);
1748
1749        assert_eq!(scan_errors, vec!["Timeout before license scan (> 1.00s)"]);
1750    }
1751
1752    #[test]
1753    fn test_processing_timeout_is_recorded_when_no_timeout_error_exists() {
1754        let started = Instant::now() - Duration::from_secs(2);
1755        let mut scan_errors = Vec::new();
1756
1757        maybe_record_processing_timeout(&mut scan_errors, started, 1.0);
1758
1759        assert_eq!(
1760            scan_errors,
1761            vec!["Processing interrupted due to timeout after 1.00 seconds"]
1762        );
1763    }
1764
1765    #[test]
1766    fn test_convert_detection_to_model_includes_diagnostics_when_enabled() {
1767        let text = concat!(
1768            "Reproduction and distribution of this file, with or without modification, are\n",
1769            "permitted in any medium without royalties provided the copyright notice\n",
1770            "and this notice are preserved. This file is offered as-is, without any warranties.\n",
1771        );
1772        let index = create_test_index(
1773            &[
1774                ("reproduction", 0),
1775                ("distribution", 1),
1776                ("file", 2),
1777                ("without", 3),
1778                ("modification", 4),
1779                ("permitted", 5),
1780                ("medium", 6),
1781                ("royalties", 7),
1782                ("provided", 8),
1783                ("copyright", 9),
1784                ("notice", 10),
1785                ("preserved", 11),
1786                ("offered", 12),
1787                ("warranties", 13),
1788            ],
1789            14,
1790        );
1791        let query = Query::from_extracted_text(text, &index, false).expect("query should build");
1792        let mut detection = make_detection(
1793            "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/fsf-ap.LICENSE",
1794        );
1795        detection.detection_log = vec!["imperfect-match-coverage".to_string()];
1796        detection.matches[0].license_expression = "fsf-ap".to_string();
1797        detection.matches[0].license_expression_spdx = Some("FSFAP".to_string());
1798        detection.matches[0].rule_identifier = "fsf-ap.LICENSE".to_string();
1799        detection.matches[0].matched_text = None;
1800        detection.matches[0].start_line = LineNumber::ONE;
1801        detection.matches[0].end_line = LineNumber::new(3).unwrap();
1802        detection.matches[0].start_token = 0;
1803        detection.matches[0].end_token = query.tokens.len();
1804        detection.matches[0].coordinates =
1805            MatchCoordinates::query_region(PositionSpan::from_positions(
1806                query
1807                    .tokens
1808                    .iter()
1809                    .enumerate()
1810                    .filter_map(|(idx, _)| (idx != 9).then_some(idx))
1811                    .collect::<Vec<_>>(),
1812            ));
1813        detection.identifier = Some("fsf_ap-test".to_string());
1814
1815        let (converted, clues) = convert_detection_to_model(
1816            &detection,
1817            LicenseScanOptions {
1818                include_text: true,
1819                include_text_diagnostics: true,
1820                include_diagnostics: true,
1821                unknown_licenses: false,
1822                min_score: 0,
1823            },
1824            text,
1825            Some(&query),
1826        );
1827        let converted = converted.expect("detection should convert");
1828
1829        assert!(clues.is_empty());
1830        assert_eq!(converted.detection_log, vec!["imperfect-match-coverage"]);
1831        assert_eq!(
1832            converted.matches[0].matched_text.as_deref(),
1833            Some(text.trim_end())
1834        );
1835        let diagnostics = converted.matches[0]
1836            .matched_text_diagnostics
1837            .as_deref()
1838            .expect("diagnostics should be present");
1839        assert!(diagnostics.contains('['));
1840        assert!(diagnostics.contains(']'));
1841        assert_ne!(diagnostics, text.trim_end());
1842    }
1843
1844    #[test]
1845    fn test_extract_email_url_information_skips_binary_string_text() {
1846        let mut builder = FileInfoBuilder::default();
1847        let options = TextDetectionOptions {
1848            collect_info: false,
1849            detect_packages: false,
1850            detect_application_packages: false,
1851            detect_system_packages: false,
1852            detect_packages_in_compiled: false,
1853            detect_copyrights: false,
1854            detect_generated: false,
1855            detect_emails: true,
1856            detect_urls: true,
1857            max_emails: 50,
1858            max_urls: 50,
1859            timeout_seconds: 120.0,
1860        };
1861
1862        extract_email_url_information(
1863            &mut builder,
1864            "contact 6h@fo.lwft and visit http://gmail.com/",
1865            &options,
1866            true,
1867        );
1868
1869        let file = builder
1870            .name("binary.bin".to_string())
1871            .base_name("binary".to_string())
1872            .extension(".bin".to_string())
1873            .path("binary.bin".to_string())
1874            .file_type(FileType::File)
1875            .size(1)
1876            .build()
1877            .expect("builder should produce file info");
1878
1879        assert!(file.emails.is_empty(), "emails: {:?}", file.emails);
1880        assert!(file.urls.is_empty(), "urls: {:?}", file.urls);
1881    }
1882
1883    #[test]
1884    fn test_extract_email_url_information_keeps_good_binary_contacts() {
1885        let mut builder = FileInfoBuilder::default();
1886        let options = TextDetectionOptions {
1887            collect_info: false,
1888            detect_packages: false,
1889            detect_application_packages: false,
1890            detect_system_packages: false,
1891            detect_packages_in_compiled: false,
1892            detect_copyrights: false,
1893            detect_generated: false,
1894            detect_emails: true,
1895            detect_urls: true,
1896            max_emails: 50,
1897            max_urls: 50,
1898            timeout_seconds: 120.0,
1899        };
1900
1901        extract_email_url_information(
1902            &mut builder,
1903            "report bugs to bug-coreutils@gnu.org and see https://www.gnu.org/software/coreutils/",
1904            &options,
1905            true,
1906        );
1907
1908        let file = builder
1909            .name("binary.bin".to_string())
1910            .base_name("binary".to_string())
1911            .extension(".bin".to_string())
1912            .path("binary.bin".to_string())
1913            .file_type(FileType::File)
1914            .size(1)
1915            .build()
1916            .expect("builder should produce file info");
1917
1918        assert_eq!(file.emails.len(), 1, "emails: {:?}", file.emails);
1919        assert_eq!(file.emails[0].email, "bug-coreutils@gnu.org");
1920        assert_eq!(file.urls.len(), 1, "urls: {:?}", file.urls);
1921        assert_eq!(file.urls[0].url, "https://www.gnu.org/software/coreutils/");
1922    }
1923
1924    #[test]
1925    fn test_extract_email_url_information_deduplicates_binary_emails_before_cap() {
1926        let mut builder = FileInfoBuilder::default();
1927        let options = TextDetectionOptions {
1928            collect_info: false,
1929            detect_packages: false,
1930            detect_application_packages: false,
1931            detect_system_packages: false,
1932            detect_packages_in_compiled: false,
1933            detect_copyrights: false,
1934            detect_generated: false,
1935            detect_emails: true,
1936            detect_urls: false,
1937            max_emails: 2,
1938            max_urls: 50,
1939            timeout_seconds: 120.0,
1940        };
1941
1942        extract_email_url_information(
1943            &mut builder,
1944            "first jakub@redhat.com second jakub@redhat.com third contyk@redhat.com",
1945            &options,
1946            true,
1947        );
1948
1949        let file = builder
1950            .name("binary.bin".to_string())
1951            .base_name("binary".to_string())
1952            .extension(".bin".to_string())
1953            .path("binary.bin".to_string())
1954            .file_type(FileType::File)
1955            .size(1)
1956            .build()
1957            .expect("builder should produce file info");
1958
1959        assert_eq!(file.emails.len(), 2, "emails: {:?}", file.emails);
1960        assert_eq!(file.emails[0].email, "jakub@redhat.com");
1961        assert_eq!(file.emails[1].email, "contyk@redhat.com");
1962    }
1963
1964    #[test]
1965    fn test_binary_string_copyright_candidate_rejects_gibberish_holder_text() {
1966        let gibberish = "(c) S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9) M0@9s J'@y DH@9Ih@y";
1967        assert!(!is_binary_string_copyright_candidate(gibberish));
1968    }
1969
1970    #[test]
1971    fn test_binary_string_copyright_candidate_keeps_real_notice() {
1972        let notice = "Copyright nexB and others (c) 2012";
1973        assert!(is_binary_string_copyright_candidate(notice));
1974    }
1975
1976    #[test]
1977    fn test_binary_string_copyright_candidate_rejects_changelog_phrase() {
1978        assert!(!is_binary_string_copyright_candidate(
1979            "Copyright - split out libs"
1980        ));
1981    }
1982
1983    #[test]
1984    fn test_binary_string_email_candidate_rejects_gibberish() {
1985        assert!(!is_binary_string_email_candidate("6h@fo.lwft"));
1986    }
1987
1988    #[test]
1989    fn test_binary_string_email_candidate_keeps_gnu_bug_address() {
1990        assert!(is_binary_string_email_candidate("bug-coreutils@gnu.org"));
1991    }
1992
1993    #[test]
1994    fn test_binary_string_url_candidate_rejects_short_fake_host() {
1995        assert!(!is_binary_string_url_candidate("http://ftp.so/"));
1996    }
1997
1998    #[test]
1999    fn test_binary_string_url_candidate_keeps_gnu_help_url() {
2000        assert!(is_binary_string_url_candidate(
2001            "https://www.gnu.org/software/coreutils/"
2002        ));
2003    }
2004
2005    #[test]
2006    fn test_binary_string_url_candidate_rejects_bare_root_domain() {
2007        assert!(!is_binary_string_url_candidate("http://gmail.com/"));
2008    }
2009
2010    #[test]
2011    fn test_binary_string_url_candidate_keeps_project_subdomain_root() {
2012        assert!(is_binary_string_url_candidate("http://gcc.gnu.org"));
2013    }
2014
2015    #[test]
2016    fn test_binary_string_url_candidate_keeps_long_org_root_domain() {
2017        assert!(is_binary_string_url_candidate("https://publicsuffix.org/"));
2018    }
2019
2020    #[test]
2021    fn test_binary_string_url_candidate_keeps_short_project_path() {
2022        assert!(is_binary_string_url_candidate("http://tukaani.org/xz/"));
2023    }
2024
2025    #[test]
2026    fn test_binary_string_author_candidate_keeps_named_author_with_email() {
2027        assert!(is_binary_string_author_candidate(
2028            "Andreas Schneider <asn@redhat.com>"
2029        ));
2030    }
2031
2032    #[test]
2033    fn test_binary_string_author_candidate_rejects_gibberish() {
2034        assert!(!is_binary_string_author_candidate(
2035            "S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9"
2036        ));
2037    }
2038
2039    #[test]
2040    fn test_binary_string_author_candidate_rejects_changelog_phrase() {
2041        assert!(!is_binary_string_author_candidate(
2042            "Developers can enable them. - revert news user back to"
2043        ));
2044    }
2045
2046    #[test]
2047    fn test_extract_named_author_from_binary_line_recovers_by_prefix() {
2048        assert_eq!(
2049            extract_named_author_from_binary_line("Patch by Andreas Schneider <asn@redhat.com>"),
2050            Some("Andreas Schneider <asn@redhat.com>".to_string())
2051        );
2052    }
2053
2054    #[test]
2055    fn test_extract_named_author_from_binary_line_recovers_parenthesized_email() {
2056        assert_eq!(
2057            extract_named_author_from_binary_line(
2058                "same for both OpenSSL and NSS by Rob Crittenden (rcritten@redhat.com)"
2059            ),
2060            Some("Rob Crittenden (rcritten@redhat.com)".to_string())
2061        );
2062    }
2063
2064    #[test]
2065    fn test_extract_named_author_from_binary_line_rejects_plain_changelog_packager_line() {
2066        assert_eq!(
2067            extract_named_author_from_binary_line(
2068                "Rob Crittenden <rcritten@redhat.com> - 3.11.7-9"
2069            ),
2070            None
2071        );
2072    }
2073
2074    #[test]
2075    fn test_extract_named_author_from_binary_line_keeps_email_only_review_author() {
2076        assert_eq!(
2077            extract_named_author_from_binary_line(
2078                "Changes as per initial review by panemade@gmail.com"
2079            ),
2080            Some("panemade@gmail.com".to_string())
2081        );
2082    }
2083
2084    #[test]
2085    fn test_binary_string_author_candidate_rejects_multiple_emails_on_one_line() {
2086        assert!(!is_binary_string_author_candidate(
2087            "Rob Crittenden (rcritten@redhat.com) jakub@redhat.com"
2088        ));
2089    }
2090
2091    #[test]
2092    fn test_compute_percentage_of_license_text_counts_unknown_tokens() {
2093        let index = create_test_index(&[("alpha", 0), ("mit", 1)], 2);
2094        let text = "alpha MIT omega";
2095        let query = Query::from_extracted_text(text, &index, false).expect("query should build");
2096        let mut detection = make_detection("");
2097        detection.matches[0].coordinates =
2098            MatchCoordinates::query_region(PositionSpan::from_positions(vec![1]));
2099        detection.matches[0].start_token = 1;
2100        detection.matches[0].end_token = 2;
2101
2102        let percentage = compute_percentage_of_license_text(&query, &[detection]);
2103
2104        assert_eq!(percentage, 33.33);
2105    }
2106
2107    #[test]
2108    fn test_scan_options_fingerprint_changes_with_license_score() {
2109        let text_options = crate::scanner::TextDetectionOptions::default();
2110        let default_fingerprint = scan_options_fingerprint(
2111            &text_options,
2112            LicenseScanOptions {
2113                min_score: 0,
2114                ..LicenseScanOptions::default()
2115            },
2116            None,
2117        );
2118        let filtered_fingerprint = scan_options_fingerprint(
2119            &text_options,
2120            LicenseScanOptions {
2121                min_score: 70,
2122                ..LicenseScanOptions::default()
2123            },
2124            None,
2125        );
2126
2127        assert_ne!(default_fingerprint, filtered_fingerprint);
2128    }
2129
2130    #[test]
2131    fn test_is_go_non_production_source_for_test_filename() {
2132        let temp_dir = tempdir().unwrap();
2133        let path = temp_dir.path().join("scanner_test.go");
2134        fs::write(&path, "package scanner\n").unwrap();
2135
2136        assert!(is_go_non_production_source(&path).unwrap());
2137    }
2138
2139    #[test]
2140    fn test_is_go_non_production_source_for_build_tag() {
2141        let temp_dir = tempdir().unwrap();
2142        let path = temp_dir.path().join("scanner.go");
2143        fs::write(&path, "//go:build test\n\npackage scanner\n").unwrap();
2144
2145        assert!(is_go_non_production_source(&path).unwrap());
2146    }
2147
2148    #[test]
2149    fn test_is_go_non_production_source_for_regular_go_file() {
2150        let temp_dir = tempdir().unwrap();
2151        let path = temp_dir.path().join("scanner.go");
2152        fs::write(&path, "package scanner\n").unwrap();
2153
2154        assert!(!is_go_non_production_source(&path).unwrap());
2155    }
2156}